In [5]:
!pip install anthropic
!pip install faiss-cpu
!pip install tiktoken
!pip install transformers==4.31.0
!pip install -U sentence-transformers

Collecting anthropic
  Downloading anthropic-0.51.0-py3-none-any.whl.metadata (25 kB)
Downloading anthropic-0.51.0-py3-none-any.whl (263 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.0/264.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.51.0
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0

In [6]:
import pandas as pd #library for dataframes
import tiktoken #library to estimate tokens used for each character

import faiss
# a library that allows developers to quickly search for embeddings of
# multimedia documents that are similar to each other. Can use any kind of Vector DB
# only used for demo purposes

from anthropic import Anthropic
import numpy as np


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:
#never share your keys publicly. always load them using .env files. Directly assignment here is for demo purposes ONLY
ANTHROPIC_API_KEY = ""
client_ant = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

In [10]:
import pandas as pd
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Layoffs.csv")
df.head()


Unnamed: 0,Company,Location HQ,# Laid Off,Date,%,Industry,Source,Stage,$ Raised (mm),Country,Date Added
0,Match Group,New York City,325.0,2025-05-08,13%,Consumer,https://www.bloomberg.com/news/articles/2025-0...,Post-IPO,,United States,2025-05-09
1,CrowdStrike,SF Bay Area,500.0,2025-05-07,5%,Security,https://www.wsj.com/business/crowdstrike-to-cu...,Post-IPO,$1200,United States,2025-05-07
2,GenWise,"New Delhi,Non-U.S.",15.0,2025-05-05,20%,Other,https://entrackr.com/exclusive/exclusive-z47-b...,Seed,$3,India,2025-05-05
3,General Fusion,"Vancouver,Non-U.S.",,2025-05-05,25%,Energy,https://techcrunch.com/2025/05/05/layoffs-hit-...,Unknown,$370,Canada,2025-05-05
4,Deep Instinct,"Tel Aviv,Non-U.S.",20.0,2025-05-04,10%,Security,https://www.calcalistech.com/ctechnews/article...,Unknown,$322,Israel,2025-05-05


In [12]:
import pandas as pd
import tiktoken

# Load layoffs dataset
path = "/content/drive/My Drive/Colab Notebooks/Layoffs.csv"
layoffs_df = pd.read_csv(path)

# Encoding configuration
embedding_encoding = "cl100k_base"
max_tokens = 5000
encoding = tiktoken.get_encoding(embedding_encoding)

# Keep only rows with needed fields
layoffs_df = layoffs_df[["Date", "Company", "# Laid Off", "Industry", "Location HQ", "Stage", "Country"]]
layoffs_df = layoffs_df.dropna()

# Optional: Keep top N most recent events
top_n = 50
layoffs_df["Date"] = pd.to_datetime(layoffs_df["Date"], errors='coerce')
layoffs_df = layoffs_df.sort_values("Date", ascending=False).head(top_n)

# Create a combined semantic field for embedding
layoffs_df["combined"] = (
    "Date: " + layoffs_df["Date"].astype(str).str.strip() +
    "; Company: " + layoffs_df["Company"].str.strip() +
    "; Industry: " + layoffs_df["Industry"].str.strip() +
    "; Location: " + layoffs_df["Location HQ"].str.strip() +
    "; Country: " + layoffs_df["Country"].str.strip() +
    "; Stage: " + layoffs_df["Stage"].str.strip() +
    "; Laid Off: " + layoffs_df["# Laid Off"].astype(str).str.strip()
)

# Compute token count for each entry
layoffs_df["n_tokens"] = layoffs_df["combined"].apply(lambda x: len(encoding.encode(x)))

# Filter out overlong entries
layoffs_df = layoffs_df[layoffs_df["n_tokens"] <= max_tokens]

# Preview
layoffs_df[["combined", "n_tokens"]].head()


Unnamed: 0,combined,n_tokens
0,Date: 2025-05-08; Company: Match Group; Indust...,44
1,Date: 2025-05-07; Company: CrowdStrike; Indust...,44
2,Date: 2025-05-05; Company: GenWise; Industry: ...,45
4,Date: 2025-05-04; Company: Deep Instinct; Indu...,45
6,Date: 2025-05-01; Company: Beam; Industry: Oth...,43


In [13]:
print(layoffs_df.shape)
layoffs_df.head()

(50, 9)


Unnamed: 0,Date,Company,# Laid Off,Industry,Location HQ,Stage,Country,combined,n_tokens
0,2025-05-08,Match Group,325.0,Consumer,New York City,Post-IPO,United States,Date: 2025-05-08; Company: Match Group; Indust...,44
1,2025-05-07,CrowdStrike,500.0,Security,SF Bay Area,Post-IPO,United States,Date: 2025-05-07; Company: CrowdStrike; Indust...,44
2,2025-05-05,GenWise,15.0,Other,"New Delhi,Non-U.S.",Seed,India,Date: 2025-05-05; Company: GenWise; Industry: ...,45
4,2025-05-04,Deep Instinct,20.0,Security,"Tel Aviv,Non-U.S.",Unknown,Israel,Date: 2025-05-04; Company: Deep Instinct; Indu...,45
6,2025-05-01,Beam,200.0,Other,"Bristol,Non-U.S.",Unknown,United Kingdom,Date: 2025-05-01; Company: Beam; Industry: Oth...,43


In [14]:
# Initialize the SentenceTransformer model - embedding model
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def get_embedding_st(text):
    return sentence_model.encode(text)

layoffs_df['ada_embedding'] = layoffs_df.combined.apply(lambda x: get_embedding_st(x))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
layoffs_df.head()

Unnamed: 0,Date,Company,# Laid Off,Industry,Location HQ,Stage,Country,combined,n_tokens,ada_embedding
0,2025-05-08,Match Group,325.0,Consumer,New York City,Post-IPO,United States,Date: 2025-05-08; Company: Match Group; Indust...,44,"[-0.065450765, -0.03603269, 0.069790885, 0.033..."
1,2025-05-07,CrowdStrike,500.0,Security,SF Bay Area,Post-IPO,United States,Date: 2025-05-07; Company: CrowdStrike; Indust...,44,"[-0.031395357, -0.02367656, 0.07198583, 0.0276..."
2,2025-05-05,GenWise,15.0,Other,"New Delhi,Non-U.S.",Seed,India,Date: 2025-05-05; Company: GenWise; Industry: ...,45,"[-0.044260584, -0.025039958, 0.025649108, 0.00..."
4,2025-05-04,Deep Instinct,20.0,Security,"Tel Aviv,Non-U.S.",Unknown,Israel,Date: 2025-05-04; Company: Deep Instinct; Indu...,45,"[-0.05179641, -0.04188862, 0.057862904, -0.005..."
6,2025-05-01,Beam,200.0,Other,"Bristol,Non-U.S.",Unknown,United Kingdom,Date: 2025-05-01; Company: Beam; Industry: Oth...,43,"[-0.042615067, -0.04323138, 0.06596174, 0.0460..."


In [19]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model (if not done yet)
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create `combined` column if missing
if 'combined' not in layoffs_df.columns:
    layoffs_df["combined"] = (
        "Date: " + layoffs_df["Date"].astype(str).str.strip() +
        "; Company: " + layoffs_df["Company"].str.strip() +
        "; Industry: " + layoffs_df["Industry"].str.strip() +
        "; Location: " + layoffs_df["Location HQ"].str.strip() +
        "; Country: " + layoffs_df["Country"].str.strip() +
        "; Stage: " + layoffs_df["Stage"].str.strip() +
        "; Laid Off: " + layoffs_df["# Laid Off"].astype(str).str.strip()
    )

# Generate embeddings column
layoffs_df["embedding"] = layoffs_df["combined"].apply(lambda x: sentence_model.encode(x))

In [20]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Step 0: Ensure embeddings exist as NumPy arrays
layoffs_df["embedding"] = layoffs_df["embedding"].apply(lambda x: np.array(x, dtype=np.float32))

# Step 1: Initialize FAISS index for cosine similarity
embedding_dim = len(layoffs_df["embedding"].iloc[0])
index = faiss.IndexFlatIP(embedding_dim)

# Step 2: Normalize and add all embeddings to the index
embedding_matrix = np.vstack(layoffs_df["embedding"].values).astype("float32")
faiss.normalize_L2(embedding_matrix)
index.add(embedding_matrix)

# Step 3: Define search function
def search_layoffs(query_text, k=5):
    # Encode and normalize query
    query_vec = sentence_model.encode(query_text)
    query_vec = np.array(query_vec).reshape(1, -1).astype("float32")
    faiss.normalize_L2(query_vec)

    # Search the FAISS index
    distances, indices = index.search(query_vec, k)
    results = layoffs_df.iloc[indices[0]][["combined", "Company", "Date", "Industry", "Location HQ"]]
    return results

# 🔍 Example query
query = "Which companies recently laid off workers in the artificial intelligence sector?"
results = search_layoffs(query)

# Display results
print("Top relevant results:\n")
for i, row in results.iterrows():
    print(f"- {row['Date']} | {row['Company']} ({row['Industry']}) @ {row['Location HQ']}")
    print(f"  → {row['combined']}\n")

Top relevant results:

- 2025-05-04 00:00:00 | Deep Instinct (Security) @ Tel Aviv,Non-U.S.
  → Date: 2025-05-04; Company: Deep Instinct; Industry: Security; Location: Tel Aviv,Non-U.S.; Country: Israel; Stage: Unknown; Laid Off: 20.0

- 2025-02-17 00:00:00 | Logically (AI) @ Manchester,Non-U.S.
  → Date: 2025-02-17; Company: Logically; Industry: AI; Location: Manchester,Non-U.S.; Country: United Kingdom; Stage: Series A; Laid Off: 40.0

- 2025-04-23 00:00:00 | Intel (Hardware) @ Sacramento
  → Date: 2025-04-23; Company: Intel; Industry: Hardware; Location: Sacramento; Country: United States; Stage: Post-IPO; Laid Off: 22000.0

- 2025-03-10 00:00:00 | D-ID (AI) @ Tel Aviv,Non-U.S.
  → Date: 2025-03-10; Company: D-ID; Industry: AI; Location: Tel Aviv,Non-U.S.; Country: Israel; Stage: Series B; Laid Off: 22.0

- 2025-04-02 00:00:00 | Automattic (Other) @ SF Bay Area
  → Date: 2025-04-02; Company: Automattic; Industry: Other; Location: SF Bay Area; Country: United States; Stage: Series E;

In [26]:
# Claude-based RAG generation for layoffs
import os  # <-- add this

def generate_response_with_rag(query, k=5, model = "claude-3-sonnet-20240620"):
    # Step 1: Retrieve relevant layoff records
    relevant_docs = search_layoffs(query, k=k)
    context = "\n---\n".join(relevant_docs["combined"].tolist())

    print('\nRelevant Documents Retrieved from RAG Search:\n')
    print(context)

    # Step 2: Format the Claude system prompt
    system_prompt = f"""
You are a helpful assistant that answers user queries based on historical tech layoff events.

Use the following context (layoff reports) to answer the question.
If the context does not contain enough information, say "I don't know."

Context:
{context}
"""

    # Step 3: Call Claude Chat Completion API
    import anthropic
    client_ant = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    response = client_ant.messages.create(
        model=model,
        system=system_prompt,
        messages=[
            {"role": "user", "content": query},
        ],
        temperature=0.3,
        max_tokens=300
    )

    return response.content

In [44]:
def generate_response_with_rag(query, k=5, model="claude-3-sonnet-20240620"):
    relevant_docs = search_layoffs(query, k=k)
    context_docs = relevant_docs["combined"].tolist()[:3]  # 🔧 limit to 3 docs
    context = "\n---\n".join(context_docs)

    print("\n📄 Relevant Documents Retrieved from RAG Search:\n")
    print(context)
    print("🔢 Approx context word count:", len(context.split()))

    system_prompt = f"""
You are a helpful assistant that answers user queries based on historical tech layoff events.

Use the following context (layoff reports) to answer the question.
If the context does not contain enough information, say "I don't know."

Context:
{context}
"""

    response = client_ant.messages.create(
        model=model,
        system=system_prompt.strip(),
        messages=[{"role": "user", "content": query}],
        temperature=0.3,
        max_tokens=300
    )

    try:
        return response.content[0].text
    except Exception as e:
        print("⚠️ Error extracting Claude response:", e)
        return str(response)


In [45]:
!pip install python-dotenv
from dotenv import load_dotenv
import os

load_dotenv()
import anthropic
from dotenv import load_dotenv
import os

load_dotenv()
client_ant = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))




In [46]:
user_query = "Which companies laid off AI workers in 2025?"
answer = generate_response_with_rag(user_query)

print("\nQ:", user_query)
print("A:", answer)



📄 Relevant Documents Retrieved from RAG Search:

Date: 2025-02-17; Company: Logically; Industry: AI; Location: Manchester,Non-U.S.; Country: United Kingdom; Stage: Series A; Laid Off: 40.0
---
Date: 2025-03-10; Company: D-ID; Industry: AI; Location: Tel Aviv,Non-U.S.; Country: Israel; Stage: Series B; Laid Off: 22.0
---
Date: 2025-04-23; Company: Intel; Industry: Hardware; Location: Sacramento; Country: United States; Stage: Post-IPO; Laid Off: 22000.0
🔢 Approx context word count: 52


TypeError: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted"