In [1]:
# Cell 1: Install dependencies (run once)
%pip install numpy pandas transformers scikit-learn faiss-cpu gensim datasets networkx matplotlib seaborn tqdm ipython sentencepiece torch torchvision torchaudio ipywidgets accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Cell 2: Import libraries
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import networkx as nx
from gensim.models import Word2Vec
from datasets import load_dataset, Dataset
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import ipywidgets
import torch

%matplotlib inline

In [3]:
# Cell 3: Load and preprocess data
print("Loading synthetic dataset...")
dataset = load_dataset("ilsilfverskiold/linkedin_profiles_synthetic")
df = pd.DataFrame(dataset['train'])
df['user_type'] = np.random.choice(['student', 'alumni', 'officer'], len(df))

# Rename columns to match metadata and AppConnX conventions
df = df.rename(columns={"About Me": "About", "Experience": "ExperienceText", "Education": "EducationText", "Skills": "SkillsText"})

# Combine profile fields into a single text field for summarization
def combine_profile(row):
    fields = [
        f"Headline: {row['Headline']}",
        f"About: {row['About']}",
        f"Skills: {row['SkillsText']}",
        f"Experience: {row['ExperienceText']}",
        f"Education: {row['EducationText']}",
        f"Certifications: {row['Certifications']}",
        f"Recommendations: {row['Recommendations']}",
        f"Location: {row['Location']}"
    ]
    return " | ".join([f for f in fields if pd.notna(f) and f.split(': ')[1]])

df['profile_text'] = df.apply(combine_profile, axis=1)
display(df[['Headline', 'About', 'SkillsText', 'ExperienceText', 'EducationText', 'profile_text']].head())

Loading synthetic dataset...


Unnamed: 0,Headline,About,SkillsText,ExperienceText,EducationText,profile_text
0,Augmented Reality Developer | Creating Immersi...,Dedicated and innovative AR developer with a p...,Unity; ARKit; ARCore; C#; Java; JavaScript; We...,"Senior AR Developer | Pixelloid | Zurich, Swit...",ETH Zurich | Bachelor of Science in Computer S...,Headline: Augmented Reality Developer | Creati...
1,UI Designer | Crafting Human-Centered Digital ...,As a UI Designer with a passion for human-cent...,User Experience Design; Visual Design; Human-C...,"Senior UI Designer at EVONIK | EVONIK | Essen,...",Hochschule für Gestaltung und Kunst | Bachelor...,Headline: UI Designer | Crafting Human-Centere...
2,Co-Founder AI Product | Transforming Industrie...,Results-driven technology entrepreneur with a ...,AI Strategy; Machine Learning; Deep Learning; ...,Mindera | Co-Founder and Head of AI Product | ...,EPFL | Master of Science in Computer Science |...,Headline: Co-Founder AI Product | Transforming...
3,NLP Specialist | Helping businesses unlock ins...,I'm a seasoned NLP professional with a passion...,NLP; Deep Learning; Natural Language Generatio...,"Senior NLP Engineer, IBM Research | IBM | Zuri...",École Polytechnique Fédérale de Lausanne (EPFL...,Headline: NLP Specialist | Helping businesses ...
4,Advertising Manager | Driving Brand Growth Thr...,Results-driven Advertising Manager with 8+ yea...,Advertising Strategy; Data Analysis; Team Mana...,Advertising Manager | Bonnard Advertising Agen...,Bachelor's Degree in Marketing | Marketing | U...,Headline: Advertising Manager | Driving Brand ...


In [4]:
# Cell 4: T5 setup for NLP
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare fine-tuning dataset
def prepare_finetune_data(texts):
    input_texts = [f"summarize: {text}" for text in texts]
    # Use a concise version of profile as target (e.g., Headline + Skills snippet)
    target_texts = [f"{text.split(' | ')[0].replace('Headline: ', '')} - Skills: {text.split(' | ')[2].replace('Skills: ', '').split(', ')[0]}" 
                    for text in texts if len(text.split(' | ')) > 2]
    return Dataset.from_dict({"input_text": input_texts[:len(target_texts)], "target_text": target_texts})

finetune_dataset = prepare_finetune_data(df['profile_text'].tolist())

# Tokenize dataset
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=100, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = finetune_dataset.map(tokenize_function, batched=True)

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./t5_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=3e-4,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
print("Fine-tuning T5...")
trainer.train()
model.save_pretrained("./t5_finetuned")
tokenizer.save_pretrained("./t5_finetuned")

# Load fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("./t5_finetuned")
tokenizer = T5Tokenizer.from_pretrained("./t5_finetuned")

def summarize_profile(text):
    input_text = f"summarize: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=100, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summarizing profiles with fine-tuned T5...")
tqdm.pandas()
df['summary'] = df['profile_text'].progress_apply(summarize_profile)
display(df[['profile_text', 'summary']].head())

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/6904 [00:00<?, ? examples/s]

Fine-tuning T5...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,0.3059
200,0.0126
300,0.0066
400,0.0051
500,0.0041
600,0.0035
700,0.0028
800,0.0021
900,0.0023
1000,0.0022


Summarizing profiles with fine-tuned T5...


  0%|          | 0/6904 [00:00<?, ?it/s]

Unnamed: 0,profile_text,summary
0,Headline: Augmented Reality Developer | Creati...,Augmented Reality Developer - Skills: About: D...
1,Headline: UI Designer | Crafting Human-Centere...,UI Designer - Skills: About: As a UI Designer ...
2,Headline: Co-Founder AI Product | Transforming...,Co-Founder AI Product - Skills: About: Results...
3,Headline: NLP Specialist | Helping businesses ...,NLP Specialist - Skills: About: I'm a seasoned...
4,Headline: Advertising Manager | Driving Brand ...,Advertising Manager - Skills: About: Results-d...


In [1]:
# Cell 5: Generate embeddings
import numpy as np
from sklearn.preprocessing import normalize

# List of precomputed embedding columns from metadata
embedding_columns = [
    'embeddings_nv-embed-v1', 'embeddings_nv-embedqa-e5-v5', 'embeddings_bge-m3',
    'embeddings_arctic-embed-l', 'embeddings_mistral-7b-v2', 'embeddings_gte-large-en-v1.5',
    'embeddings_text-embedding-ada-002', 'embeddings_text-embedding-3-small',
    'embeddings_voyage-3', 'embeddings_mxbai-embed-large-v1 '
]

# Normalize precomputed embeddings with NaN handling
embedding_dict = {}
for col in embedding_columns:
    # Extract embeddings as a list
    embeddings_list = df[col].values
    
    # Convert to numpy array, replacing NaN rows with zeros
    embeddings = np.stack([np.nan_to_num(embed, nan=0.0) for embed in embeddings_list])
    
    # Check for rows that were originally all NaN (now all zeros after replacement)
    valid_mask = ~np.all(embeddings == 0, axis=1)  # True where row isn’t all zeros
    if not np.any(valid_mask):
        print(f"Warning: All embeddings in {col} are invalid (all NaN or zero). Skipping.")
        continue
    
    # Filter to valid embeddings only (optional: could keep zeros if desired)
    valid_embeddings = embeddings[valid_mask]
    print(f"{col}: {len(valid_embeddings)} valid embeddings out of {len(embeddings)}")
    
    # Normalize valid embeddings
    embedding_dict[col] = normalize(valid_embeddings)
    print(f"{col} shape: {embedding_dict[col].shape}")

# Example: Use one embedding model for downstream tasks (e.g., bge-m3)
selected_embedding = 'embeddings_bge-m3'
all_embeddings = embedding_dict.get(selected_embedding, None)
if all_embeddings is None:
    raise ValueError(f"Selected embedding {selected_embedding} failed to process.")
print(f"Using {selected_embedding} with shape: {all_embeddings.shape}")

NameError: name 'df' is not defined

In [None]:
# Cell 6: FAISS index for similarity search
faiss_indices = {}
for col in embedding_columns:
    d = embedding_dict[col].shape[1]
    index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 50)
    index.train(embedding_dict[col])
    index.add(embedding_dict[col])
    index.nprobe = 5
    faiss_indices[col] = index
    print(f"FAISS IVF index for {col} created with {index.ntotal} vectors.")

In [None]:
# Cell 7: Graph-based matching
# Use one embedding model (e.g., 'embeddings_bge-m3') for graph
selected_embedding = 'embeddings_bge-m3'
all_embeddings = embedding_dict[selected_embedding]

G = nx.Graph()
for i, row in df.iterrows():
    G.add_node(i, user_type=row['user_type'], embedding=all_embeddings[i])

print("Building graph edges...")
for i in tqdm(range(len(df))):
    for j in range(i + 1, min(i + 50, len(df))):
        sim = cosine_similarity([all_embeddings[i]], [all_embeddings[j]])[0][0]
        if sim > 0.8:
            G.add_edge(i, j, weight=sim)

plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=False, node_size=50, node_color=df['user_type'].map({'student': 'blue', 'alumni': 'green', 'officer': 'red'}))
plt.title(f"User Connection Graph ({selected_embedding})")
plt.show()

In [None]:
# Cell 8: Clustering
def cluster_embeddings(embeddings, n_clusters=5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    return clusters

for col in embedding_columns:
    df[f'cluster_{col}'] = cluster_embeddings(embedding_dict[col])
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=embedding_dict[col][:, 0], y=embedding_dict[col][:, 1], hue=df[f'cluster_{col}'], palette='viridis')
    plt.title(f"User Clusters ({col})")
    plt.show()

In [None]:
# Cell 9: Recommendation Systems Comparison
def analyze_prompt(prompt):
    input_text = f"extract_keywords: {prompt}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(inputs['input_ids'], max_length=50, num_beams=4, early_stopping=True)
    keywords = tokenizer.decode(output_ids[0], skip_special_tokens=True).split(", ")
    prompt_inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)
    with torch.no_grad():
        prompt_embedding = model.encoder(**prompt_inputs).last_hidden_state.mean(dim=1).detach().numpy()[0]
    return keywords, prompt_embedding

def recommend_with_prompt(user_idx, prompt, embedding_col='embeddings_bge-m3', top_n=5):
    print(f"Analyzing prompt: '{prompt}'")
    prompt_keywords, prompt_embedding = analyze_prompt(prompt)
    print(f"Extracted keywords: {prompt_keywords}")

    all_embeddings = embedding_dict[embedding_col]
    index = faiss_indices[embedding_col]
    user_embedding = all_embeddings[user_idx]
    user_type = df.iloc[user_idx]['user_type']
    user_skills = df.iloc[user_idx]['SkillsText'].split(', ')

    # Adjust prompt embedding dimension if needed (for compatibility)
    if prompt_embedding.shape[0] != all_embeddings.shape[1]:
        prompt_embedding = generate_t5_embeddings(prompt)[:all_embeddings.shape[1]]

    distances, indices = index.search(prompt_embedding.reshape(1, -1), top_n + 10)
    candidate_indices = indices[0][indices[0] != user_idx][:top_n + 10]
    candidates = df.iloc[candidate_indices]

    scores = []
    for idx in candidate_indices:
        candidate_embedding = all_embeddings[idx]
        candidate_type = df.iloc[idx]['user_type']
        candidate_skills = df.iloc[idx]['SkillsText'].split(', ')
        candidate_experience = df.iloc[idx]['ExperienceText'].lower()

        prompt_sim = cosine_similarity([prompt_embedding], [candidate_embedding])[0][0]
        user_sim = cosine_similarity([user_embedding], [candidate_embedding])[0][0]

        score = 0.7 * prompt_sim + 0.2 * user_sim
        if user_type == 'student' and candidate_type == 'alumni':
            score += 0.1
        skill_overlap = len(set(prompt_keywords) & set(candidate_skills)) / max(len(prompt_keywords), 1)
        score += 0.15 * skill_overlap
        exp_relevance = sum(1 for kw in prompt_keywords if kw.lower() in candidate_experience) / max(len(prompt_keywords), 1)
        score += 0.05 * exp_relevance

        scores.append((idx, score, prompt_sim, user_sim, skill_overlap, exp_relevance))

    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    top_indices = [s[0] for s in scores]
    top_matches = df.iloc[top_indices][['FirstName', 'LastName', 'user_type', 'Headline', 'SkillsText']]

    explanations = [
        f"Match {i+1}: Score={s[1]:.3f}, Prompt Sim={s[2]:.3f}, User Sim={s[3]:.3f}, Skill Overlap={s[4]:.3f}, Exp Relevance={s[5]:.3f}"
        for i, s in enumerate(scores)
    ]
    return top_matches, explanations

In [None]:
# Cell 10: Evaluation and Comparison (Fixed)

from ipywidgets import interact, widgets

def test_prompt(user_idx, prompt, embedding_model):
    user_name = f"{df.iloc[user_idx]['FirstName']} {df.iloc[user_idx]['LastName']}"
    print(f"User: {user_name} (Type: {df.iloc[user_idx]['user_type']})")
    matches, explanations = recommend_with_prompt(user_idx, prompt, embedding_model)
    display(HTML(f"<b>Top Matches for '{prompt}' using {embedding_model}:</b>"))
    display(matches)
    display(HTML("<b>Explanations:</b>"))
    for exp in explanations:
        print(exp)

user_slider = widgets.IntSlider(min=0, max=len(df)-1, step=1, value=100, description='User Index')
prompt_text = widgets.Text(value="Frontend developer with React experience", description='Prompt')
embedding_dropdown = widgets.Dropdown(options=embedding_columns, value='embeddings_bge-m3', description='Embedding Model')
interact(test_prompt, user_idx=user_slider, prompt=prompt, embedding_model=embedding_dropdown)

In [None]:
# Cell 11: Visualize match quality
def plot_match_quality(user_idx, prompt, embedding_model):
    matches, explanations = recommend_with_prompt(user_idx, prompt, embedding_model)
    scores = [float(exp.split("Score=")[1].split(",")[0]) for exp in explanations]
    labels = [f"Match {i+1}" for i in range(len(scores))]
    
    plt.figure(figsize=(8, 5))
    sns.barplot(x=labels, y=scores, palette='viridis')
    plt.title(f"Match Quality for '{prompt}' ({embedding_model})")
    plt.ylabel("Score")
    plt.ylim(0, 1.5)
    plt.show()

plot_match_quality(100, "Android developer with Kotlin experience", 'embeddings_bge-m3')

In [None]:
#Cell 12: Comparison of Embedding Models
from sklearn.metrics import silhouette_score

def compare_embeddings(prompt, user_idx=100, top_n=5):
    results = {}
    for col in embedding_columns:
        matches, explanations = recommend_with_prompt(user_idx, prompt, col, top_n)
        scores = [float(exp.split("Score=")[1].split(",")[0]) for exp in explanations]
        
        # Cosine similarity consistency (average similarity to prompt embedding)
        prompt_embedding = generate_t5_embeddings(prompt)  # T5 as reference
        top_embeddings = embedding_dict[col][matches.index]
        avg_prompt_sim = np.mean([cosine_similarity([prompt_embedding[:top_embeddings.shape[1]]], [e])[0][0] for e in top_embeddings])
        
        # Clustering quality (silhouette score)
        cluster_labels = df[f'cluster_{col}'].iloc[matches.index]
        if len(set(cluster_labels)) > 1:  # Need at least 2 clusters
            sil_score = silhouette_score(top_embeddings, cluster_labels)
        else:
            sil_score = 0
        
        results[col] = {
            'avg_score': np.mean(scores),
            'avg_prompt_sim': avg_prompt_sim,
            'silhouette_score': sil_score
        }
        print(f"{col}: Avg Score={results[col]['avg_score']:.3f}, Prompt Sim={results[col]['avg_prompt_sim']:.3f}, Silhouette={results[col]['silhouette_score']:.3f}")

    # Plot comparison
    metrics = ['avg_score', 'avg_prompt_sim', 'silhouette_score']
    for metric in metrics:
        plt.figure(figsize=(10, 6))
        sns.barplot(x=list(results.keys()), y=[r[metric] for r in results.values()], palette='viridis')
        plt.title(f"Embedding Model Comparison: {metric}")
        plt.xticks(rotation=45, ha='right')
        plt.ylabel(metric)
        plt.tight_layout()
        plt.show()

    return results

# Test comparison
compare_embeddings("Data scientist with Python experience")