In [None]:
# Cell 1: Install dependencies (run once)
%pip install numpy pandas transformers scikit-learn faiss-cpu gensim datasets networkx matplotlib seaborn tqdm ipython sentencepiece torch torchvision torchaudio

: 

In [None]:
# Cell 2: Import libraries
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import networkx as nx
from gensim.models import Word2Vec
from datasets import load_dataset
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import ipywidgets as widgets

%matplotlib inline 

In [None]:
# Cell 3: Load and preprocess data
# Load the synthetic LinkedIn profile dataset and keep a small sample so the
# T5 passes in later cells stay fast.
print("Loading synthetic dataset...")
dataset = load_dataset("ilsilfverskiold/linkedin_profiles_synthetic")
train_split = dataset['train']
sample_size = min(500, len(train_split))  # Limit to 500 rows for speed; tolerate a smaller split
df = pd.DataFrame(train_split.select(range(sample_size)))

# Assign a random role to every profile. A seeded generator keeps the
# assignment (and everything derived from it) identical across
# "Restart Kernel -> Run All".
user_type_rng = np.random.default_rng(42)
df['user_type'] = user_type_rng.choice(['student', 'alumni', 'officer'], len(df))
display(df.head())  # Display sample data

In [None]:
# Cell 4: T5 setup for NLP
# The tokenizer and the seq2seq model are both loaded from the same checkpoint
# name; they are reused by every summarisation / embedding helper below.
T5_CHECKPOINT = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

def summarize_profile(text):
    """Return a T5-generated summary of ``text``.

    The text is prefixed with ``"summarize: "``, tokenized (truncated to 512
    tokens) and decoded from a 4-beam generation capped at 50 tokens.

    If any step raises, the error is printed and the original ``text`` is
    returned unchanged, so a single bad row does not abort a bulk run.
    """
    try:
        encoded = tokenizer(
            f"summarize: {text}",
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generated = model.generate(
            encoded['input_ids'],
            max_length=50,
            num_beams=4,
            early_stopping=True,
        )
        summary = tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error summarizing: {e}")
        return text
    return summary

# Summarise every headline, showing a progress bar while pandas applies the helper.
print("Summarizing profiles...")
tqdm.pandas()  # registers DataFrame/Series.progress_apply
headline_summaries = df['Headline'].progress_apply(summarize_profile)
df['summary'] = headline_summaries
display(df[['Headline', 'summary']].head())

In [None]:
# Cell 5: Generate embeddings
def generate_t5_embeddings(text):
    """Embed ``text`` with the T5 encoder.

    The text is tokenized (padded, truncated to 128 tokens), passed through
    ``model.encoder`` and the encoder's ``last_hidden_state`` is averaged over
    dimension 1 (presumably the token axis -- confirm against the model docs).

    Returns the first row of the pooled result as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    hidden_states = model.encoder(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()[0]

print("Generating T5 embeddings...")
# Embed the raw headline first, then its generated summary.
for source_col, target_col in (('Headline', 'headline_embedding'), ('summary', 'summary_embedding')):
    df[target_col] = [generate_t5_embeddings(text) for text in tqdm(df[source_col])]


def _average_embeddings(row):
    """Element-wise mean of a row's headline and summary embeddings."""
    return np.mean([row['headline_embedding'], row['summary_embedding']], axis=0)


# Combine embeddings
df['combined_embedding'] = df.apply(_average_embeddings, axis=1)
embedding_matrix = np.stack(df['combined_embedding'].values)
all_embeddings = normalize(embedding_matrix)  # sklearn normalize with its default settings
print(f"Embeddings shape: {all_embeddings.shape}")

In [None]:
# Cell 6: FAISS index for similarity search
# A flat (exhaustive) L2 index sized to the embedding width, filled with every
# profile embedding.
embedding_dim = all_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(all_embeddings)
print("FAISS index created with", index.ntotal, "vectors.")

In [None]:
# Cell 7: Graph-based matching
# Every profile becomes a node. Two profiles are linked when the cosine
# similarity of their embeddings exceeds SIMILARITY_THRESHOLD. To keep the cell
# fast, each profile is only compared with the rows that follow it inside a
# window of NEIGHBOR_WINDOW rows.
NEIGHBOR_WINDOW = 50
SIMILARITY_THRESHOLD = 0.8

G = nx.Graph()
for i, row in df.iterrows():
    G.add_node(i, user_type=row['user_type'], embedding=row['combined_embedding'])

print("Building graph edges...")
n_profiles = len(df)
for i in tqdm(range(n_profiles)):
    window_end = min(i + NEIGHBOR_WINDOW, n_profiles)  # Limit comparisons for speed
    if i + 1 >= window_end:
        continue  # no later rows left to compare against
    # One similarity call per row (against its whole window) instead of one
    # call per (i, j) pair; the per-call validation overhead dominated before.
    window_sims = cosine_similarity(all_embeddings[i:i + 1], all_embeddings[i + 1:window_end])[0]
    for offset, sim in enumerate(window_sims):
        if sim > SIMILARITY_THRESHOLD:
            G.add_edge(i, i + 1 + offset, weight=sim)

# Visualize the whole graph, colouring each node by its user type.
# Colours are built in G's own node order so they cannot get misaligned.
type_colors = {'student': 'blue', 'alumni': 'green', 'officer': 'red'}
node_colors = [type_colors[G.nodes[node]['user_type']] for node in G.nodes]
plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=False, node_size=50, node_color=node_colors)
plt.title("User Connection Graph")
plt.show()

In [None]:
# Cell 8: Clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise change the clusters (and emit a warning)
# depending on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(all_embeddings)

# Visualize clusters
# NOTE: the axes are simply the first two embedding coordinates, not a learned
# 2-D projection, so visual separation here is only indicative.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=all_embeddings[:, 0], y=all_embeddings[:, 1], hue=df['cluster'], palette='viridis')
plt.title("User Clusters")
plt.xlabel("Embedding dimension 0")
plt.ylabel("Embedding dimension 1")
plt.show()

In [None]:
# Cell 9: Enhanced recommendation function with prompt analysis
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to analyze prompt and extract requirements
def analyze_prompt(prompt):
    """Derive keywords and an embedding from a free-text prompt.

    The prompt is prefixed with ``"extract_keywords: "`` and run through T5
    generation (4 beams, at most 50 tokens); the decoded text is split on
    ``", "`` to form the keyword list. The embedding comes from
    ``generate_t5_embeddings`` applied to the raw prompt.

    Returns:
        tuple: ``(keywords, prompt_embedding)``.
    """
    encoded = tokenizer(
        f"extract_keywords: {prompt}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = model.generate(encoded['input_ids'], max_length=50, num_beams=4, early_stopping=True)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    keywords = decoded.split(", ")

    # Generate prompt embedding
    return keywords, generate_t5_embeddings(prompt)

# Enhanced recommendation function with prompt
def recommend_with_prompt(user_idx, prompt, top_n=5):
    """Recommend profiles for a user, guided by a free-text prompt.

    Candidates are retrieved from the FAISS ``index`` using the prompt
    embedding, then re-scored as
    ``0.6 * prompt similarity + 0.4 * user similarity``, plus ``0.1`` when a
    student is matched with an alumni and ``0.2 * keyword overlap``.

    Args:
        user_idx: Positional row of the requesting user in ``df`` /
            ``all_embeddings``.
        prompt: Free-text description of who the user wants to meet.
        top_n: Maximum number of recommendations to return.

    Returns:
        tuple: ``(top_matches, explanations)`` where ``top_matches`` is a
        DataFrame with ``FirstName``, ``LastName``, ``user_type`` and
        ``Headline`` for the best candidates (best first) and ``explanations``
        is a list of one formatted score-breakdown string per match.
    """
    # Step 1: Analyze prompt
    print(f"Analyzing prompt: '{prompt}'")
    prompt_keywords, prompt_embedding = analyze_prompt(prompt)
    print(f"Extracted keywords: {prompt_keywords}")

    # Headline tokens are lower-cased below, so keywords must be normalised the
    # same way or the overlap is (almost) always zero. Blank entries are dropped.
    keyword_set = {kw.strip().lower() for kw in prompt_keywords if kw and kw.strip()}

    # Step 2: Get user profile
    user_embedding = all_embeddings[user_idx]
    user_type = df.iloc[user_idx]['user_type']

    # Step 3: Search FAISS for content-based matches
    n_candidates = min(top_n + 10, index.ntotal)  # Extra candidates, capped at index size
    if n_candidates > 0:
        query = np.ascontiguousarray(prompt_embedding, dtype=np.float32).reshape(1, -1)
        _, indices = index.search(query, n_candidates)
        # The prompt itself is not stored in the index, so the first hit is a
        # genuine (and the closest) profile and must be kept. Drop FAISS's -1
        # padding and never recommend the user to themself.
        candidate_indices = [int(idx) for idx in indices[0] if idx >= 0 and idx != user_idx]
    else:
        candidate_indices = []

    # Step 4: Score candidates based on prompt and user profile
    scores = []
    for idx in candidate_indices:
        candidate_embedding = all_embeddings[idx]
        candidate_headline = df.iloc[idx]['Headline']
        candidate_type = df.iloc[idx]['user_type']

        # Similarity scores
        prompt_sim = cosine_similarity([prompt_embedding], [candidate_embedding])[0][0]
        user_sim = cosine_similarity([user_embedding], [candidate_embedding])[0][0]

        # Weighted scoring (adjust weights as needed)
        score = 0.6 * prompt_sim + 0.4 * user_sim

        # Bonus for user type alignment (e.g., student -> alumni)
        if user_type == 'student' and candidate_type == 'alumni':
            score += 0.1

        # Bonus for keyword overlap (fraction of prompt keywords found in the headline)
        headline_text = candidate_headline if isinstance(candidate_headline, str) else ""
        headline_tokens = set(headline_text.lower().split())
        if keyword_set:
            keyword_overlap = len(keyword_set & headline_tokens) / len(keyword_set)
        else:
            keyword_overlap = 0.0  # no usable keywords: no bonus, no division by zero
        score += 0.2 * keyword_overlap

        scores.append((idx, score, prompt_sim, user_sim, keyword_overlap))

    # Step 5: Rank and filter top matches
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
    top_indices = [s[0] for s in scores]
    top_matches = df.iloc[top_indices][['FirstName', 'LastName', 'user_type', 'Headline']]

    # Step 6: Prepare explanations
    explanations = [
        (
            f"Match {rank}: Score={score:.3f}, "
            f"Prompt Similarity={prompt_sim:.3f}, "
            f"User Similarity={user_sim:.3f}, "
            f"Keyword Overlap={keyword_overlap:.3f}"
        )
        for rank, (_, score, prompt_sim, user_sim, keyword_overlap) in enumerate(scores, start=1)
    ]

    return top_matches, explanations

In [None]:
# Cell 10: Interactive prompt input and testing
from ipywidgets import interact, widgets

# Define a function for interactive testing
def test_prompt(user_idx, prompt):
    """Run a recommendation for one user/prompt pair and display the result.

    Prints the user's name and type, then displays the table of top matches
    followed by one score-breakdown line per match.

    Args:
        user_idx: Positional row of the user in ``df``.
        prompt: Free-text request typed by the user.
    """
    from html import escape  # local import: only needed for the HTML heading below

    user_row = df.iloc[user_idx]
    user_name = f"{user_row['FirstName']} {user_row['LastName']}"
    print(f"User: {user_name} (Type: {user_row['user_type']})")
    matches, explanations = recommend_with_prompt(user_idx, prompt)

    # The prompt is free text from a widget; escape it so characters such as
    # '<' or '&' are shown literally instead of being interpreted as markup.
    display(HTML(f"<b>Top Matches for '{escape(prompt)}':</b>"))
    display(matches)
    display(HTML("<b>Explanations:</b>"))
    for exp in explanations:
        print(exp)

# Create interactive widget
# continuous_update=False makes the widgets fire only when the slider is
# released / Enter is pressed. Each callback runs T5 generation, so reacting to
# every keystroke or slider tick would queue many slow, useless runs.
user_slider = widgets.IntSlider(
    min=0,
    max=len(df) - 1,
    step=1,
    value=min(100, len(df) - 1),  # stay inside the slider range for small dataframes
    description='User Index',
    continuous_update=False,
)
prompt_text = widgets.Text(
    value="Any Android Developer to connect with?",
    description='Prompt',
    continuous_update=False,
)
interact(test_prompt, user_idx=user_slider, prompt=prompt_text);

In [None]:
# Cell 11: Visualize match quality
def plot_match_quality(user_idx, prompt):
    """Draw a bar chart of the recommendation scores for one user/prompt pair.

    Scores are recovered from the explanation strings returned by
    ``recommend_with_prompt`` (the text between ``"Score="`` and the next
    comma), so they carry the three-decimal precision of those strings.
    """
    _, explanations = recommend_with_prompt(user_idx, prompt)

    scores = []
    labels = []
    for position, explanation in enumerate(explanations, start=1):
        score_text = explanation.split("Score=")[1].split(",")[0]
        scores.append(float(score_text))
        labels.append(f"Match {position}")

    plt.figure(figsize=(8, 5))
    sns.barplot(x=labels, y=scores, palette='viridis')
    plt.title(f"Match Quality for '{prompt}'")
    plt.ylabel("Score")
    plt.ylim(0, 1.5)  # Adjust based on score range
    plt.show()

# Test visualization
# Example run: chart the scores for the user at row 100 with a sample prompt.
demo_user_idx = 100
demo_prompt = "Any Android Developer to connect with?"
plot_match_quality(demo_user_idx, demo_prompt)