In [1]:
%pip install pandas numpy scikit-learn networkx datasets faiss-cpu gensim datasets

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy
  Using cached numpy-2.2.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting networkx
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.4 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.1 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (61

In [2]:
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import faiss
from gensim.models import Word2Vec
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load dataset from Hugging Face
# `dataset` is the object returned by load_dataset; only its 'train' split is used.
# Dataset.to_pandas() converts the split directly instead of having the
# DataFrame constructor iterate over it row by row.
dataset = load_dataset("ilsilfverskiold/linkedin_profiles_synthetic")
df = dataset['train'].to_pandas()

In [4]:
# Feature Selection
# Free-text profile columns; they are concatenated into `combined_text`
# for the TF-IDF model further down.
text_features = [
    'Headline',
    'About Me',
    'Experience',
    'Education',
    'Skills',
    'Certifications',
]

def generate_word2vec_embeddings(df, feature, vector_size=100, window=5,
                                 min_count=1, workers=4, seed=1):
    """Train a Word2Vec model on one column and attach a mean-vector embedding.

    Each cell of ``df[feature]`` is split on ``", "`` into tokens, a Word2Vec
    model is trained on those token lists, and every row receives the mean of
    its tokens' vectors in a new ``f"{feature}_embedding"`` column.

    Args:
        df: DataFrame containing ``feature``. It is modified in place (the
            embedding column is added) and also returned.
        feature: Name of the column to embed.
        vector_size: Dimensionality of the word vectors and of the row
            embeddings.
        window, min_count, workers, seed: Forwarded to ``Word2Vec``.
            NOTE(review): with ``workers > 1`` training is presumably not
            reproducible run-to-run even with a fixed seed; confirm against
            the gensim documentation if reproducibility matters.

    Returns:
        Tuple ``(df, model)``: the same DataFrame and the trained model.
    """
    # Tokenise *every* row. Missing / non-string cells become an empty token
    # list so the resulting Series stays aligned with df; dropping them would
    # leave NaN in the embedding column after assignment. Empty tokens
    # (e.g. from "") are discarded so "" is never learned as a word.
    sentences = df[feature].apply(
        lambda x: [tok for tok in x.split(", ") if tok] if isinstance(x, str) else []
    )
    model = Word2Vec(
        sentences.tolist(),
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

    # Fallback for rows without any known token: same length as the word
    # vectors, float32 so it does not promote the stacked matrix to float64.
    zero_vector = np.zeros(vector_size, dtype=np.float32)

    def _mean_vector(words):
        """Average the vectors of the known words, or return zeros if none."""
        vectors = [model.wv[w] for w in words if w in model.wv]
        if not vectors:
            return zero_vector.copy()
        return np.mean(vectors, axis=0)

    df[f'{feature}_embedding'] = sentences.apply(_mean_vector)
    return df, model

# Train one Word2Vec model per profile section (Skills, Experience, Education).
# Each call adds a '<section>_embedding' column to df and hands back the model.
df, skills_model = generate_word2vec_embeddings(df=df, feature='Skills')
df, experience_model = generate_word2vec_embeddings(df=df, feature='Experience')
df, education_model = generate_word2vec_embeddings(df=df, feature='Education')

# Combine all embeddings
def combine_embeddings(row):
    """Return the element-wise mean of a row's three section embeddings.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'Skills_embedding', 'Experience_embedding' and
            'Education_embedding'.

    Returns:
        The average of the three vectors, computed along axis 0.
    """
    section_vectors = [
        row['Skills_embedding'],
        row['Experience_embedding'],
        row['Education_embedding'],
    ]
    return np.asarray(section_vectors).mean(axis=0)
df['combined_embedding'] = df.apply(combine_embeddings, axis=1)

# Normalize embeddings
# Rows are scaled by sklearn's normalize() so the inner-product FAISS index
# built later scores them by cosine similarity. The result is stored as a
# C-contiguous float32 matrix, which is the layout FAISS works with; without
# the cast the matrix can end up float64.
all_embeddings = np.stack(df['combined_embedding'].values)
all_embeddings = np.ascontiguousarray(normalize(all_embeddings), dtype=np.float32)

# Save to CSV
# NOTE(review): the *_embedding columns hold NumPy arrays, which CSV can only
# store as text; reading this file back will not restore them as arrays.
# Persist `all_embeddings` separately (e.g. np.save) if they must be reloaded.
df.to_csv("enhanced_profiles.csv", index=False)



In [None]:
# Initialize FAISS Index
# Flat inner-product index holding every profile embedding.
dimension = all_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(all_embeddings)

# TF-IDF for text-based similarity
# Missing text is replaced by '' and the text columns of each profile are
# joined with single spaces into one document before vectorising.
df['combined_text'] = (
    df[text_features]
    .fillna('')
    .agg(' '.join, axis=1)
)
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])



In [None]:
# Cosine Similarity (computed on demand inside get_text_similarity_recommendations below)

In [17]:
# Recommendation Functions
def get_top_matches(user_idx, top_n=5):
    """Return the profiles whose embeddings are closest to a given profile.

    Queries the module-level FAISS ``index`` with the embedding of
    ``user_idx`` and returns up to ``top_n`` other profiles.

    Args:
        user_idx: Positional row index of the query profile in
            ``all_embeddings`` / ``df``.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the 'FirstName', 'LastName' and 'Headline' columns of
        the matched rows, in the order returned by the index.

    Raises:
        IndexError: If ``user_idx`` is out of range.
    """
    # Resolve negative indices to an absolute position so the query can be
    # compared against the ids returned by the index.
    self_pos = range(all_embeddings.shape[0])[user_idx]
    query_embedding = all_embeddings[self_pos].reshape(1, -1)

    # Ask for one extra hit because the query profile is itself in the index.
    _distances, indices = index.search(query_embedding, top_n + 1)
    hits = indices[0]

    # Remove the query by id rather than by position: with tied scores
    # (duplicate or all-zero embeddings) it is not guaranteed to be hit #0.
    # Negative ids are also dropped -- presumably FAISS pads with -1 when it
    # has fewer than k vectors (see FAISS docs); iloc[-1] would otherwise
    # silently select the last row.
    keep = (hits >= 0) & (hits != self_pos)
    similar_indices = hits[keep][:top_n]
    return df.iloc[similar_indices][['FirstName', 'LastName', 'Headline']]

def get_text_similarity_recommendations(user_idx, top_n=5):
    """Return the profiles whose TF-IDF text is most similar to a given profile.

    Computes the cosine similarity between row ``user_idx`` of the
    module-level ``tfidf_matrix`` and every row, then returns the best
    ``top_n`` other profiles.

    Args:
        user_idx: Positional row index of the query profile in
            ``tfidf_matrix`` / ``df``.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with the 'FirstName', 'LastName' and 'Headline' columns of
        the matched rows, most similar first.

    Raises:
        IndexError: If ``user_idx`` is out of range.
    """
    query_tfidf = tfidf_matrix[user_idx]
    similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Absolute position of the query (also resolves negative indices).
    self_pos = range(similarities.shape[0])[user_idx]

    # Rank all profiles by descending similarity; the stable sort keeps tied
    # scores in row order so results are deterministic.
    ranked = np.argsort(-similarities, kind='stable')

    # Exclude the query by id. Slicing off "the best hit" is not equivalent:
    # when other profiles tie with the query, the query may not be ranked
    # first and would be returned as its own recommendation.
    similar_indices = ranked[ranked != self_pos][:max(top_n, 0)]
    return df.iloc[similar_indices][['FirstName', 'LastName', 'Headline']]

def print_combined_recommendations(user_idx, user_name, top_n=20):
    """Print embedding-based and TF-IDF-based recommendations for one profile.

    Args:
        user_idx: Positional row index of the query profile.
        user_name: Display name of the query profile. Not used by this
            function at present; kept so existing callers keep working.
        top_n: Number of recommendations requested from each recommender.
            The default is 20, the size that was previously hard-coded in the
            body, so calls that omit ``top_n`` print the same output as
            before; an explicit ``top_n`` is now honoured instead of ignored.

    Returns:
        None. Results are written to standard output.
    """
    print("Content-Based Recommendations:")
    print(get_top_matches(user_idx, top_n=top_n))
    print("\nText-Based Recommendations:")
    print(get_text_similarity_recommendations(user_idx, top_n=top_n))



In [18]:
# Test Recommendation System on an Existing User
# Row position of the profile used as the query for this demo run.
existing_user_idx = 900
existing_user_name = (
    f"{df['FirstName'].iloc[existing_user_idx]} {df['LastName'].iloc[existing_user_idx]}"
)
print_combined_recommendations(existing_user_idx, existing_user_name)

Content-Based Recommendations:
      FirstName     LastName  \
1173     Niklas     Svensson   
1725       Hans      Nielsen   
654         Ulf       Xavier   
1854     Gudrun       Xander   
1580      Jakob      Bergman   
6089    Winston     Grimstad   
2999     Dagmar       Madsen   
2183  Christian     Petersen   
4144     Astrid      Eriksen   
2058       Elsa  Ingemarsson   
5676    Nicolas       Muller   
4104     Casper      Eriksen   
3843      Peder     Petersen   
5727      Hanna      Ziegler   
6243      Sofia     Yildirim   
96       Julian        Yager   
856       Robin  Vestergaard   
5336     Jesper      Ullerup   
47      Theodor       Yassen   
2298      Bjorn    Johansson   

                                               Headline  
1173   Data Scientist | Predictive Modeling & Analytics  
1725     Chief Executive Officer (CEO) at NordicTech AB  
654         Commercial Co-founder at Element 7 Ventures  
1854  D2C Manager | Direct-to-Consumer E-commerce Ex...  
1580  