In [None]:
%pip install datasets scikit-learn matplotlib -qq

In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px

# Hugging Face repositories used in this notebook
PROFILES_REPO = "ilsilfverskiold/linkedin_profiles_synthetic"
APPLICATIONS_REPO = "ilsilfverskiold/linkedin_recruitment_questions_embedded"

# Synthetic LinkedIn profiles (the embeddings are shipped with the dataset)
dataset = load_dataset(PROFILES_REPO)
profiles = dataset["train"]

# Anonymous job descriptions with embeddings.
# Note: `dataset` is rebound here, so afterwards it refers to the job descriptions.
dataset = load_dataset(APPLICATIONS_REPO)
applications = dataset["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Display the profiles split loaded above. It carries the different embeddings;
# pick the one you'd like to use in the cells below.
profiles

In [None]:
# Display the applications split loaded above so you can decide which entry
# to use as the search query.
applications

In [None]:
# Index 1 selects the second application; its natural-language text is the query we search with
application = applications[1]
application_text = application["natural_language"]
print("application we're looking for: ", application_text)

In [None]:
# Query vector for the chosen embedding model (mxbai-embed-large-v1 here)
query_embedding_vector = np.array(application["embeddings_mxbai-embed-large-v1"])

# One array per profile. The key below intentionally ends with a space (note the extra space).
embeddings_list = list(map(np.array, profiles["embeddings_mxbai-embed-large-v1 "]))
texts = profiles["text"]

In [None]:
# Let's first try to calculate the cosine similarity (without clustering)
def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (anything ``np.array`` accepts).

    Returns
    -------
    float
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the ratio is undefined; 0.0 is returned instead of NaN so that
        sorting by similarity stays well-defined.
    """
    a = np.array(a)
    b = np.array(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning)
        return 0.0
    return np.dot(a, b) / denominator

# Score every profile embedding against the query, in profile order
similarities = [
    cosine_similarity(query_embedding_vector, profile_embedding)
    for profile_embedding in embeddings_list
]

In [None]:
# Pair each score with its 1-based profile position and the profile text
results = [
    (position, score, profile_text)
    for position, (score, profile_text) in enumerate(zip(similarities, texts), start=1)
]
# Best match first
sorted_results = sorted(results, key=lambda row: row[1], reverse=True)

# Let's display the results as well
print("\nSimilarity Results (sorted from highest to lowest):")
for idx, sim, text in sorted_results[:30]:  # adjust if you want to show more
    # Show only the first 10 words of each profile
    text_preview = ' '.join(text.split()[:10])
    # Rescale the cosine similarity from [-1, 1] to a 0-100 percentage
    percentage = (sim + 1) / 2 * 100
    print(f"Text {idx} similarity: {percentage:.2f}% - Preview: {text_preview}...")


In [None]:
# Let's now try to set up our cluster from the embeddings from the profiles
embeddings_array = np.array(embeddings_list)

num_clusters = 10 # you can pick another number here

# n_init is passed explicitly because its default differs between scikit-learn
# releases; pinning it (together with random_state) keeps the clustering reproducible.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(embeddings_array)

# Cluster id assigned to each profile, aligned with the rows of embeddings_array
cluster_labels = kmeans.labels_

# Project the embeddings to 2-D for plotting. With the default solver selection
# PCA may use a randomized SVD, so the seed is fixed to get the same projection on every run.
pca = PCA(n_components=2, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings_array)

In [None]:
# Let's now see how query fits into the clustering.
# The fitted PCA / KMeans objects are fed a 2-D array, so the single query
# vector is wrapped as one row.
query_embedding_array = np.array(query_embedding_vector)[np.newaxis, :]
reduced_query_embedding = pca.transform(query_embedding_array)

# Ask the fitted KMeans model which cluster the query falls into
query_cluster_label = kmeans.predict(query_embedding_array)[0]
print(f"The query belongs to cluster {query_cluster_label}")

In [None]:
# Let's now visualise the cluster with the query mapped out as well

def _first_words(text, limit=5):
    """Return the first `limit` words of `text`, or the stripped text when it has fewer words."""
    words = text.strip().split()
    return ' '.join(words[:limit]) if len(words) >= limit else text.strip()

# Every profile is a plain data point; the query row is appended further down
labels = ['Data Point'] * len(embeddings_array)

# Short hover texts for the profiles and for the query
truncated_texts = [_first_words(text) for text in texts]
truncated_query_text = _first_words(application_text)

df = pd.DataFrame({
    'Component 1': reduced_embeddings[:, 0],
    'Component 2': reduced_embeddings[:, 1],
    'Cluster': cluster_labels.astype(str),
    'Label': labels,
    'Text': truncated_texts
})

df_query = pd.DataFrame({
    'Component 1': [reduced_query_embedding[0, 0]],
    'Component 2': [reduced_query_embedding[0, 1]],
    'Cluster': [str(query_cluster_label)],
    'Label': ['Query'],
    'Text': [truncated_query_text]
})

# The query becomes the last row of the plotted frame
df = pd.concat([df, df_query], ignore_index=True)

fig = px.scatter(
    df,
    x='Component 1',
    y='Component 2',
    color='Cluster',
    hover_data=['Label', 'Text'],
    # Plotly express treats the values of `symbol` as categories and assigns
    # marker symbols from its default sequence; the explicit symbol_map is what
    # actually draws the query as an "x" and the profiles as circles.
    symbol='Label',
    symbol_map={'Data Point': 'circle', 'Query': 'x'},
    # Relative marker sizes: the query is drawn larger than the profiles
    size=df['Label'].apply(lambda label: 15 if label == 'Query' else 10),
    title='Embedding Clusters Visualization with Truncated Texts'
)

fig.show()

In [None]:
# Let's now do semantic search but only in the correct cluster to see if it helps filter out irrelevant results.
# Positions of the profiles that were assigned to the same cluster as the query
cluster_indices = np.where(cluster_labels == query_cluster_label)[0]

cluster_embeddings = embeddings_array[cluster_indices]
cluster_texts = [texts[i] for i in cluster_indices]

# (profile position, similarity) pairs for the cluster members, best match first
similarities_in_cluster = sorted(
    (
        (profile_idx, cosine_similarity(query_embedding_vector, profile_emb))
        for profile_idx, profile_emb in zip(cluster_indices, cluster_embeddings)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

top_n = 40  # adjust this number if you want to display more matches
top_matches = similarities_in_cluster[:top_n]

print(f"\nTop {top_n} similar texts in the same cluster as the query:")
for idx, sim in top_matches:
    # First 10 words of the profile, looked up by its position in `texts`
    text_preview = ' '.join(texts[idx].split()[:10])
    # Rescale the cosine similarity from [-1, 1] to a 0-100 percentage
    percentage = (sim + 1) / 2 * 100
    print(f"Text {idx+1} similarity: {percentage:.2f}% - Preview: {text_preview}...")