In [1]:
!pip install pandas sentence-transformers scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transfor

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np




In [8]:
# Read both career lists. header=None makes pandas treat the first row of each
# file as data, and the column is labelled "Career" explicitly.
small_list_df, large_list_df = (
    pd.read_csv(csv_name, header=None, names=["Career"])
    for csv_name in ("small_careers_list.csv", "large_careers_list.csv")
)

In [9]:
# Pull the "Career" column out of each frame as a plain Python list.
small_careers, large_careers = (
    frame["Career"].tolist() for frame in (small_list_df, large_list_df)
)

In [10]:
# Load the sentence-embedding model used for semantic matching.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Embed every career name of both lists; convert_to_tensor=True asks encode()
# for tensors, which the cosine-similarity step consumes.
small_embeddings, large_embeddings = (
    model.encode(career_names, convert_to_tensor=True)
    for career_names in (small_careers, large_careers)
)


In [17]:
# For each career in the small list, find the single most semantically similar
# career in the large list and save the pairs to a CSV file.
TOP1_OUTPUT_PATH = "matched_careers_top1.csv"

# One call yields the full similarity matrix: row i holds the cosine similarity
# between small career i and every career in the large list.
similarity_matrix = util.pytorch_cos_sim(small_embeddings, large_embeddings)

# Tensor.argmax keeps the work in torch; np.argpartition would first need a
# NumPy conversion, which is not possible for a tensor living on a GPU.
best_match_indices = similarity_matrix.argmax(dim=1).tolist()

# Pair every small-list career with its best match from the large list.
results = [
    [small_career, large_careers[best_idx]]
    for small_career, best_idx in zip(small_careers, best_match_indices)
]

# Convert the results to a DataFrame
output_df = pd.DataFrame(results, columns=["Career (Small List)", "Matched Careers (Large List)"])

# Save the result to a CSV file
output_df.to_csv(TOP1_OUTPUT_PATH, index=False)

# Report the file that was actually written.
print(f"Matching complete! Results saved to {TOP1_OUTPUT_PATH}")

Matching complete! Results saved to matched_careers.csv


In [13]:
# Two-stage matching: case-insensitive exact matches first, then the top
# semantic matches for every small-list career that had no exact match.
TOP_K = 5
EXACT_MATCHES_PATH = "exact_matches_2.csv"
SEMANTIC_MATCHES_PATH = "semantic_matches_2.csv"

# Index the large list by lowercased name so each exact-match lookup is a dict
# access instead of a scan over the whole large list. Each value keeps the
# original spellings in large-list order.
large_by_lower = {}
for large_career in large_careers:
    large_by_lower.setdefault(large_career.lower(), []).append(large_career)

# Find exact (case-insensitive) matches first
exact_matches = [
    [small_career, ", ".join(large_by_lower[small_career.lower()])]
    for small_career in small_careers
    if small_career.lower() in large_by_lower
]

# Save the exact matches to a CSV file
exact_matches_df = pd.DataFrame(exact_matches, columns=["Career (Small List)", "Exact Matched Careers"])
exact_matches_df.to_csv(EXACT_MATCHES_PATH, index=False)

# Semantic matching reuses small_embeddings / large_embeddings computed in the
# earlier encoding cell rather than reloading the model and re-encoding both
# lists a second time.

# Never request more matches than the large list contains.
top_k = min(TOP_K, len(large_careers))

semantic_matches = []
for i, small_career in enumerate(small_careers):
    # Skip careers that already have an exact match
    if small_career.lower() in large_by_lower:
        continue

    # Compute cosine similarity between this career and all careers in the large list
    similarities = util.pytorch_cos_sim(small_embeddings[i], large_embeddings)

    # topk returns the indices of the highest-scoring careers, best match first
    top_matches = similarities[0].topk(top_k).indices.tolist()

    # Get the corresponding career names from the large list
    matched_careers = [large_careers[idx] for idx in top_matches]

    # Save the career and its matched careers to the semantic matches list
    semantic_matches.append([small_career, ", ".join(matched_careers)])

# Convert the semantic matches to a DataFrame
semantic_matches_df = pd.DataFrame(semantic_matches, columns=["Career (Small List)", "Semantically Matched Careers"])

# Save the semantic matches result to a CSV file
semantic_matches_df.to_csv(SEMANTIC_MATCHES_PATH, index=False)

# Report the files that were actually written.
print(
    f"Matching complete! Exact matches saved to '{EXACT_MATCHES_PATH}' "
    f"and semantic matches saved to '{SEMANTIC_MATCHES_PATH}'"
)

Matching complete! Exact matches saved to 'exact_matches.csv' and semantic matches saved to 'semantic_matches.csv'
