In [2]:
import pandas as pd
df = pd.read_csv('linkedin_title_skills.csv')
first_10k = df.head(10000)
first_10k.to_csv('linkedin_title_skills.csv', index=False)

In [3]:
df=pd.read_csv('linkedin_title_skills.csv')

In [4]:
df.shape

(10000, 2)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
df['skills_text'] = df['job_skills'].fillna('').str.replace(',', ' ')

# Fit TF‑IDF on job skills
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['skills_text'])

# Recommendation function
def recommend_jobs(skills, top_n=5):
    skills_str = ' '.join(skills)
    skills_vec = vectorizer.transform([skills_str])
    scores = cosine_similarity(skills_vec, tfidf_matrix)[0]
    idx = scores.argsort()[::-1][:top_n]
    return pd.DataFrame({
        'job_title': df['job_title'].iloc[idx].values,
        'similarity_score': scores[idx]
    })

# Example skill sets
sample1 = ["Python","Data Analysis","Machine Learning"]
sample2 = ["Graphic Design","Photoshop","Illustrator"]

rec1 = recommend_jobs(sample1)
rec2 = recommend_jobs(sample2)


In [7]:
rec1
# rec2

Unnamed: 0,job_title,similarity_score
0,Research Analyst,0.421115
1,"Senior Analyst / Analyst (Bangkok Based, reloc...",0.420373
2,Senior Machine Learning Engineer - AI,0.402583
3,"Consultant, Business Insights (Small CL Automa...",0.369142
4,Supply Analytics Senior Manager (Bangkok-based...,0.366166


In [19]:
import re
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
# 2) Preprocess skills into lists; handle missing
df['skills_list'] = (
    df['job_skills']
      .fillna('')                                         # no NaN strings :contentReference[oaicite:0]{index=0}
      .apply(lambda s: [sk.strip() for sk in s.split(',') if sk.strip()])
)

# 3) Normalize titles to collapse variants (e.g. “Senior Data Scientist” → “Data Scientist”)
def normalize_title(t):
    t = t.lower()
    # remove common seniority/fellow/lead tokens :contentReference[oaicite:1]{index=1}
    t = re.sub(r'\b(senior|jr|ii|iii|lead|principal|technical|fellow)\b', '', t)
    t = re.sub(r'[^a-z0-9 ]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip().title()

df['norm_title'] = df['job_title'].apply(normalize_title)

# 4) Initialize SBERT on GPU
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')  # 384‑d embeddings :contentReference[oaicite:2]{index=2}

# 5) Embed per-skill and average for each job, handling empty lists
dim = model.get_sentence_embedding_dimension()                    # should be 384 :contentReference[oaicite:3]{index=3}
job_embs = []
for skills in df['skills_list']:
    if skills:
        # embed each skill separately to avoid truncation :contentReference[oaicite:4]{index=4}
        embs = model.encode(skills, convert_to_numpy=True, show_progress_bar=False)
        mean_emb = np.mean(embs, axis=0)
    else:
        # fallback zero‑vector if no skills
        mean_emb = np.zeros(dim)
    job_embs.append(mean_emb)
job_embeddings = np.vstack(job_embs)                             # now shape (n_jobs, 384) :contentReference[oaicite:5]{index=5}

# 6) Group by normalized title: average embeddings across variants
grouped = df.groupby('norm_title').indices
titles, embs = [], []
for title, idxs in grouped.items():
    titles.append(title)
    embs.append(job_embeddings[idxs].mean(axis=0))
emb_matrix = np.vstack(embs)                                     # (n_unique_titles, 384)

# 7) Fit NearestNeighbors on grouped embeddings (cosine distance)
knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')  # fast for ~200 groups :contentReference[oaicite:6]{index=6}
knn.fit(emb_matrix)

# 8) Recommendation function
def recommend_jobs(skills, top_n=2):
    # embed & average query skills :contentReference[oaicite:7]{index=7}
    q_embs = model.encode(skills, convert_to_numpy=True)
    q_vec = np.mean(q_embs, axis=0, keepdims=True)
    dists, idxs = knn.kneighbors(q_vec, n_neighbors=top_n)
    recs = pd.DataFrame({
        'job_title': [titles[i] for i in idxs[0]],
        'similarity': 1 - dists[0]
    })
    return recs

# 9) Test on your 33‑skill set
sample_skills = [
    'NLP','Java','GitHub','MatplotLib','AWS','Time Series','TensorFlow',
    'Data Structures','Machine Learning','Deep Learning','CUDA','NumPy',
    'Data Analysis','MySQL','SQL','Algorithms','Pandas','MongoDB','Python',
    'Git','Mentorship','Teamwork','GraphQL','Excel','Shapash','Keras',
    'Analytical','PyTorch','Scikit-learn','LLMs','Versatility','C++','Power Bi'
]
print(recommend_jobs(sample_skills))


                   job_title  similarity
0               Data Science    0.906729
1  Machine Learning Engineer    0.896980


In [22]:
print(recommend_jobs(['Waterfall', 'Agile - RUP', 'Scrum',
'Windows 2008/XP/2007/2003'
'MS Office suite', 'SharePoint & Project', 'Rational Rose', 'Requisite Pro',
'Test Director', 'Quality Center', 'HP Quality Center',
'Microsoft SQL Server', 'Oracle', 'MS Access',
'.NET Framework', 'ASP.NET'
]))

                                           job_title  similarity
0  Sr Developer Analyst C Java Digital Documentation    0.831221
1              Gis Sr Business Application Developer    0.809661


In [23]:
# after you’ve initialized and (optionally) warmed up the model…
model.save('models/sbert_all-MiniLM-L6-v2')


In [24]:
import joblib

# Save NearestNeighbors
joblib.dump(knn, 'models/knn_cosine.joblib')

# Save the titles list so we know which index corresponds to which title
joblib.dump(titles, 'models/knn_titles.joblib')


['models/knn_titles.joblib']

In [25]:
!zip -r models.zip models/

  adding: models/ (stored 0%)
  adding: models/sbert_all-MiniLM-L6-v2/ (stored 0%)
  adding: models/sbert_all-MiniLM-L6-v2/1_Pooling/ (stored 0%)
  adding: models/sbert_all-MiniLM-L6-v2/1_Pooling/config.json (deflated 57%)
  adding: models/sbert_all-MiniLM-L6-v2/2_Normalize/ (stored 0%)
  adding: models/sbert_all-MiniLM-L6-v2/config.json (deflated 48%)
  adding: models/sbert_all-MiniLM-L6-v2/config_sentence_transformers.json (deflated 34%)
  adding: models/sbert_all-MiniLM-L6-v2/tokenizer_config.json (deflated 73%)
  adding: models/sbert_all-MiniLM-L6-v2/model.safetensors (deflated 9%)
  adding: models/sbert_all-MiniLM-L6-v2/README.md (deflated 64%)
  adding: models/sbert_all-MiniLM-L6-v2/vocab.txt (deflated 53%)
  adding: models/sbert_all-MiniLM-L6-v2/sentence_bert_config.json (deflated 4%)
  adding: models/sbert_all-MiniLM-L6-v2/modules.json (deflated 62%)
  adding: models/sbert_all-MiniLM-L6-v2/special_tokens_map.json (deflated 80%)
  adding: models/sbert_all-MiniLM-L6-v2/tokenizer.

In [26]:
from google.colab import files
files.download('models.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>