# Get most semantically similar job

In this notebook we search for jobs that are semantically similar and retrieve the most similar jobs, more of a job recommender based on data from [kaggle](https://www.kaggle.com/madhab/jobposts)

In [4]:
import numpy as np
import pandas as pd
import spacy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_distances
from tqdm.notebook import tqdm

nlp = spacy.load('en_core_web_md')

In [2]:
# load the data
data_df = pd.read_csv("../data/data job posts.csv")
data_df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [9]:
# check the missing values
data_df.isna().sum()

jobpost                 0
date                    0
Title                  28
Company                 7
AnnouncementCode    17793
Term                11325
Eligibility         14071
Audience            18361
StartDate            9326
Duration             8203
Location               32
JobDescription       3892
JobRequirment        2522
RequiredQual          484
Salary               9379
ApplicationP           60
OpeningDate           706
Deadline               65
Notes               16790
AboutC               6531
Attach              17442
Year                    0
Month                   0
IT                      0
dtype: int64

In [98]:
# drop the rows without descriptions or titles
data_df = data_df.dropna(subset=['Title', 'JobDescription'])
data_df = data_df.drop("jobpost", axis=1)

In [67]:
# let's get the job title and describtion
titles = data_df['Title'].values
describtions = data_df['JobDescription'].values

In [15]:
# let's build the vectors for the describtion
describtion_vectors = np.zeros((len(describtions), 300))
for i, desc in enumerate(tqdm(nlp.pipe(describtions), total=len(describtions))):
    vector = np.zeros(300,)
    valid_tokens = 0
    for token in desc:
        if not token.is_stop and not token.is_punct and token.has_vector:
            vector += token.vector
            valid_tokens += 1
    vector = vector/valid_tokens if valid_tokens > 1 else vector 
    describtion_vectors[i, :] = vector
print("all jobs were vectorized !")

HBox(children=(FloatProgress(value=0.0, max=15090.0), HTML(value='')))


all jobs were vectorized !


In [103]:
# export the vectors and the new data frame
np.save("jobs_vectors.npy", describtion_vectors)
data_df.to_csv("cleaned_data.csv", index=False)

In [17]:
# now let's build a KNN model
knn = KNeighborsClassifier(weights='distance', metric=lambda v1, v2: cosine_distances([v1], [v2])[0])
knn.fit(describtion_vectors, titles)

KNeighborsClassifier(algorithm='auto', leaf_size=30,
                     metric=<function <lambda> at 0x7faf34bae0e0>,
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [74]:
def sent2vect(text):
    vector = np.zeros(300,)
    valid_tokens = 0
    for token in nlp(text):
        if not token.is_stop and not token.is_punct and token.has_vector:
            vector += token.vector
            valid_tokens += 1
    vector = vector/valid_tokens if valid_tokens > 1 else vector
    return vector

In [68]:
# let's test it !
new_job_desc = "Machine learning engineer"

vector = sent2vect(new_job_desc)
knn.predict(vector.reshape(1, -1))

array(['Data Scientist'], dtype='<U200')

In [100]:
# let's try to build it ourself
def get_top_similar(job_desc, k=5):
    vector = sent2vect(job_desc)
    # get similarity scores
    distances = cosine_distances([vector], describtion_vectors)
    most_similar = np.argsort(distances).flatten()[:k]
    return data_df.iloc[most_similar].to_dict(orient='records')

In [101]:
new_job_desc = "fashion designer"

get_top_similar(new_job_desc)

[{'date': 'Apr 11',
  'Title': 'Designer',
  'Company': 'You and Me',
  'AnnouncementCode': nan,
  'Term': 'Full time',
  'Eligibility': 'All qualified candidates.',
  'Audience': nan,
  'StartDate': nan,
  'Duration': 'Open-ended contract',
  'Location': 'Yerevan, Armenia',
  'JobDescription': 'You and Me is looking for a creative, smart and\r\nextremely motivated experienced Designer, who will be responsible for\r\nvarious creative fashion, underwear, socks design assignments. Preferred\r\ncandidate should have an interest in fashion, graphic design, art and\r\nphotography.',
  'JobRequirment': '- Think creatively and use imagination to produce new ideas;\r\n- Create images and designs by using the traditional hand skills of\r\ndrawing and painting, alongside other techniques, meet design briefs;\r\n- Responsible for fashion, underwear, and socks design as well as\r\npackaging design;\r\n- Responsible for photo editing;\r\n- Filter and effect designs;\r\n- Responsible for font design

Now we can get a retrieve the top 5 jobs that are semantically similar to our query !