# Word embeddings jobs

In [3]:
import pandas as pd

df = pd.read_csv('processed_data.csv')
data = {
    'Processed_Job_Title': df['Processed_Job_Title'],
    'Processed_Job_Description': df['Processed_Job_Description']
}
tokenized_job_titles = [str(title).split() for title in data['Processed_Job_Title']]
tokenized_job_descriptions = [str(desc).split() for desc in data['Processed_Job_Description']]

In [4]:
from gensim.models import Word2Vec

model = Word2Vec(sentences=tokenized_job_titles + tokenized_job_descriptions, vector_size=100, window=5, min_count=1, workers=4)
job_title_embeddings = [model.wv[token] for token in tokenized_job_titles]
job_description_embeddings = [model.wv[token] for token in tokenized_job_descriptions]


In [6]:
number = len(job_title_embeddings)
print(number)

3087


In [7]:
number_desc = len(job_description_embeddings)
print(number_desc)

3087


# TF-IDF jobs

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

str_tokenized_job_titles = [' '.join(tokens) for tokens in tokenized_job_titles]
str_tokenized_job_descriptions = [' '.join(tokens) for tokens in tokenized_job_descriptions]

In [6]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_job_title_vectors = tfidf_vectorizer.fit_transform(str_tokenized_job_titles)
num_job_title_embeddings = tfidf_job_title_vectors.shape[0]
print("Number of job title embeddings (TF-IDF):", num_job_title_embeddings)

Number of job title embeddings (TF-IDF): 3087


In [7]:
tfidf_job_description_vectors = tfidf_vectorizer.fit_transform(str_tokenized_job_descriptions)
num_job_description_embeddings = tfidf_job_description_vectors.shape[0]
print("Number of job description embeddings (TF-IDF):", num_job_description_embeddings)

Number of job description embeddings (TF-IDF): 3087


# Word embeddings resume

In [3]:
from gensim.models import Word2Vec
import pandas as pd

df = pd.read_csv('./test.csv')
data = {
    'description': df['description']
}
resumes_tokenized = [str(desc).split() for desc in data['description']]
model = Word2Vec(sentences=resumes_tokenized, vector_size=100, window=5, min_count=1, workers=4)
resumes_embeddings = [model.wv[token] for token in resumes_tokenized]
print("len:", len(resumes_embeddings))

len: 3057


# TF-IDF resumes

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

data = pd.read_csv("./test.csv")
text_data = data['description'].tolist()
tokenized_text = [word_tokenize(text) for text in text_data]
str_tokenized_text = [' '.join(tokens) for tokens in tokenized_text]

print(str_tokenized_text)
tfidf_vectorizer = TfidfVectorizer()
tfidf_text_vectors = tfidf_vectorizer.fit_transform(str_tokenized_text)
num_text_embeddings = tfidf_text_vectors.shape[0]
print("Number of text embeddings (TF-IDF):", num_text_embeddings)


['Education science computer information ‘ Experience ‘ simple intuitif interaction concept optimal senior manage large design for client for front public code carry out assurance test ‘ Languages — resume senior in front stage cycle for in skills management cor', 'Certifications certification zend ‘ Education science computer information ‘ Experience interface simple interaction concept optimal senior manage large design for client for front public code ‘ carry out assurance test Languages resume senior in front stage cycle for in skills management a ‘', 'Certifications certification fur introduction international labour Education entrepreneur Experience Languages Profil Projects projet resume ’ in in data science about tara ah skills', 'Education Experience expérience Languages langue Profil bel a Projects projet certification resume skills palais a', 'Education Experience Languages langue Profil Projects projet certification resume skills', 'Education professionnel assurance banque 

# Suggested KNN

In [9]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

svd_job = TruncatedSVD(n_components=100)
job_description_svd = svd_job.fit_transform(tfidf_text_vectors)

svd_text = TruncatedSVD(n_components=100)
text_svd = svd_text.fit_transform(tfidf_job_description_vectors)
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(job_description_svd)
distances, indices = knn.kneighbors(text_svd)
matches = {'Job': [], 'Resume': [], 'Distance': []}
for i, (idx, dist) in enumerate(zip(indices, distances)):
    for neighbor_idx, distance in zip(idx, dist):
        matches['Job'].append(neighbor_idx)
        matches['Resume'].append(i)
        matches['Distance'].append(distance)
matches_df = pd.DataFrame(matches)
matches_df.to_csv('job_resume_matches.csv', index=False)


In [10]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

svd_job = TruncatedSVD(n_components=100)
job_description_svd = svd_job.fit_transform(tfidf_text_vectors)

svd_text = TruncatedSVD(n_components=100)
text_svd = svd_text.fit_transform(tfidf_text_vectors)
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(job_description_svd)
distances, indices = knn.kneighbors(text_svd)
matches = {'Job': [], 'Resume': [], 'Distance': []}
for i, (idx, dist) in enumerate(zip(indices, distances)):
    for neighbor_idx, distance in zip(idx, dist):
        matches['Job'].append(neighbor_idx)
        matches['Resume'].append(i)
        matches['Distance'].append(distance)
matches_df = pd.DataFrame(matches)
matches_df.to_csv('job_resume_matches_cosine.csv', index=False)
