In [3]:
# scipy issue with gensim
# https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-gensim

!pip install scipy==1.10.1 numpy gensim scikit-learn matplotlib spacy lxml
!python -m spacy download en_core_web_sm

Collecting lxml
  Downloading lxml-5.2.1-cp310-cp310-macosx_10_9_universal2.whl.metadata (3.4 kB)
Downloading lxml-5.2.1-cp310-cp310-macosx_10_9_universal2.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.2.1


# Paper
https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8595174

# Data source
https://archive.org/download/stackexchange

In [19]:
# Import necessary libraries
import gensim
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
import glob

# Load the English language model
nlp = spacy.load("en_core_web_sm")

In [20]:
all_files = glob.glob("data/*.csv")

# Load job description data
df = []

# Loop through each CSV file, read it into a DataFrame, and append it to the list
for filename in all_files:
    _df = pd.read_csv(filename)
    df.append(_df)

df = pd.concat(df, ignore_index=True)
valid_descriptions = df[~df['description'].isna()]['description']

# Tokenize job descriptions by spacy
tokenized_descriptions = []
for description in tqdm(valid_descriptions):
    tokens = nlp(description)
    tokenized_descriptions.append([token.text for token in tokens if not token.is_stop])

100%|██████████| 300/300 [01:06<00:00,  4.50it/s]


In [22]:
# Load your dataset (or use a preprocessed one)

# Train Word2Vec model
model = Word2Vec(tokenized_descriptions, vector_size=100, window=5, min_count=1, workers=4)
model.train(tokenized_descriptions, total_examples=len(tokenized_descriptions), epochs=10)

(991368, 1353680)

In [23]:
# Visualize word embeddings using t-SNE
def tsne_plot(model):
    labels = []
    wordvecs = []

    for word in model.wv.key_to_index:
        wordvecs.append(model.wv[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=2, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(np.array(wordvecs))

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i], xy=(x[i], y[i]), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()

In [24]:
model.wv.key_to_index

{'*': 0,
 ',': 1,
 '.': 2,
 '\n': 3,
 '\n\n': 4,
 '-': 5,
 ')': 6,
 ':': 7,
 '(': 8,
 '\n\n\n': 9,
 'experience': 10,
 '/': 11,
 'work': 12,
 'Experience': 13,
 'software': 14,
 'team': 15,
 ':*': 16,
 'development': 17,
 'years': 18,
 'time': 19,
 'skills': 20,
 'design': 21,
 '+': 22,
 'systems': 23,
 'solutions': 24,
 'technical': 25,
 '#': 26,
 'including': 27,
 'applications': 28,
 'Software': 29,
 'Work': 30,
 'data': 31,
 'position': 32,
 'working': 33,
 '$': 34,
 'new': 35,
 'environment': 36,
 'business': 37,
 'code': 38,
 'Engineer': 39,
 'support': 40,
 'insurance': 41,
 ';': 42,
 'engineering': 43,
 'requirements': 44,
 'application': 45,
 '5': 46,
 'role': 47,
 ' \n': 48,
 'based': 49,
 'technologies': 50,
 '&': 51,
 'tools': 52,
 'related': 53,
 'company': 54,
 'system': 55,
 'projects': 56,
 'Ability': 57,
 'knowledge': 58,
 'management': 59,
 'best': 60,
 'services': 61,
 'quality': 62,
 'Required': 63,
 '=': 64,
 'information': 65,
 'teams': 66,
 'communication': 67,
 

In [None]:
# Plot t-SNE visualization
tsne_plot(model)

# Example application: similarity between words
similarity = model.wv.similarity('example', 'sentence')
print("Similarity between 'word1' and 'word2':", similarity)