In [None]:
# scipy issue with gensim
# https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-gensim

!pip install scipy==1.10.1 numpy gensim scikit-learn matplotlib spacy
!python -m spacy download en_core_web_sm

In [None]:
# Import necessary libraries
import gensim
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

# Instantiate spaCy's small English pipeline; `nlp` is used below for tokenization
nlp = spacy.load(name="en_core_web_sm")

In [None]:
# Load job description data and keep only rows that actually have a description
df = pd.read_csv("jobspy.csv")
valid_descriptions = df['description'].dropna()

# Tokenize job descriptions with spaCy, dropping stop words.
# Only the token text and the lexical `is_stop` flag are needed, so we call
# `nlp.make_doc` (tokenizer only) instead of `nlp(...)`, which would also run
# every other pipeline component on each description.
# NOTE: punctuation and whitespace tokens are kept, exactly as before.
tokenized_descriptions = []
for description in tqdm(valid_descriptions):
    doc = nlp.make_doc(description)
    tokenized_descriptions.append([token.text for token in doc if not token.is_stop])

In [None]:
# Train a Word2Vec model on the tokenized job descriptions.
# Passing the corpus to the constructor builds the vocabulary AND trains the
# model, so no separate `model.train(...)` call is needed; calling it as well
# would train a second time on top of the constructor's own passes.
# The number of passes is set once, here, via `epochs`.
model = Word2Vec(
    sentences=tokenized_descriptions,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
)

# Get word embeddings (the KeyedVectors holding one vector per vocabulary word)
word_embeddings = model.wv

# Example of accessing word embeddings.
# Guarded: indexing a word that never occurred in the corpus raises KeyError.
if 'example' in word_embeddings:
    print("Embedding for 'example':", word_embeddings['example'])
else:
    print("'example' is not in the model vocabulary")

# Visualize word embeddings using t-SNE
def tsne_plot(model, max_words=None, perplexity=2):
    """Project the model's word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        model: Object exposing ``model.wv`` (word -> vector lookup) and
            ``model.wv.key_to_index`` (the vocabulary), e.g. a trained Word2Vec model.
        max_words: Optional cap on the number of words plotted, taken in
            ``key_to_index`` order. ``None`` (the default) plots the whole
            vocabulary, which is the original behavior.
        perplexity: t-SNE perplexity. Defaults to 2, the value previously
            hard-coded. It is lowered automatically when the vocabulary is too
            small, because t-SNE needs perplexity < number of samples.

    Raises:
        ValueError: If fewer than two words are available to plot.
    """
    import inspect  # local import: only needed for the TSNE signature check below

    labels = list(model.wv.key_to_index)
    if max_words is not None:
        labels = labels[:max_words]
    if len(labels) < 2:
        raise ValueError("tsne_plot needs at least two words in the vocabulary")

    wordvecs = np.array([model.wv[word] for word in labels])

    # scikit-learn renamed TSNE's `n_iter` argument to `max_iter`; pick whichever
    # name the installed version accepts so the call works on both.
    tsne_arg_names = inspect.signature(TSNE).parameters
    iter_kwarg = "max_iter" if "max_iter" in tsne_arg_names else "n_iter"

    tsne_model = TSNE(
        perplexity=min(perplexity, len(labels) - 1),
        n_components=2,
        init='pca',
        random_state=23,
        **{iter_kwarg: 2500},
    )
    new_values = tsne_model.fit_transform(wordvecs)

    fig, ax = plt.subplots(figsize=(16, 16))
    # One scatter call for all points instead of one call per point
    # (points now share a single color rather than cycling through the palette).
    ax.scatter(new_values[:, 0], new_values[:, 1])
    for label, (x, y) in zip(labels, new_values):
        ax.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    ax.set_title("t-SNE projection of word embeddings")
    ax.set_xlabel("t-SNE dimension 1")
    ax.set_ylabel("t-SNE dimension 2")
    plt.show()

# Plot t-SNE visualization
tsne_plot(model)

# Example application: similarity between two words.
# The words are named once so the printed label always matches what was compared,
# and the lookup is guarded because a word missing from the corpus raises KeyError.
word_a, word_b = 'example', 'sentence'
missing_words = [word for word in (word_a, word_b) if word not in model.wv]
if missing_words:
    print("Cannot compute similarity; not in vocabulary:", missing_words)
else:
    similarity = model.wv.similarity(word_a, word_b)
    print(f"Similarity between '{word_a}' and '{word_b}':", similarity)
