# Importing Libraries and pulling dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Importing the dataset

In [None]:
# Load the CSV into a pandas DataFrame.
dataset = pd.read_csv('clustering.csv')
# The 6:7 slice (rather than a bare 6) keeps the result 2-D with one column.
x = dataset.iloc[:, 6:7].values # matrix of the job titles (column index 6)
y = dataset.iloc[:, -1].values # 1-D array of skill sets (last column)

In [None]:
# Normalise the job titles: cast every cell to str and lower-case it.
# np.char.lower is NumPy's element-wise str.lower, so no hand-written
# np.vectorize(lambda ...) wrapper is needed.
x = np.char.lower(np.array(x, dtype=str))
list_of_x = x.tolist()

# Flatten the one-column title matrix into a flat list, in row order.
# ravel().tolist() produces built-in str objects instead of np.str_ scalars,
# so the printed list (and any dict keyed by these titles) shows 'title'
# rather than np.str_('title') under NumPy 2.
flattened_x = x.ravel().tolist()
print(flattened_x)
# print(list_of_x)

In [None]:
# Clean the skill strings and turn every row into a list of skills:
#   1. cast to str and lower-case,
#   2. drop single quotes,
#   3. strip leading whitespace from the start of each row's string
#      (not from each individual skill),
#   4. split on ', '.
# np.char is the public namespace for these string routines; reaching them
# through np.core.defchararray relies on a private path that is deprecated in
# NumPy 2.
y = np.char.lower(np.array(y, dtype=str))
y = np.char.replace(y, "'", "")
# Kept under its original name in case other cells call it.
remove_leading_spaces = np.char.lstrip
y = remove_leading_spaces(y)
# Result is an object array whose elements are Python lists of skill strings.
y = np.char.split(y, ', ')
list_of_y = y.tolist()
print(list_of_y)

In [None]:
# Pair each job title with the skill list from the same row.
# A title that appears in several rows keeps only the skills of its last row,
# because later assignments to the same key overwrite earlier ones.
my_dict = dict(zip(flattened_x, list_of_y))

print(my_dict)

In [None]:
# Materialise the title -> skills mapping as a list of (title, skills) pairs.
result = my_dict.items()
data = list(result)
# The skill lists generally differ in length, so the pairs are ragged.
# dtype=object is required: without it NumPy >= 1.24 raises
# "ValueError: setting an array element with a sequence ... inhomogeneous shape"
# instead of silently building an object array as older versions did.
numpyArray = np.array(data, dtype=object)
print(numpyArray)

# Training model 1

In [None]:
# Fit a Word2Vec model on 'y', treating each row's skill list as one sentence.
# vector_size=2 yields 2-D embeddings; min_count=1 keeps every skill, even
# those that occur only once.
model = Word2Vec(
    sentences=y,
    vector_size=2,
    window=5,
    min_count=1,
    workers=4,
)
print(model.wv)
# Show the learned vector of every vocabulary entry.
for skill in model.wv.index_to_key:
    print(f"{skill}: {model.wv[skill]}")

In [None]:
# Replace every skill name stored in my_dict with its learned embedding.
#
# A single pass over the dictionary with a direct vocabulary lookup replaces
# the former (vocabulary x titles x skills) triple loop. That loop also
# compared each vocabulary word against entries it had already converted to
# vectors; on NumPy >= 1.25 a str == ndarray comparison returns an array, and
# using it in an `if` raises "truth value of an array ... is ambiguous".
#
# New lists are built instead of editing in place, so the skill lists that
# my_dict shares with list_of_y keep their original strings. Entries that are
# not vocabulary strings (for example vectors left behind by an earlier run of
# this cell) are passed through unchanged, which makes the cell safe to re-run.
vocabulary = model.wv.key_to_index
for key, values in my_dict.items():
    my_dict[key] = [
        model.wv[value] if isinstance(value, str) and value in vocabulary else value
        for value in values
    ]

# for key, values in my_dict.items():
#     print(f"{key} : {values}")

print(my_dict)

In [None]:
# Collapse each job title's skill vectors into a single 2-D point by summing
# the first components and the second components separately.
#
# A new dictionary is built on purpose: `new_dict = my_dict` only created a
# second name for the same object, so the per-skill vectors in my_dict were
# overwritten by the sums and re-running the cell failed when it tried to
# index into those sums.
# A title with no skills maps to [0, 0].
new_dict = {
    key: [sum(vector[0] for vector in values), sum(vector[1] for vector in values)]
    for key, values in my_dict.items()
}

for key, values in new_dict.items():
    print(f"{key} : {values}")

In [None]:
# Scatter plot of the aggregated job-title points, one marker per title.
# plt.figure(figsize=(40, 32))
plt.figure(figsize=(20, 16))
for title, point in new_dict.items():
    plt.scatter(point[0], point[1], label=title)

# Write each title next to its marker.
for title, point in new_dict.items():
    plt.annotate(title, xy=(point[0], point[1]))

plt.title('Job Title Word Vectors')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
# plt.legend()
plt.show()

In [None]:
# Scatter plot of the raw skill embeddings, one marker per vocabulary entry.
# plt.figure(figsize=(40, 32))
plt.figure(figsize=(25, 20))
for word in model.wv.index_to_key:
    embedding = model.wv[word]
    plt.scatter(embedding[0], embedding[1], label=word)

# Write each skill name next to its marker.
for word in model.wv.index_to_key:
    embedding = model.wv[word]
    plt.annotate(word, xy=(embedding[0], embedding[1]))

plt.title('Skills Word Vectors')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
# plt.legend()
plt.show()

# Job titles

In [None]:
# Print every entry of every row of 'y', one per line.
for skill_list in y:
    for entry in skill_list:
        print(entry)

In [None]:
# Collect the aggregated [x, y] point of every job title into a list,
# in the insertion order of new_dict.
job_title_vectors = list(new_dict.values())
print(job_title_vectors)

In [None]:
# Standardize the job title vectors so that each dimension has zero mean and
# unit variance.
scaler = StandardScaler()
job_title_vectors_scaled = scaler.fit_transform(job_title_vectors)
# for i, vector in enumerate(job_title_vectors):
#     print(f"Job Title {i + 1} Vector: {vector}")

# Show the standardized result; the cell previously printed the unscaled input
# again, which hid the effect of the scaling step.
print(job_title_vectors_scaled)

In [None]:
# Embed the standardized job title vectors with t-SNE (t-distributed
# Stochastic Neighbor Embedding) into 2 dimensions for plotting.
# scikit-learn requires perplexity < n_samples; the default of 30 therefore
# fails on data sets with 30 or fewer job titles. Clamping keeps the default
# for larger data sets and lets smaller ones run.
tsne_perplexity = float(min(30, len(job_title_vectors_scaled) - 1))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
job_title_tsne = tsne.fit_transform(job_title_vectors_scaled)

# Using the elbow method to find the optimal number of clusters

In [None]:
from sklearn.cluster import KMeans
# Fit K-Means for k = 1..10 and record the within-cluster sum of squares
# (inertia) of each fit; the bend in the resulting curve suggests a good k.
candidate_ks = range(1, 11)
wcss = []
for i in candidate_ks:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    wcss.append(kmeans.fit(job_title_vectors_scaled).inertia_)
plt.plot(candidate_ks, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Apply K-Means clustering with 4 clusters to the standardized job title vectors.
# NOTE(review): this comment previously said 8 clusters while the code uses 4 —
# confirm the intended value against the elbow plot.
kmeans = KMeans(n_clusters=4, random_state=42)
# One cluster index per job title vector, in the same order as the input rows.
clusters = kmeans.fit_predict(job_title_vectors_scaled)

In [None]:
from sklearn.metrics import silhouette_score

# The silhouette score is computed from the feature vectors and the cluster
# assignments alone; no ground-truth labels are involved.
# 'clusters' are the cluster assignments obtained from the K-Means fit.

# Evaluate clustering using the mean silhouette coefficient over all samples
# (ranges from -1 to 1; higher means better-separated clusters).
silhouette_avg = silhouette_score(job_title_vectors_scaled, clusters)
print(f"Silhouette Score: {silhouette_avg}")

In [None]:
# Show the t-SNE embedding of the job titles, coloured by K-Means cluster.
plt.figure(figsize=(10, 8))
tsne_dim1 = job_title_tsne[:, 0]
tsne_dim2 = job_title_tsne[:, 1]
scatter = plt.scatter(tsne_dim1, tsne_dim2, c=clusters, cmap='viridis')
plt.title('KMeans Clusters of Job Titles')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
# One legend entry per distinct cluster colour.
legend_handles, legend_labels = scatter.legend_elements()
plt.legend(legend_handles, legend_labels, title='Clusters')
plt.show()


In [None]:
# Notebook cell output: the cluster index K-Means assigned to each job title vector.
clusters