**Title**: CLUSTER ANALYSIS

**Description**: Categorization of job titles in the job history dataset using a clustering approach

**Author**: Benedict Ibe

**Date Created**: 18/07/2023

In [1]:
pip install kmodes pandas

In [2]:
pip install category_encoders

In [3]:
#Importation of libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
import os

# Limit OpenMP to a single thread for this process.
# NOTE(review): numpy is imported in an earlier cell; confirm the limit is still
# honoured when set this late.
os.environ.update({"OMP_NUM_THREADS": "1"})

In [5]:
%%pyspark

# Switch the session to the LEGACY time parser policy, then load the
# crimson_skilllevel table into `candidate_skilllevel`.
# Every column is renamed to a Cand_* alias; the skill name is lower-cased in SQL.
# The query is a single string literal joined with backslash continuations, so
# the spacing inside it is part of the SQL text.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
candidate_skilllevel= spark.sql("SELECT \
Id                                          as Cand_ID, \
recruit_candidatecontact                    as Candidate_contactno, \
crimson_proficiency                         as Cand_proficiency, \
crimson_requirement                         as Cand_requirement, \
crimson_experience                          as Cand_experience, \
crimson_experienceperiod                    as Cand_exp_period, \
crimson_skill                               as Cand_skill, \
crimson_name                                as Cand_name, \
crimson_skilllevelid                        as Cand_skilllevel_ID, \
lower(crimson_skillname)                    as Cand_skill_name, \
crimson_level                               as Cand_level \
FROM dataverse_edensmithcon_org87f26120.crimson_skilllevel")

In [6]:
# Preview the skill-level frame (show() defaults spelled out explicitly)
candidate_skilllevel.show(n=20, truncate=True)

In [7]:
%%pyspark

# Set the LEGACY time parser policy again (also set in the skill-level cell),
# then load the crimson_workhistory table into `candidate_workhist`.
# Columns are renamed to candidate-oriented aliases; the work description is
# lower-cased in SQL. Start/end dates are selected as-is and converted in later cells.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
candidate_workhist =spark.sql("SELECT \
Id                                  as ID , \
recruit_candidatecontact            as Candidate_contact,\
crimson_startdate                   as Candidate_workstrt_date,\
crimson_enddate                     as Candidate_workend_date, \
crimson_workhistoryid               as WorkHistory_ID,\
lower(crimson_description)          as Candidate_work_description,\
crimson_jobtitle                    as Candidate_job_title ,\
crimson_name                        as Cand_workplace \
FROM dataverse_edensmithcon_org87f26120.crimson_workhistory")

In [8]:
from pyspark.sql.functions import to_date

In [9]:
# Add a date-typed copy of the work start date as Cnd_start_date
df_hist = candidate_workhist.withColumn(
    "Cnd_start_date",
    to_date(candidate_workhist["Candidate_workstrt_date"]),
)
df_hist.show(n=20, truncate=True)

In [10]:
# Add a date-typed copy of the work end date as Cnd_end_date
df_hist2 = df_hist.withColumn(
    "Cnd_end_date",
    to_date(df_hist["Candidate_workend_date"]),
)
df_hist2.show(n=20, truncate=True)

In [11]:
# Keep only the identifier, job-title and parsed date columns
df_hist3 = df_hist2.select(
    [
        'ID',
        'WorkHistory_ID',
        'Candidate_contact',
        'Candidate_job_title',
        'Cnd_start_date',
        'Cnd_end_date',
    ]
)
df_hist3.show(n=20, truncate=True)

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import datediff, floor
from pyspark.sql.functions import lit

In [13]:
# Calculate the job duration in whole calendar months.
# The previous days/7/4 formula treated a month as 28 days, so a one-year job
# came out as 13 "months". months_between() uses real calendar months and, like
# datediff(), yields null when either date is null.
from pyspark.sql.functions import floor, months_between

df_hist4 = df_hist3.withColumn(
    "Duration",
    floor(months_between(df_hist3["Cnd_end_date"], df_hist3["Cnd_start_date"])),
)
df_hist4.show()

In [14]:
# Narrow the history frame to the two columns carried into clustering
df_hist4cluster = df_hist4.select(['Candidate_job_title', 'Duration'])
df_hist4cluster.show(n=20, truncate=True)

In [15]:
# Narrow the skill frame to the skill name and its level
df_skillCluster = candidate_skilllevel.select(['Cand_skill_name', 'Cand_level'])

In [16]:
# Preview the selected skill columns
df_skillCluster.show(n=20, truncate=True)

In [17]:
# Drop rows whose skill name is null or missing
df_skillCluster_clean = df_skillCluster.na.drop(subset=["Cand_skill_name"])
df_skillCluster_clean.show(n=20, truncate=True)

In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

In [19]:
# Tokenizer that splits the job title into a "words" array column
tokenizer = Tokenizer().setInputCol("Candidate_job_title").setOutputCol("words")

In [20]:
# Tokenizer that splits the skill name into a "words" array column
tokenizer2 = Tokenizer().setInputCol("Cand_skill_name").setOutputCol("words")

In [21]:
# Apply the job-title tokenizer to the job history frame
df_tokenized = tokenizer.transform(dataset=df_hist4cluster)
df_tokenized.show(n=20, truncate=True)

In [22]:
# Apply the skill-name tokenizer to the cleaned skill frame
df_tokenized_skill = tokenizer2.transform(dataset=df_skillCluster_clean)
df_tokenized_skill.show(n=20, truncate=True)

In [23]:
# Embed the tokenized job titles as 100-dimensional Word2Vec vectors.
# minCount=0 keeps every token in the vocabulary, however rare.
word2Vec = (
    Word2Vec()
    .setVectorSize(100)
    .setMinCount(0)
    .setInputCol("words")
    .setOutputCol("features")
)
model = word2Vec.fit(df_tokenized)
df_vectorized = model.transform(df_tokenized)

In [24]:
# Keep only skill rows whose token array exists and holds at least one token
from pyspark.sql.functions import col, size

tokenizedSkill_clean = df_tokenized_skill.where(
    (size(col("words")) > 0) & col("words").isNotNull()
)

In [25]:
# Fit the Word2Vec embedder on the skill tokens and vectorize them.
# Uses tokenizedSkill_clean (rows with a non-null, non-empty token array), which
# the previous cell builds for this purpose; fitting on the unfiltered
# df_tokenized_skill left that filter with no effect.
model_skill = word2Vec.fit(tokenizedSkill_clean)
skill_vectorized = model_skill.transform(tokenizedSkill_clean)

In [26]:
# Preview the vectorized job-history frame
df_vectorized.show(n=20, truncate=True)

In [27]:
# Preview the vectorized skill frame
skill_vectorized.show(n=20, truncate=True)

In [28]:
# Duration plus embedding only, for inspection
df_vectorized2 = df_vectorized.select(['Duration', 'features'])
df_vectorized2.show(n=20, truncate=True)

In [29]:
# Skill name plus embedding; this frame feeds the skill K-Means fits below
skill_vectorized2 = skill_vectorized.select(['Cand_skill_name', 'features'])
skill_vectorized2.show(n=20, truncate=True)

In [30]:
# Drop rows with a null Duration.
# Built from df_vectorized (not df_vectorized2) so Candidate_job_title stays
# available when the clusters are inspected later.
df_vectorized3 = df_vectorized.na.drop(subset=["Duration"])
df_vectorized3.show(n=20, truncate=True)

Determine the value of k using the elbow method

In [31]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt

In [32]:
# Candidate k values (2..20 inclusive) and an empty cost accumulator for the
# skills elbow search
k_range_skill, wssse_list_skill = range(2, 21), list()

In [33]:
# Candidate k values (2..20 inclusive) and an empty cost accumulator for the
# job-history elbow search
k_range, wssse_list = range(2, 21), list()

In [34]:
# Elbow search on the job-history embeddings: fit one K-Means model per k and
# record its training cost (within-set sum of squared errors)
for k in k_range:
    kmeans = KMeans(featuresCol="features", k=k, seed=1)
    km_model = kmeans.fit(df_vectorized3)
    wssse = km_model.summary.trainingCost
    wssse_list.append(wssse)
    print(f"K={k}, WSSSE={wssse}")

In [35]:
# Elbow search on the skill embeddings: fit one K-Means model per k and record
# its training cost (within-set sum of squared errors).
# Each iteration fits the estimator built for that k (kmeans_skill) and reads the
# cost from the model it produced (km_model_skill). Previously the job-history
# estimator and model were used, so every k reported the same unrelated cost.
wssse_list_skill.clear()  # keeps the cell idempotent when it is re-run
for k in k_range_skill:
    kmeans_skill = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    km_model_skill = kmeans_skill.fit(skill_vectorized2)
    wssse_skill = km_model_skill.summary.trainingCost
    wssse_list_skill.append(wssse_skill)
    print(f"K={k}, WSSSE={wssse_skill}")

In [36]:
# Elbow plot for the job-history data: training cost against k, used to pick
# the optimum value of k
plt.plot(k_range, wssse_list, color='b', marker='x', linestyle='-')
plt.title('Elbow Method For Optimal k')
plt.xlabel('k')
plt.ylabel('WSSSE')
plt.show()

In [37]:
# Elbow plot for the skills data: training cost against k
plt.plot(k_range_skill, wssse_list_skill, color='b', marker='x', linestyle='-')
plt.title('Elbow Method For Optimal k')
plt.xlabel('k')
plt.ylabel('WSSSE')
plt.show()

In [38]:
# Cluster the job-history embeddings with the k chosen from the elbow plot.
# The configured estimator must be assigned: previously the KMeans(k=8) object
# was discarded and the stale `kmeans` left over from the elbow loop was fitted.
kmeans = KMeans().setK(8).setSeed(1)
km_model = kmeans.fit(df_vectorized3)
df_clusters = km_model.transform(df_vectorized3)

In [39]:
# Cluster the skill embeddings with the k chosen from the elbow plot.
# Fixes two slips: the KMeans(k=8) estimator was never assigned, and the skill
# data was labelled with the job-history model (km_model) rather than the model
# fitted on the skill data.
kmeans_skill = KMeans().setK(8).setSeed(1)
km_model_skill = kmeans_skill.fit(skill_vectorized2)
df_clusters_skill = km_model_skill.transform(skill_vectorized2)

In [40]:
# Second job-history clustering: k=8 with k-means|| initialisation and seed 100
kmeans = KMeans(k=8, initMode='k-means||', seed=100)
km_model = kmeans.fit(df_vectorized3)
df_clusters2 = km_model.transform(df_vectorized3)

In [41]:
# Skill name next to its assigned cluster id
df_clusters_skill2 = df_clusters_skill.select(["Cand_skill_name", "prediction"])
df_clusters_skill2.show(n=20)

In [42]:
# Job title next to its assigned cluster id
df_clusters3 = df_clusters2.select(["Candidate_job_title", "prediction"])
df_clusters3.show(n=20)

**Understanding the groups**

In [43]:
# Cluster id (kept as a string) of the job-title group to inspect below
prediction = "1"

In [44]:
# Cluster id (kept as a string) of the skill group to inspect below.
# NOTE(review): K-Means cluster ids run from 0 to k-1, so with k=8 the id 10
# would match no rows -- confirm this value against the k actually fitted.
skill_prediction = "10"

In [45]:
# Show the job titles assigned to the selected cluster
df_categories = df_clusters3.where(df_clusters3["prediction"] == prediction)
df_categories.show(n=100)

In [46]:
# Show the skill names assigned to the selected skill cluster
df_skill_categories = df_clusters_skill2.where(
    df_clusters_skill2["prediction"] == skill_prediction
)
df_skill_categories.show(n=100)

In [47]:
from pyspark.ml.feature import PCA
from pyspark.sql.functions import col
import matplotlib.pyplot as plt

In [48]:
# Project the embeddings onto their first two principal components so the
# clusters can be drawn in 2-D
pca = PCA().setK(2).setInputCol("features").setOutputCol("pca_features")
pca_model = pca.fit(df_clusters)
pca_result = pca_model.transform(df_clusters)

In [49]:
# Bring the PCA output into pandas and split each 2-element vector into
# separate x / y columns for plotting
pandas_df = pca_result.select("pca_features", "prediction").toPandas()
pandas_df['pca_x'] = pandas_df['pca_features'].map(lambda vec: vec[0])
pandas_df['pca_y'] = pandas_df['pca_features'].map(lambda vec: vec[1])

In [50]:
# Scatter plot of the clusters in PCA space, coloured by assigned cluster id.
# pca_x holds the first principal component and pca_y the second, so the axes
# are labelled to match (the x axis used to read "Component 2" and the y axis
# had no label).
plt.figure(figsize=(10,6))
scatter = plt.scatter(pandas_df['pca_x'], pandas_df['pca_y'], c=pandas_df['prediction'], cmap='viridis')
plt.title("Clusters after PCA")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter)
plt.show()