# **Machine Learning (ML) Methods**

**Unsupervised Learning**

**KMeans clustering**

In [8]:
# ! pip install gdown
# import gdown 

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
import warnings
# Blanket filter: with no category or module argument this silences every
# warning for the rest of the session, including the deprecation and
# future-behaviour warnings that libraries emit.
warnings.filterwarnings('ignore')


In [10]:

# # Google Drive file ID (from the link)
# file_id = "1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ"
# url = f"https://drive.google.com/uc?id={file_id}"

# # Download the file
# gdown.download(url, output="lightcast_job_postings.csv", quiet=False)

In [11]:
# Read the job-postings CSV (path is relative to the working directory)
# into the frame that the following cells operate on.
DATA_FILE = "lightcast_job_postings.csv"
df = pd.read_csv(DATA_FILE)

In [12]:
# Normalise the job title: missing -> 'unknown', then trimmed and lower-cased.
df['TITLE_CLEAN'] = (
    df['TITLE_CLEAN']
    .fillna('unknown')
    .astype(str)
    .str.strip()
    .str.lower()
)

# Both skill columns get the same treatment: missing -> '', lower-cased.
for skills_col in ('SOFTWARE_SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME'):
    df[skills_col] = df[skills_col].fillna('').astype(str).str.lower()

# One space-separated document per posting, used as the TF-IDF input.
df['combined_text'] = df['TITLE_CLEAN'].str.cat(
    [df['SOFTWARE_SKILLS_NAME'], df['SPECIALIZED_SKILLS_NAME']],
    sep=' ',
)

In [13]:

# Text columns whose cardinality is reported below.
columns = ['TITLE_CLEAN', 'SOFTWARE_SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME']

# Count distinct non-null values for all columns in one pass, then print
# one line per column in the order listed above.
distinct_per_column = df[columns].nunique(dropna=True)
for col, unique_count in distinct_per_column.items():
    print(f"Unique values in '{col}': {unique_count}")

Unique values in 'TITLE_CLEAN': 27266
Unique values in 'SOFTWARE_SKILLS_NAME': 22456
Unique values in 'SPECIALIZED_SKILLS_NAME': 41462


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score

# Build a TF-IDF representation of the combined text, capped at 1000
# vocabulary terms with English stop words removed, and densify it.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_sparse = tfidf.fit_transform(df['combined_text'])
X_tfidf = tfidf_sparse.toarray()

# Standardise every TF-IDF column before clustering.
scaler = StandardScaler()
scaler.fit(X_tfidf)
X_scaled = scaler.transform(X_tfidf)

In [15]:
# Fit K-Means on the standardised TF-IDF features and store each posting's
# cluster id on the frame.
optimal_k = 3
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4, so leaving it implicit makes the cluster assignment depend
# on the installed version (and the related FutureWarning is hidden by the
# blanket warnings filter set at the top of the notebook).
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Evaluation: compare the clusters with an external reference labelling.
reference_label = 'NAICS_2022_6_NAME'
# Only rows that actually carry a reference label can be scored.
df_eval = df[[reference_label, 'cluster']].dropna()

ari = adjusted_rand_score(df_eval[reference_label], df_eval['cluster'])
nmi = normalized_mutual_info_score(df_eval[reference_label], df_eval['cluster'])
# sil_score = silhouette_score(X_scaled, df['cluster'])

print(f"Adjusted Rand Index ({reference_label}): {ari:.3f}")
print(f"Normalized Mutual Info Score ({reference_label}): {nmi:.3f}")
# print(f"Silhouette Score: {sil_score:.3f}") 

Adjusted Rand Index (NAICS_2022_6_NAME): 0.009
Normalized Mutual Info Score (NAICS_2022_6_NAME): 0.033


In [16]:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Score the cluster assignment against several external labelings and
# collect one record per labeling for the summary table.
reference_labels = ['NAICS_2022_6_NAME', 'SOC_2021_5_NAME', 'ONET_NAME']
results = []

for label in reference_labels:
    # Only rows that carry this reference label can be scored.
    df_eval = df[[label, 'cluster']].dropna()

    # ARI and NMI degenerate when the reference has fewer than two classes,
    # so flag that case explicitly instead of letting a bare 0.0 read as
    # "the clusters disagree with this labeling".
    n_classes = df_eval[label].nunique()
    if n_classes < 2:
        print(
            f"Note: '{label}' has {n_classes} distinct value(s) in the scored rows; "
            "ARI/NMI are not informative for it."
        )

    ari = adjusted_rand_score(df_eval[label], df_eval['cluster'])
    nmi = normalized_mutual_info_score(df_eval[label], df_eval['cluster'])
    results.append({'Reference Label': label, 'ARI': ari, 'NMI': nmi})

In [17]:
import pandas as pd
# Tabulate the per-label scores, rounded to four decimals, and display them.
results_df = pd.DataFrame(results).round(4)
results_df

Unnamed: 0,Reference Label,ARI,NMI
0,NAICS_2022_6_NAME,0.0092,0.0331
1,SOC_2021_5_NAME,0.0,0.0
2,ONET_NAME,0.0,0.0


In [18]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# Step 1: Project the standardised TF-IDF features onto two principal
# components. random_state pins the projection for the cases where
# scikit-learn picks a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Step 2: Create a DataFrame for plotting
pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
# Attach the labels by position rather than by index alignment, and as
# strings: cluster ids are categories, and a numeric colour column makes
# Plotly draw a continuous colour bar (leaving legend_title below with no
# legend to label).
pca_df['cluster'] = df['cluster'].astype(str).to_numpy()
# Keep legend entries in numeric cluster order.
cluster_order = sorted(pca_df['cluster'].unique(), key=int)

# Step 3: Plot using Plotly
fig = px.scatter(
    pca_df,
    x='PCA1',
    y='PCA2',
    color='cluster',
    category_orders={'cluster': cluster_order},
    title='PCA Projection of TF-IDF Clusters',
    labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2'},
    opacity=0.8,
    width=900,
    height=600
)

# Thin white outline so overlapping points stay distinguishable.
fig.update_traces(marker=dict(size=7, line=dict(width=0.5, color='white')))
fig.update_layout(legend_title='Cluster')
fig.show()

In [19]:

# Top terms per cluster
import numpy as np

# Vocabulary learned by the vectorizer, indexed like the centroid columns.
terms = tfidf.get_feature_names_out()
# For every centroid: feature indices ordered from largest to smallest weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Print the 15 highest-weighted terms of each cluster.
for cluster_id in range(optimal_k):
    leading = order_centroids[cluster_id, :15]
    top_terms = [terms[idx] for idx in leading]
    print(f"\n Cluster {cluster_id}:")
    print(", ".join(top_terms))


 Cluster 0:
pmi, apple, institute, ios, android, vmware, desktop, methodology, expectation, zachman, windows, infrastructure, capability, operating, subcontracting

 Cluster 1:
data, language, programming, sql, intelligence, python, tableau, analysis, dashboard, bi, power, statistics, visualization, analyst, analytics

 Cluster 2:
sap, enterprise, consultant, applications, oracle, functional, management, planning, cloud, architect, architecture, solution, design, erp, resource


In [20]:
import plotly.express as px
import pandas as pd

# Take the first 30 non-null titles of each cluster in row order
# (a head() slice, not a random sample).
title_samples = (
    df.groupby('cluster')['TITLE_CLEAN']
    .apply(lambda titles: titles.dropna().head(30))
    .reset_index()
)

# Plotly strip plot (aka dot plot)
fig = px.strip(
    title_samples,
    x='cluster',
    y='TITLE_CLEAN',
    stripmode='overlay',
    orientation='v',
    title="Sample Job Titles by Cluster",
    labels={"TITLE_CLEAN": "Job Title", "cluster": "Cluster"},
    color='cluster'
)

# px.strip draws its points with box traces, so the selector has to target
# type='box'; a type='scatter' selector matches no trace here and the
# jitter/marker settings would be silently dropped.
fig.update_traces(jitter=0.3, marker=dict(size=8, opacity=0.7), selector=dict(type='box'))
fig.update_layout(height=600, width=900, yaxis_tickfont_size=10)
fig.show()

In [21]:
# Clean and split software skills into lists
df['SOFTWARE_SKILLS_NAME'] = df['SOFTWARE_SKILLS_NAME'].fillna('').astype(str).str.lower()
df['SOFTWARE_SKILLS_LIST'] = df['SOFTWARE_SKILLS_NAME'].str.split(',')

# Explode the list into multiple rows (one row per posting/skill pair)
df_exploded = df.explode('SOFTWARE_SKILLS_LIST')

# Remove list punctuation (brackets, double quotes) and surrounding
# whitespace from every token.
df_exploded['SOFTWARE_SKILLS_LIST'] = (
    df_exploded['SOFTWARE_SKILLS_LIST']
    .str.replace(r'[\[\]\"]', '', regex=True)  # remove brackets and quotes
    .str.strip()
)

# Drop blanks only after cleaning: tokens made up solely of brackets or
# quotes (e.g. an empty list "[]") become '' at this point and would
# otherwise remain as empty skills. Filtering last also means no column is
# assigned on an already-filtered frame.
df_exploded = df_exploded[df_exploded['SOFTWARE_SKILLS_LIST'] != '']

In [22]:
import plotly.express as px

# Take the first 30 exploded skill rows of each cluster (a head() slice, not
# a random sample). GroupBy.head keeps the 'cluster' column without relying
# on DataFrameGroupBy.apply passing the grouping column to the callback,
# which pandas has deprecated. The stable sort restores the
# cluster-by-cluster row order that the plot's trace order depends on.
sampled = (
    df_exploded
    .groupby('cluster')
    .head(30)
    .sort_values('cluster', kind='stable')
    .reset_index(drop=True)
)

# Plot
fig = px.strip(
    sampled,
    x='cluster',
    y='SOFTWARE_SKILLS_LIST',
    color='cluster',
    # Colour and x are the same column, so overlay keeps each strip centred
    # on its tick (same setting as the job-title strip plot).
    stripmode='overlay',
    title="Software Skills by Cluster",
    labels={'SOFTWARE_SKILLS_LIST': "Software Skill", 'cluster': "Cluster"},
    orientation='v'
)

# px.strip draws its points with box traces, so the selector has to target
# type='box'; a type='scatter' selector matches no trace here and the
# jitter/marker settings would be silently dropped.
fig.update_traces(jitter=0.3, marker=dict(size=8, opacity=0.7), selector=dict(type='box'))
fig.update_layout(height=700, width=950, yaxis_tickfont_size=10)
fig.show()

In [23]:
# Mean of the SALARY column within each cluster.
avg_salaries = df.groupby('cluster')['SALARY'].mean()

# Report one formatted line per cluster, in index order.
for cluster_id in avg_salaries.index:
    mean_salary = avg_salaries.loc[cluster_id]
    print(f"Average salary in Cluster {cluster_id}: ${mean_salary:,.2f}")

Average salary in Cluster 0: $132,148.75
Average salary in Cluster 1: $105,813.00
Average salary in Cluster 2: $127,433.09


In [24]:
import plotly.express as px

# Salary distribution of each cluster as a box plot, one box per cluster.
fig = px.box(
    df,
    x='cluster',
    y='SALARY',
    color='cluster',
    title="Salary Distribution by Cluster",
    labels={'cluster': 'Cluster', 'SALARY': 'Salary'},
    width=1000,
    height=600,
    points='outliers'  # Show outliers only (better spacing)
)

# Improve layout and box width
fig.update_traces(
    marker=dict(size=5, opacity=0.7),  # outlier dots
    line=dict(width=2),                # box border width
    selector=dict(type='box')
)

# Update layout to center and balance plot
fig.update_layout(
    font=dict(size=13),
    title_font=dict(size=18),
    # Colour and x are the same column, so every x position holds a single
    # box. 'group' would still reserve a side-by-side slot per colour,
    # leaving each box narrow and shifted off its tick; 'overlay' draws it
    # centred at full width.
    boxmode='overlay',
    showlegend=False,
    margin=dict(l=60, r=30, t=60, b=60),
    xaxis=dict(title='Cluster', tickmode='linear'),
    yaxis=dict(title='Salary', zeroline=False, gridcolor='lightgray')
)

fig.show()
