In [4]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [6]:
# Load cleaned dataset
df = pd.read_csv("C:/Users/chadh/Desktop/Academic projects/BI-AI-Job-Market/data/cleaned/ai_job_market_cleaned.csv")


# Quick check
df.head()


Unnamed: 0,Job Title,Industry,Job Status,AI Impact Level,Median Salary (USD),Required Education,Experience Required (Years),Job Openings (2024),Projected Openings (2030),Remote Work Ratio (%),Automation Risk (%),Location,Gender Diversity (%),Net Job Change,Percent Job Change,AI Impact Score,Career Stability Index,Median Salary Norm,High-Opportunity Career Score
0,Investment Analyst,It,Increasing,Moderate,42109.76,Master’S Degree,5,1515,6342,55.96,28.28,Uk,44.63,4827,318.61,2,228.51,0.1,1.52
1,"Journalist, Newspaper",Manufacturing,Increasing,Moderate,132298.57,Master’S Degree,15,1243,6205,16.81,89.71,Usa,66.39,4962,399.2,2,41.08,0.85,1.88
2,Financial Planner,Finance,Increasing,Low,143279.19,Bachelor’S Degree,4,3338,1154,91.82,72.97,Canada,41.13,-2184,-65.43,1,-17.69,0.94,0.1
3,Legal Secretary,Healthcare,Increasing,High,97576.13,Associate Degree,15,7173,4060,1.89,99.94,Australia,65.76,-3113,-43.4,3,-0.03,0.56,-0.0
4,Aeronautical Engineer,It,Increasing,Low,60956.63,Master’S Degree,13,5944,7396,53.76,37.65,Germany,72.57,1452,24.43,1,15.23,0.26,0.36


In [7]:
features = [
    "Automation Risk (%)",
    "Percent Job Change",
    "Median Salary Norm",
    "Remote Work Ratio (%)",
    "AI Impact Score",
    "Career Stability Index"
]

X = df[features].copy()
X.describe()


Unnamed: 0,Automation Risk (%),Percent Job Change,Median Salary Norm,Remote Work Ratio (%),AI Impact Score,Career Stability Index
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,50.154229,141.035263,0.500998,49.836431,2.001733,69.870087
std,28.754889,556.652291,0.286817,28.966688,0.815651,329.14312
min,0.0,-98.94,0.0,0.0,1.0,-97.09
25%,25.4,-49.365,0.25,24.57,1.0,-18.39
50%,50.02,0.305,0.5,49.57,2.0,0.05
75%,75.03,97.5825,0.75,75.1,3.0,38.4325
max,99.99,8945.1,1.0,100.0,3.0,8199.08


In [8]:
# Fill missing values with column mean
X = X.fillna(X.mean())


In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
kmeans = KMeans(n_clusters=3, random_state=42)
df["JobCluster"] = kmeans.fit_predict(X_scaled)

df["JobCluster"].value_counts()


JobCluster
1    14874
0    14533
2      593
Name: count, dtype: int64

In [11]:
cluster_profile = (
    df.groupby("JobCluster")[features]
    .mean()
    .round(2)
)

cluster_profile


Unnamed: 0_level_0,Automation Risk (%),Percent Job Change,Median Salary Norm,Remote Work Ratio (%),AI Impact Score,Career Stability Index
JobCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,50.66,76.25,0.5,75.21,1.98,33.72
1,50.03,79.74,0.5,25.09,2.02,35.98
2,40.96,3266.16,0.51,48.77,1.98,1805.74


In [12]:
cluster_labels = {
    0: "Future-Proof & High Growth",
    1: "High Salary but High Risk",
    2: "Stable but Low Growth"
}

df["Career Cluster Label"] = df["JobCluster"].map(cluster_labels)

df[["Job Title", "Industry", "Career Cluster Label"]].head()


Unnamed: 0,Job Title,Industry,Career Cluster Label
0,Investment Analyst,It,Future-Proof & High Growth
1,"Journalist, Newspaper",Manufacturing,High Salary but High Risk
2,Financial Planner,Finance,Future-Proof & High Growth
3,Legal Secretary,Healthcare,High Salary but High Risk
4,Aeronautical Engineer,It,Future-Proof & High Growth


In [13]:
output_path = "../advanced_analysis/ai_job_market_with_clusters.csv"
df.to_csv(output_path, index=False)

print("File saved:", output_path)


File saved: ../advanced_analysis/ai_job_market_with_clusters.csv


In [14]:
df["Career Cluster Label"].value_counts()


Career Cluster Label
High Salary but High Risk     14874
Future-Proof & High Growth    14533
Stable but Low Growth           593
Name: count, dtype: int64