In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Section 1 - Load Dataset
# The file is read as tab-delimited text; the first rows are shown as a sanity check.
df = pd.read_csv("data-final.csv", delimiter="\t")
df.head()

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6.0,1.0,GB,51.5448,0.1991
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11.0,1.0,MY,3.1698,101.706
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7.0,1.0,GB,54.9119,-1.3833
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7.0,1.0,GB,51.75,-1.25
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,2016-03-03 02:02:57,1366.0,768.0,8.0,315.0,17.0,2.0,KE,1.0,38.0


In [None]:
# Section 2 - Extract IPIP 50 items
# NOTE(review): this cell is a placeholder — both statements below are literal
# Ellipsis (`...`) and compute nothing. The print reads `df_big5`, which within
# this notebook is first assigned in Section 3, so on a fresh kernel
# (Restart & Run All) this line presumably raises NameError unless `df_big5`
# is defined elsewhere. The recorded "(0, 5)" output looks like stale kernel
# state from an earlier attempt — TODO confirm, then remove or move this cell.
ipip_columns = ...
...
print("Final df_big5 shape:", df_big5.shape)


Final df_big5 shape: (0, 5)


In [None]:
# SECTION 3 – Clean dataset and compute Big Five scores (fixed version)

# Trait letter -> item-column prefix. Each trait has items <prefix>1 .. <prefix>10.
trait_prefixes = {"E": "EXT", "N": "EST", "A": "AGR", "C": "CSN", "O": "OPN"}

# 1) Extract exactly the 50 IPIP item columns: EXT1..EXT10, EST1..EST10, ...
#    A bare prefix match (col.startswith("EXT"), ...) returned 100 columns on this
#    file, i.e. 50 extra columns that merely share the prefixes (presumably the
#    per-item timing columns — TODO confirm against the codebook). Those extra
#    columns then took part in the zero->NaN replacement and in the
#    "at least 40 of 50 answered" filter, which is not what the filter intends.
ipip_columns = [
    f"{prefix}{i}"
    for prefix in trait_prefixes.values()
    for i in range(1, 11)
]
df_ipip = df[ipip_columns].copy()

print("Step 1 - original IPIP shape:", df_ipip.shape)

# 2) Convert everything to numeric (if any strings exist); unparsable -> NaN
df_ipip = df_ipip.apply(pd.to_numeric, errors="coerce")

# 3) Treat 0 as 'no answer' (missing)
df_ipip = df_ipip.replace(0, np.nan)

# 4) Drop rows that are almost completely empty
missing_per_row = df_ipip.isna().sum(axis=1)
print("Missing values per row (summary):")
print(missing_per_row.describe())

# keep rows that have at least 40 answered items out of 50
df_ipip = df_ipip[missing_per_row <= 10]
print("Step 4 - after keeping rows with at least 40/50 answers:", df_ipip.shape)

# 5) Drop rows that still have all NaN (safety)
df_ipip = df_ipip.dropna(how="all")
print("Step 5 - after dropping all-NaN rows:", df_ipip.shape)

# 6) Define reverse-scored items for IPIP-50
# NOTE(review): verify these lists against the dataset codebook before trusting
# the trait scores. Presumably the negatively keyed items are EXT 2/4/6/8/10,
# EST 2/4 (when the score is meant as Neuroticism), AGR 1/3/5/7, CSN 2/4/6/8 and
# OPN 2/4/6; the EST, AGR, CSN and OPN lists below differ from that — TODO confirm.
reverse_items = {
    "EXT": ["EXT2", "EXT4", "EXT6", "EXT8", "EXT10"],
    "EST": ["EST1", "EST3", "EST5", "EST6", "EST7"],
    "AGR": ["AGR2", "AGR4", "AGR6", "AGR8", "AGR10"],
    "CSN": ["CSN1", "CSN3", "CSN5", "CSN7", "CSN9"],
    "OPN": ["OPN2", "OPN4", "OPN6", "OPN8", "OPN10"]
}

df_scored = df_ipip.copy()

# 7) Apply reverse scoring in one vectorised step: new_value = 6 - old_value
#    (NaN stays NaN, so unanswered items remain missing).
reverse_cols = [
    item
    for items in reverse_items.values()
    for item in items
    if item in df_scored.columns
]
df_scored[reverse_cols] = 6 - df_scored[reverse_cols]

# 8) Compute the Big Five trait scores as mean of answered items
#    (mean(axis=1) skips NaN, so partially answered scales are still scored).
df_big5 = pd.DataFrame({
    trait: df_scored[[f"{prefix}{i}" for i in range(1, 11)]].mean(axis=1)
    for trait, prefix in trait_prefixes.items()
})


Step 1 - original IPIP shape: (738549, 100)
Missing values per row (summary):
count    738549.000000
mean          1.168375
std           8.224367
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         100.000000
dtype: float64
Step 4 - after keeping rows with at least 40/50 answers: (729800, 100)
Step 5 - after dropping all-NaN rows: (729800, 100)


In [None]:
# Quick look at the scored frame: dimensions, a preview, then descriptive stats.
print("Shape of df_big5 (rows, columns):", df_big5.shape)

for heading, table in (
    ("\nFirst 5 rows:", df_big5.head()),
    ("\nSummary statistics:", df_big5.describe()),
):
    print(heading)
    display(table)


Shape of df_big5 (rows, columns): (729800, 5)

First 5 rows:


Unnamed: 0,E,N,A,C,O
0,4.6,3.2,2.1,3.0,4.1
1,2.0,2.7,2.0,2.3,3.7
2,2.5,2.8,2.0,2.4,3.9
3,2.6,3.1,2.2,3.7,3.7
4,2.9,3.5,1.8,1.6,4.4



Summary statistics:


Unnamed: 0,E,N,A,C,O
count,729800.0,729800.0,729800.0,729800.0,729800.0
mean,2.965162,2.852228,2.396445,2.766259,3.638476
std,0.905436,0.397942,0.585912,0.655519,0.402882
min,1.0,1.0,1.0,1.0,1.0
25%,2.3,2.6,2.0,2.3,3.4
50%,3.0,2.8,2.3,2.8,3.7
75%,3.6,3.1,2.8,3.2,3.9
max,5.0,4.8,5.0,5.0,5.0


In [None]:
# SECTION 5 – PCA: Dimensionality Reduction from 5D to 2D

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1) Standardize the 5 trait scores
#    Fit on the underlying ndarray (as KMeans is in Section 6) rather than on the
#    DataFrame: a scaler fitted on a DataFrame records the column names and then
#    warns about missing feature names whenever transform() later receives a bare
#    array, which is exactly what the prediction function passes.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_big5[["E", "N", "A", "C", "O"]].values)

# 2) PCA to 2 components (fitted on the standardized scores)
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# 3) Add PCA coordinates to df_big5
df_big5["PCA1"] = X_pca[:, 0]
df_big5["PCA2"] = X_pca[:, 1]

print("Explained variance by each PCA component:")
print(pca.explained_variance_ratio_)

print("\nFirst 5 rows with PCA:")
display(df_big5.head())


Explained variance by each PCA component:
[0.29495088 0.21354054]

First 5 rows with PCA:


Unnamed: 0,E,N,A,C,O,PCA1,PCA2
0,4.6,3.2,2.1,3.0,4.1,-1.832078,0.68325
1,2.0,2.7,2.0,2.3,3.7,-0.042803,-0.514035
2,2.5,2.8,2.0,2.4,3.9,-0.554984,-0.231201
3,2.6,3.1,2.2,3.7,3.7,0.35213,0.263519
4,2.9,3.5,1.8,1.6,4.4,-2.260084,1.507826


In [None]:
# SECTION 6 – KMeans Clustering with a fixed number of clusters

from sklearn.cluster import KMeans

# Clustering runs on the raw (unscaled) trait scores, taken as a plain array.
X_full = df_big5.loc[:, ["E", "N", "A", "C", "O"]].values

# Number of clusters is fixed by hand in this notebook.
best_k = 7

kmeans_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
kmeans_final.fit(X_full)
# labels_ holds the training-set assignments produced by the fit above.
df_big5["Cluster"] = kmeans_final.labels_

print("Number of clusters:", best_k)
print("\nCluster sizes:")
print(df_big5["Cluster"].value_counts())


Number of clusters: 7

Cluster sizes:
Cluster
3    131143
4    113162
6    109172
5    108914
2    100421
1     93872
0     73116
Name: count, dtype: int64


In [None]:
# SECTION 7 – Cluster Centers (average Big Five profile per cluster)

# One row per fitted centroid, one column per trait (same column order as X_full).
centroids = kmeans_final.cluster_centers_
cluster_centers = pd.DataFrame(centroids, columns=["E", "N", "A", "C", "O"])
cluster_centers.index.name = "Cluster"

print("Cluster centers (mean trait scores):")
display(cluster_centers)


Cluster centers (mean trait scores):


Unnamed: 0_level_0,E,N,A,C,O
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.760846,2.865573,3.258344,3.092462,3.54021
1,4.178492,2.970859,1.986767,2.194666,3.777175
2,1.984973,2.778068,2.365236,2.246294,3.631079
3,3.093525,2.845478,2.058296,2.272879,3.694951
4,2.387235,2.727927,2.236804,3.401421,3.647044
5,3.124646,2.956585,2.978063,2.838533,3.458562
6,3.919734,2.842625,2.190063,3.378062,3.694912


In [None]:
# SECTION 8 – Rank-based personality labels (forces variety)

def assign_trait_words(values, low_word, mid_word, high_word):
    """
    Label each cluster as low / mid / high on one trait by its RANK among clusters.

    Parameters
    ----------
    values : pandas.Series
        Trait value per cluster, indexed by cluster id (e.g. cluster_centers['E']).
    low_word, mid_word, high_word : str
        Words assigned to the bottom, middle and top rank groups.

    Returns
    -------
    dict
        {cluster_id: word}, inserted in ascending order of trait value.
        The bottom and top groups each hold max(1, k // 3) clusters and the
        rest are "mid". A single cluster (k == 1) has no meaningful rank and
        gets mid_word; an empty Series yields an empty dict.
    """
    # sort clusters from low to high on this trait
    sorted_ids = values.sort_values().index.tolist()
    k = len(sorted_ids)

    # With one cluster the low and high groups would both claim it (the mid
    # group size becomes negative), so handle it explicitly as "mid".
    if k == 1:
        return {sorted_ids[0]: mid_word}

    # size of the low group and of the high group
    n_edge = max(1, k // 3)

    trait_label = {}
    for rank, cid in enumerate(sorted_ids):
        if rank < n_edge:
            trait_label[cid] = low_word
        elif rank >= k - n_edge:
            trait_label[cid] = high_word
        else:
            trait_label[cid] = mid_word

    return trait_label


# 1) Per-trait word lookups, driven by the RANK of each cluster center
#    (relative ordering between clusters, not absolute score thresholds)
E_words = assign_trait_words(cluster_centers["E"], "Introvert", "Ambivert", "Extrovert")
N_words = assign_trait_words(cluster_centers["N"], "Calm", "Balanced", "Sensitive")
O_words = assign_trait_words(cluster_centers["O"], "Traditional", "Open-minded", "Creative")

# (A_words and C_words could be produced the same way if needed)

# 2) Combine the three words into one label per cluster id
cluster_labels = {
    cid: f"{E_words[cid]}, {N_words[cid]}, {O_words[cid]}"
    for cid in cluster_centers.index
}

print("Cluster → Personality label mapping (rank-based):")
for cid, label in cluster_labels.items():
    print(f"Cluster {cid}: {label}")

# 3) Propagate the cluster-level label to every row via its cluster id
df_big5["Personality_Label"] = df_big5["Cluster"].map(cluster_labels)

print("\nSample rows with labels:")
preview_cols = ["E", "N", "A", "C", "O", "Cluster", "Personality_Label"]
display(df_big5[preview_cols].head())


Cluster → Personality label mapping (rank-based):
Cluster 0: Introvert, Balanced, Traditional
Cluster 1: Extrovert, Sensitive, Creative
Cluster 2: Introvert, Calm, Open-minded
Cluster 3: Ambivert, Balanced, Creative
Cluster 4: Ambivert, Calm, Open-minded
Cluster 5: Ambivert, Sensitive, Traditional
Cluster 6: Extrovert, Balanced, Open-minded

Sample rows with labels:


Unnamed: 0,E,N,A,C,O,Cluster,Personality_Label
0,4.6,3.2,2.1,3.0,4.1,6,"Extrovert, Balanced, Open-minded"
1,2.0,2.7,2.0,2.3,3.7,2,"Introvert, Calm, Open-minded"
2,2.5,2.8,2.0,2.4,3.9,3,"Ambivert, Balanced, Creative"
3,2.6,3.1,2.2,3.7,3.7,4,"Ambivert, Calm, Open-minded"
4,2.9,3.5,1.8,1.6,4.4,3,"Ambivert, Balanced, Creative"


In [None]:
# EXTRA SECTION – Per-user trait descriptions (more variety)

def describe_trait_value(series, low_word, mid_word, high_word):
    """
    Turn one trait's numeric scores into low / mid / high words.

    Thresholds are the 33rd and 67th percentiles of `series` itself.

    Parameters
    ----------
    series : pandas.Series
        One trait over all users (e.g. df_big5['E']).
    low_word, mid_word, high_word : str
        Word for values <= 33rd percentile, in between, and >= 67th percentile.

    Returns
    -------
    pandas.Series
        Words, with the same index and name as `series`. When both thresholds
        coincide, a value equal to them is "low" (the low test is checked
        first). NaN scores fail both comparisons and are labelled mid_word.
    """
    low_t = series.quantile(0.33)
    high_t = series.quantile(0.67)

    # Vectorised replacement for a per-row Python callback; np.select takes the
    # first matching condition, which preserves the low-before-high priority.
    words = np.select(
        [(series <= low_t).to_numpy(), (series >= high_t).to_numpy()],
        [low_word, high_word],
        default=mid_word,
    )
    return pd.Series(words, index=series.index, name=series.name)

# Per-user labels for each trait.
# (trait column, low word, mid word, high word); the order here is the order in
# which the *_word columns are created and joined into Trait_Combo.
trait_word_table = [
    ("E", "Introvert", "Ambivert", "Extrovert"),
    ("N", "Calm", "Balanced", "Sensitive"),
    ("O", "Traditional", "Open-minded", "Creative"),
    ("C", "Relaxed", "Balanced", "Organized"),
    # Optional: Agreeableness too
    ("A", "Reserved", "Warm", "Compassionate"),
]

for trait, low_word, mid_word, high_word in trait_word_table:
    df_big5[f"{trait}_word"] = describe_trait_value(df_big5[trait],
                                                    low_word=low_word,
                                                    mid_word=mid_word,
                                                    high_word=high_word)

# Combined per-user trait combo text.
# Every separator is " · " — the last one used to be " . ", which made this
# column disagree with the trait_combo string built by the prediction function.
df_big5["Trait_Combo"] = (
    df_big5["E_word"] + " · " +
    df_big5["N_word"] + " · " +
    df_big5["O_word"] + " · " +
    df_big5["C_word"] + " · " +
    df_big5["A_word"]
    )

print("Sample rows with rich trait combo:")
display(df_big5[["E", "N", "A", "C", "O",
                 "Cluster", "Personality_Label",
                 "E_word", "N_word", "O_word", "C_word",  "A_word",
                 "Trait_Combo"]].head())


Sample rows with rich trait combo:


Unnamed: 0,E,N,A,C,O,Cluster,Personality_Label,E_word,N_word,O_word,C_word,A_word,Trait_Combo
0,4.6,3.2,2.1,3.0,4.1,6,"Extrovert, Balanced, Open-minded",Extrovert,Sensitive,Creative,Balanced,Reserved,Extrovert · Sensitive · Creative · Balanced . ...
1,2.0,2.7,2.0,2.3,3.7,2,"Introvert, Calm, Open-minded",Introvert,Calm,Open-minded,Relaxed,Reserved,Introvert · Calm · Open-minded · Relaxed . Res...
2,2.5,2.8,2.0,2.4,3.9,3,"Ambivert, Balanced, Creative",Introvert,Balanced,Creative,Relaxed,Reserved,Introvert · Balanced · Creative · Relaxed . Re...
3,2.6,3.1,2.2,3.7,3.7,4,"Ambivert, Calm, Open-minded",Ambivert,Sensitive,Open-minded,Organized,Warm,Ambivert · Sensitive · Open-minded · Organized...
4,2.9,3.5,1.8,1.6,4.4,3,"Ambivert, Balanced, Creative",Ambivert,Sensitive,Creative,Relaxed,Reserved,Ambivert · Sensitive · Creative · Relaxed . Re...


In [None]:
# SECTION 9 – Save trained ML objects for later use (backend / deployment)

import joblib

# File name -> fitted object / lookup to persist; dumped in this order.
artifacts = {
    "scaler_big5.pkl": scaler,
    "pca_big5.pkl": pca,
    "kmeans_big5.pkl": kmeans_final,
    "cluster_labels_dict.pkl": cluster_labels,
}

for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("Saved: scaler_big5.pkl, pca_big5.pkl, kmeans_big5.pkl, cluster_labels_dict.pkl")


Saved: scaler_big5.pkl, pca_big5.pkl, kmeans_big5.pkl, cluster_labels_dict.pkl


In [None]:
# SECTION 10 – Prediction function (from traits to personality island info)

import numpy as np

def predict_personality_from_traits(E, N, A, C, O):
    """
    Map five trait scores to cluster, labels and PCA coordinates.

    Relies on the notebook-level objects `scaler`, `pca`, `kmeans_final`,
    `cluster_labels` and `df_big5` being defined.

    Parameters
    ----------
    E, N, A, C, O : float
        The five trait scores (1–5 each).

    Returns
    -------
    dict
        Keys: "E", "N", "A", "C", "O" (inputs as float), "cluster" (int),
        "archetype_label" (cluster-level label), "E_word", "N_word", "O_word",
        "C_word", "A_word" (per-trait words), "trait_combo" (the five words
        joined with " · "), "PCA1", "PCA2" (float coordinates).

    Raises
    ------
    KeyError
        If the predicted cluster id has no entry in `cluster_labels`.
    """
    scores = {"E": E, "N": N, "A": A, "C": C, "O": O}

    # 1) Put into a (1, 5) array in the trait order used for fitting
    x = np.array([[E, N, A, C, O]])

    # 2) Standardize with the trained scaler, then 3) project with the trained PCA
    x_scaled = scaler.transform(x)
    x_pca = pca.transform(x_scaled)
    pca1, pca2 = float(x_pca[0, 0]), float(x_pca[0, 1])

    # 4) Cluster prediction — on the UNSCALED scores, matching how KMeans was fit
    cluster = int(kmeans_final.predict(x)[0])

    # 5) Cluster-level label
    archetype_label = cluster_labels[cluster]

    # 6) Per-user trait words (same 33% / 67% quantile logic as the per-row labels).
    #    All thresholds come from a single quantile call instead of two full-column
    #    scans per trait on every prediction.
    thresholds = df_big5[list(scores)].quantile([0.33, 0.67])

    # trait -> (low word, mid word, high word)
    word_table = {
        "E": ("Introvert", "Ambivert", "Extrovert"),
        "N": ("Calm", "Balanced", "Sensitive"),
        "O": ("Traditional", "Open-minded", "Creative"),
        "C": ("Relaxed", "Balanced", "Organized"),
        "A": ("Reserved", "Warm", "Compassionate"),
    }

    def one_trait_word(trait):
        low_word, mid_word, high_word = word_table[trait]
        value = scores[trait]
        if value <= thresholds.loc[0.33, trait]:
            return low_word
        elif value >= thresholds.loc[0.67, trait]:
            return high_word
        else:
            return mid_word

    words = {trait: one_trait_word(trait) for trait in word_table}

    # word_table order (E, N, O, C, A) is the display order of the combo string
    trait_combo = " · ".join(words[trait] for trait in word_table)

    return {
        "E": float(E), "N": float(N), "A": float(A), "C": float(C), "O": float(O),
        "cluster": cluster,
        "archetype_label": archetype_label,
        "E_word": words["E"],
        "N_word": words["N"],
        "O_word": words["O"],
        "C_word": words["C"],
        "A_word": words["A"],
        "trait_combo": trait_combo,
        "PCA1": pca1,
        "PCA2": pca2,
    }

# Quick test: score one hand-picked profile and show the returned dict
test_result = predict_personality_from_traits(E=3.5, N=2.0, A=4.0, C=3.0, O=4.5)
test_result




{'E': 3.5,
 'N': 2.0,
 'A': 4.0,
 'C': 3.0,
 'O': 4.5,
 'cluster': 5,
 'archetype_label': 'Ambivert, Sensitive, Traditional',
 'E_word': 'Extrovert',
 'N_word': 'Calm',
 'O_word': 'Creative',
 'C_word': 'Balanced',
 'A_word': 'Compassionate',
 'trait_combo': 'Extrovert · Calm · Creative · Balanced · Compassionate',
 'PCA1': 0.8843873822622744,
 'PCA2': -0.35349478749213903}

In [None]:
import json

# Per-trait 33rd / 67th percentile cut-offs (rows: quantile level, columns: trait)
q = df_big5[["E", "N", "A", "C", "O"]].quantile([0.33, 0.67])

# Plain floats so the structure is JSON-serialisable
quantiles_dict = {
    trait: {
        "low": float(q.loc[0.33, trait]),
        "high": float(q.loc[0.67, trait]),
    }
    for trait in q.columns
}

print("Quantiles dictionary:")
print(quantiles_dict)

# Save to JSON file
with open("quantiles.json", "w") as f:
    json.dump(quantiles_dict, f, indent=4)

print("Saved quantiles.json")


Quantiles dictionary:
{'E': {'low': 2.5, 'high': 3.4}, 'N': {'low': 2.7, 'high': 3.0}, 'A': {'low': 2.1, 'high': 2.6}, 'C': {'low': 2.5, 'high': 3.1}, 'O': {'low': 3.5, 'high': 3.888888888888889}}
Saved quantiles.json
