## Social Media User Behavior Clustering

### Project by: Ayush Sain

### Introduction
Unsupervised learning helps uncover hidden patterns in data without predefined labels. In this project, we use K-Means clustering to segment social media users based on their digital behavior.

### Objective
Identify distinct user groups from survey data and profile them (e.g., Heavy Users, Casual Browsers, Premium Subscribers, Aware but Distracted).

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Load dataset
# Read the survey export, discard pandas' auto-generated "Unnamed: N"
# columns (typically a saved index) and trim whitespace from the headers.
filename = "HybridDataset.csv"
df = pd.read_csv(filename)
is_index_artifact = df.columns.str.startswith('Unnamed')
df = df.loc[:, ~is_index_artifact]
df = df.rename(columns=str.strip)
print("Shape:", df.shape)
print(df.head())

In [None]:
# Quick data inspection
# Show the column names, then how many non-null answers each column holds.
column_names = df.columns.tolist()
non_null_counts = df.count()
print("Columns:", column_names)
print("\nSample non-null counts:")
print(non_null_counts)

In [None]:
# Preprocessing
# All derived columns are added to a deep copy so the raw frame `df`
# stays exactly as it was loaded.
working = df.copy(deep=True)
# Normalised answer text -> binary code used by yn_map.
_YES_NO_CODES = {
    'yes': 1, 'y': 1, 'true': 1, '1': 1,
    'no': 0, 'n': 0, 'false': 0, '0': 0,
}


def yn_map(x):
    """Map a yes/no style answer to 1 or 0.

    The value is converted to text, trimmed and lower-cased before lookup.
    Missing values and anything that is not a recognised yes/no token
    yield ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    token = str(x).strip().lower()
    return _YES_NO_CODES.get(token, np.nan)


# Yes/No conversion
# Iterate over a snapshot of the column names: both loops add new columns
# to `working`, and derived columns must not be processed again.
source_cols = list(working.columns)
for c in source_cols:
    # Strip as well as lower-case so detection agrees with yn_map, which
    # trims whitespace before matching (" Yes " counts as a yes here too).
    vals = working[c].dropna().astype(str).str.strip().str.lower()
    # A column is treated as yes/no when more than 15% of its non-null
    # answers are literally "yes" or "no"; other answers become NaN.
    if len(vals) > 0 and (vals.isin(['yes','no']).sum() / len(vals)) > 0.15:
        working[c + '_bin'] = working[c].apply(yn_map)


# Extract numeric values
# For every text column, pull the first unsigned number out of each answer
# (e.g. "3 hours" -> 3.0, "18-24" -> 18.0) into '<col>_num'.
for c in source_cols:
    col = working[c]
    # Accept both classic object columns and pandas' dedicated string dtype,
    # so text columns are not silently skipped when strings are not `object`.
    is_text = pd.api.types.is_object_dtype(col) or isinstance(col.dtype, pd.StringDtype)
    if not is_text:
        continue
    coerced = pd.to_numeric(col.astype(str).str.extract(r'(\d+\.?\d*)')[0], errors='coerce')
    # A column without any digit yields an all-NaN result and is skipped, so
    # no separate "contains a digit" pre-scan is needed.
    if coerced.notna().any():
        working[c + '_num'] = coerced
    

# Multi-select expansion
def split_and_clean(cell):
    """Split a multi-select answer into unique, normalised tokens.

    The cell is converted to text and split on any of ``; , / |``. Each
    piece is trimmed and lower-cased; empty pieces and the literal
    ``'nan'`` are dropped. Duplicates are removed while keeping the order
    of first appearance.

    Returns an empty list for missing values.
    """
    if pd.isna(cell):
        return []
    # The regex already splits on commas, so one pass is sufficient.
    tokens = (piece.strip().lower() for piece in re.split(r'[;,/|]', str(cell)))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(t for t in tokens if t and t != 'nan'))


# Text (non-numeric) columns. Every derived column created during
# preprocessing is numeric, so this list contains original survey answers
# only and is the right domain for the two expansions below.
text_cols = [
    c for c in working.columns
    if pd.api.types.is_object_dtype(working[c]) or isinstance(working[c].dtype, pd.StringDtype)
]

# Multi-select columns are picked by name. Restricting the search to text
# columns keeps derived numeric columns whose names also contain
# 'platform'/'activity' (e.g. '<col>_bin', '<col>_num') from being expanded.
multiselect_cols = [c for c in text_cols if 'platform' in c.lower() or 'activity' in c.lower()]
for c in multiselect_cols:
    # Parse each answer once and reuse the token lists for the count, the
    # frequency table and every indicator column.
    parsed = working[c].apply(split_and_clean)
    working[c + '_count'] = parsed.apply(len)
    # explode() flattens the per-row lists in linear time; empty lists become
    # NaN, which value_counts() ignores.
    top_vals = parsed.explode().value_counts().head(8).index.tolist()
    for val in top_vals:
        safe = re.sub(r'[^0-9a-z]+', '_', val)[:30]
        colname = f"{c}_has_{safe}"
        # Bind `val` as a default so the lambda does not depend on the loop variable.
        working[colname] = parsed.apply(lambda items, val=val: 1 if val in items else 0)


# One-hot encoding for small categorical questions
# NOTE(review): yes/no and multi-select columns also qualify here, so those
# questions end up represented by several correlated features. Confirm that
# this extra weight in the clustering is intended.
special_exclude = ['description','title','director','cast','country']
single_choice_cols = [
    c for c in text_cols
    if working[c].nunique() < 30 and c.lower() not in special_exclude
]
for c in single_choice_cols:
    # Compare on trimmed text; missing answers stringify to a value that
    # never equals a real category label.
    answers = working[c].astype(str).str.strip()
    top = working[c].value_counts().head(6).index.tolist()
    for val in top:
        label = str(val).strip()
        colname = f"{c}_is_" + re.sub(r'[^0-9a-z]+', '_', label.lower())[:30]
        working[colname] = (answers == label).astype('int64')


# Build feature list
# Select features by dtype only. All derived columns ('_num', '_count',
# '_bin', '_has_', '_is_') are numeric, whereas matching those markers as
# substrings could also pick up an original text column (for example one
# named 'what_is_your_age') and break median imputation and scaling.
num_cols = [c for c in working.columns if working[c].dtype in [np.float64, np.int64]]
features = sorted(set(num_cols))


feat_df = working[features].copy()
# Keep respondents that answered at least half of the features.
feat_df = feat_df.dropna(thresh=max(1, int(0.5 * len(features))))
# A column that is entirely NaN has no median, so it would stay NaN after
# imputation; drop such columns instead.
feat_df = feat_df.dropna(axis=1, how='all')
feat_df = feat_df.fillna(feat_df.median())
features = feat_df.columns.tolist()
if feat_df.empty:
    raise ValueError("No usable rows/features left after preprocessing; check the input data.")
print("Selected features:", len(features))

In [None]:
# Scaling & PCA
# Standardise every feature to zero mean and unit variance so columns with
# large raw ranges do not dominate the analysis.
scaler = StandardScaler()
X = scaler.fit_transform(feat_df)


# Two-component projection of the scaled data.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)
print("Explained variance by 2 components:", pca.explained_variance_ratio_.sum())


# Cumulative explained variance of a full PCA. The x values start at 1 so
# each point lines up with the number of components it represents (a bare
# plot(y) would start the axis at 0 and shift the curve by one component).
cumulative_variance = np.cumsum(PCA().fit(X).explained_variance_ratio_)
plt.figure(figsize=(6,3))
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('n components')
plt.ylabel('cumulative explained variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Clustering & Model Tuning
# Fit K-Means for each candidate k (2 through 8) and record the inertia and
# the silhouette score of every fit.
ks = list(range(2,9))
inertias = []
silhouettes = []
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labs = km.fit_predict(X)
    inertias.append(km.inertia_)
    # The silhouette is only computed when more than one cluster is populated.
    if len(set(labs)) > 1:
        score = silhouette_score(X, labs)
    else:
        score = np.nan
    silhouettes.append(score)


tuning_table = pd.DataFrame({'k': ks, 'inertia': inertias, 'silhouette': silhouettes})
print(tuning_table)


def _plot_metric_vs_k(values, title, ylabel):
    """Draw one model-selection metric against the candidate k values."""
    plt.figure(figsize=(6,3))
    plt.plot(ks, values, marker='o')
    plt.title(title)
    plt.xlabel('k')
    plt.ylabel(ylabel)
    plt.show()


_plot_metric_vs_k(inertias, 'Elbow plot', 'inertia')
_plot_metric_vs_k(silhouettes, 'Silhouette Score', 'score')

In [None]:
# Final Model & Visualization
# Use the k with the highest silhouette score (NaN entries are ignored).
best_index = int(np.nanargmax(silhouettes))
best_k = ks[best_index]
print("Chosen k:", best_k)
final_km = KMeans(n_clusters=best_k, random_state=42, n_init=20)
labels = final_km.fit_predict(X)


# Scatter the respondents in the 2-D PCA plane, one colour per cluster, and
# overlay the cluster centres projected into the same plane.
fig, ax = plt.subplots(figsize=(7,6))
for cluster_id in np.unique(labels):
    members = X_pca[labels == cluster_id]
    ax.scatter(members[:,0], members[:,1], s=30, alpha=0.7, label=f'Cluster {cluster_id}')
centers_pca = pca.transform(final_km.cluster_centers_)
ax.scatter(centers_pca[:,0], centers_pca[:,1], s=200, c='black', marker='X')
ax.legend()
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.set_title(f'Clusters (k={best_k})')
plt.show()

In [None]:
# Cluster Profiling
# Attach the cluster label and the 2-D PCA coordinates to each respondent.
# NOTE: this adds columns to feat_df in place, so re-running the scaling
# step afterwards would treat 'cluster', 'pca1' and 'pca2' as features.
feat_df['cluster'] = labels
feat_df['pca1'] = X_pca[:,0]
feat_df['pca2'] = X_pca[:,1]


cluster_counts = feat_df['cluster'].value_counts().sort_index()
print("Cluster sizes:\n", cluster_counts)


cluster_means = feat_df.groupby('cluster').mean()
print(cluster_means.head())


# Rank features per cluster by how far the cluster mean sits from the
# overall mean, measured in standard deviations. Raw differences would let
# features with large units outrank binary indicators regardless of how
# distinctive they are, and the clustering itself ran on standardised data.
helper_cols = ('cluster', 'pca1', 'pca2')
profile_cols = [c for c in feat_df.columns if c not in helper_cols]
global_mean = feat_df[profile_cols].mean()
# Constant features have zero spread and cannot distinguish clusters; turn
# their std into NaN so they drop out instead of dividing by zero.
global_std = feat_df[profile_cols].std(ddof=0).replace(0, np.nan)
top_features = {}
for c in sorted(feat_df['cluster'].unique()):
    mean_c = cluster_means.loc[c, profile_cols]
    diff = ((mean_c - global_mean) / global_std).abs().dropna()
    # Plain int keys print cleanly and still match NumPy integer labels.
    top_features[int(c)] = diff.sort_values(ascending=False).head(8).index.tolist()
print("Top features per cluster:", top_features)

In [None]:
# Save results
# Write the per-respondent table (without its index) and the per-cluster
# mean profile (indexed by cluster label) to CSV files in the working directory.
clustered_path = 'hybrid_dataset_with_clusters.csv'
profile_path = 'cluster_profile_means.csv'
feat_df.to_csv(clustered_path, index=False)
cluster_means.to_csv(profile_path)
print("Saved outputs: hybrid_dataset_with_clusters.csv, cluster_profile_means.csv")

### Conclusion

This project grouped survey respondents into distinct social-media usage clusters. These insights can support:

- Marketing campaigns (cluster-specific targeting).

- Product development (features for different user types).

- Digital wellbeing programs.