<a href="https://colab.research.google.com/github/2303A51758/2303A51758-b-11-PDS/blob/main/unsupervised_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Unsupervised Learning Pipeline for "Students Social Media Addiction" dataset
# This script:
# - Loads the dataset (from known extracted path, working dir, or via Colab upload)
# - Preprocesses numeric and categorical features
# - Runs clustering algorithms: KMeans, Agglomerative, DBSCAN
# - Evaluates clusters using silhouette and Davies-Bouldin scores
# - Runs dimensionality reduction (PCA + t-SNE) for visualization
# - Runs anomaly detection: IsolationForest
# - Plots results
#
# NOTE: This code is intended to be run in Colab/Jupyter. If the dataset is not found,
# it will prompt for upload (Colab). Adjust paths as needed.

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", context="notebook")

# ---- Load dataset ----
# Candidate locations are probed in order; the first one that exists wins.
possible_paths = [
    "/mnt/data/extracted_files/Students Social Media Addiction.csv",
    "Students Social Media Addiction.csv",
    "/mnt/data/Students Social Media Addiction.csv",
]

# `None` means none of the candidate files is present on disk.
csv_path = next(filter(os.path.exists, possible_paths), None)

if csv_path is None:
    # Fall back to an interactive upload (only available inside Google Colab).
    # Any failure here (no Colab, cancelled/empty upload, ...) is reported as a
    # missing dataset, with the original error kept as the cause.
    try:
        from google.colab import files
        uploaded = files.upload()
        csv_path = list(uploaded)[0]  # name of the first uploaded file
    except Exception as e:
        raise FileNotFoundError("Dataset not found. Place 'Students Social Media Addiction.csv' in working dir or provide path.") from e

df = pd.read_csv(csv_path)
print("Loaded dataset:", csv_path, " shape:", df.shape)
# Rich preview of the first rows (notebook-provided `display`).
display(df.head())

# ---- Select features for unsupervised analysis ----
# Identifier and raw-target columns are excluded from the feature frame;
# only the ones actually present in `df` are dropped.
drop_cols = [
    name
    for name in ('Student_ID', 'Addicted_Score', 'Addicted', 'Addicted_Label')
    if name in df.columns
]
X_df = df.drop(drop_cols, axis="columns")
print("\nUsing features:", list(X_df.columns))

# ---- Preprocessing ----
# Split the feature columns into two groups, one per transformer.
# - Numeric: every numeric dtype (not only int64/float64), so int32/float32 or
#   nullable integer/float columns are scaled as well.
# - Categorical: every remaining column (object, category, bool, string, ...).
# Taking the complement matters because ColumnTransformer defaults to
# remainder='drop': a column listed in neither group would be silently
# discarded from the feature matrix.
numeric_cols = X_df.select_dtypes(include='number').columns.tolist()
cat_cols = [col for col in X_df.columns if col not in numeric_cols]

print("Numeric cols:", numeric_cols)
print("Categorical cols:", cat_cols)

# Build transformer
# Numeric features are standardised; categorical features are one-hot encoded.
# handle_unknown='ignore' encodes categories unseen during fit as all-zeros
# instead of raising at transform time.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each transformer to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
])

Saving archive.zip to archive (2).zip
Loaded dataset: archive (2).zip  shape: (705, 13)


Unnamed: 0,Student_ID,Age,Gender,Academic_Level,Country,Avg_Daily_Usage_Hours,Most_Used_Platform,Affects_Academic_Performance,Sleep_Hours_Per_Night,Mental_Health_Score,Relationship_Status,Conflicts_Over_Social_Media,Addicted_Score
0,1,19,Female,Undergraduate,Bangladesh,5.2,Instagram,Yes,6.5,6,In Relationship,3,8
1,2,22,Male,Graduate,India,2.1,Twitter,No,7.5,8,Single,0,3
2,3,20,Female,Undergraduate,USA,6.0,TikTok,Yes,5.0,5,Complicated,4,9
3,4,18,Male,High School,UK,3.0,YouTube,No,7.0,7,Single,1,4
4,5,21,Male,Graduate,Canada,4.5,Facebook,Yes,6.0,6,In Relationship,2,7



Using features: ['Age', 'Gender', 'Academic_Level', 'Country', 'Avg_Daily_Usage_Hours', 'Most_Used_Platform', 'Affects_Academic_Performance', 'Sleep_Hours_Per_Night', 'Mental_Health_Score', 'Relationship_Status', 'Conflicts_Over_Social_Media']
Numeric cols: ['Age', 'Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night', 'Mental_Health_Score', 'Conflicts_Over_Social_Media']
Categorical cols: ['Gender', 'Academic_Level', 'Country', 'Most_Used_Platform', 'Affects_Academic_Performance', 'Relationship_Status']
