**Social Media Addiction**

In [None]:
# import manipulation libraries
import pandas as pd
import numpy as np

# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# -------------------- DATA INGESTION --------------------
def data_ingestion(path=r'C:\SocialMediaAddiction_Analysis\data\raw\Social_Media_Addiction.csv'):
    """Load the raw dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the original hard-coded
            location so existing zero-argument calls keep working; pass a
            different path to run on another machine or dataset.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by pandas).
    """
    return pd.read_csv(path)

In [None]:
# Load the dataset into a module-level frame and preview its first rows.
df = data_ingestion()
df.head()

In [None]:
# Drop the Student_ID and Relationship_Status columns from the module-level
# frame; errors="ignore" makes this a no-op for any column that is absent.
df = df.drop(columns=['Student_ID','Relationship_Status'], errors="ignore")


In [None]:
# -------------------- ADDICTION LEVEL --------------------
def addiction_lavel(score):
    """Bucket a numeric score into an addiction level label.

    Args:
        score: Numeric score to classify.

    Returns:
        str: "Low" when ``score <= 3``, "Medium" when ``score <= 6``,
        otherwise "High" (this includes values that fail both comparisons,
        such as NaN).
    """
    if score <= 3:
        return "Low"
    return "Medium" if score <= 6 else "High"

In [None]:
# -------------------- DATA EXPLORATION --------------------
def data_exploration(df):
    """Build summary reports for the numerical and categorical columns of ``df``.

    Columns with ``object`` dtype are treated as categorical; every other
    column is treated as numerical.

    Args:
        df: pandas.DataFrame to summarise.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]:
        ``(numerical_report, categorical_report)``, each with one row per
        column. The numerical report holds mean, median, quartiles, IQR,
        whiskers, outlier count, standard deviation, variance, skewness and
        kurtosis; the categorical report holds the number of unique values,
        the mode and the number of missing values.
    """
    numerical_col = df.select_dtypes(exclude='object').columns
    categorical_col = df.select_dtypes(include='object').columns

    # ---------- Numerical Analysis ----------
    # Quartiles and 1.5 * IQR whiskers are computed for every numerical
    # column in one pass; values outside the whiskers count as outliers.
    numeric_frame = df[numerical_col]
    Q1 = numeric_frame.quantile(0.25)
    Q3 = numeric_frame.quantile(0.75)
    IQR = Q3 - Q1
    LW = Q1 - 1.5 * IQR
    UW = Q3 + 1.5 * IQR
    outlier_mask = (numeric_frame < LW) | (numeric_frame > UW)

    numerical_stats = []
    for col in numerical_col:
        numerical_stats.append({
            "feature": col,
            "mean": df[col].mean(),
            "median": df[col].median(),
            "quartile_1": Q1[col],
            "quartile_3": Q3[col],
            "IQR": IQR[col],
            "lower_whisker": LW[col],
            "upper_whisker": UW[col],
            "outlier_count": outlier_mask[col].sum(),
            "std_dev": df[col].std(),
            "variance": df[col].var(),
            "skewness": df[col].skew(),
            "kurtosis": df[col].kurtosis(),
        })

    # ---------- Categorical Analysis ----------
    categorical_stats = []
    for col in categorical_col:
        # mode() is empty when the column has no non-null values.
        modes = df[col].mode()
        categorical_stats.append({
            "feature": col,
            "unique_values": df[col].nunique(),
            "mode": modes[0] if not modes.empty else None,
            "missing_values": df[col].isnull().sum(),
        })

    numerical_report = pd.DataFrame(numerical_stats)
    categorical_report = pd.DataFrame(categorical_stats)

    # Return both reports (previously the raw categorical column index was
    # returned and the categorical report was discarded).
    return numerical_report, categorical_report
        

    

In [None]:
# -------------------- PREPROCESSOR --------------------
def data_preprocessor(df):
    """Create an unfitted ColumnTransformer matched to the columns of ``df``.

    ``object``-dtype columns are imputed with their most frequent value and
    one-hot encoded (unknown categories are ignored at transform time);
    all other columns are imputed with their median and standardised.

    Args:
        df: pandas.DataFrame whose column dtypes decide the routing.

    Returns:
        ColumnTransformer: The unfitted preprocessing transformer.
    """
    object_columns = df.select_dtypes(include='object').columns
    other_columns = df.select_dtypes(exclude='object').columns

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(transformers=[
        ("num", Pipeline(steps=numeric_steps), other_columns),
        ("cat", Pipeline(steps=categorical_steps), object_columns),
    ])

In [None]:
# -------------------- MODEL BUILD --------------------
def model_build(df):
    """Split ``df``, preprocess the features and create the candidate models.

    Args:
        df: pandas.DataFrame containing the binary target column
            ``"Addicted"`` plus the feature columns.

    Returns:
        tuple: ``(models, X_train, X_test, y_train, y_test)`` where ``models``
        maps a display name to an unfitted classifier and the ``X_*`` values
        are the preprocessed feature matrices.

    Raises:
        KeyError: If ``df`` has no ``"Addicted"`` column.
    """
    # "Addicted_Score" must not be a feature: main() derives "Addicted"
    # directly from it (via "addiction_lavel"), so keeping it leaks the label
    # and every model would score near-perfectly for the wrong reason.
    X = df.drop(
        columns=["Addicted", "addiction_lavel", "Addicted_Score"],
        errors='ignore',
    )
    y = df["Addicted"]

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=0.2
    )

    # Preprocessing (handle categorical + scaling). The transformer is fitted
    # on the training split only so test-set statistics never influence it.
    preprocessor = data_preprocessor(X_train)

    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        # Seeded like the Random Forest so repeated runs are comparable.
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False)
    }

    return models, X_train, X_test, y_train, y_test

In [None]:
# -------------------- MODEL EVALUATION --------------------
def model_evalution(models, X_train, X_test, y_train, y_test):
    """Fit every model and print its test-set metrics.

    Each model is fitted in place on the training data, then accuracy, the
    classification report and the confusion matrix for its test-set
    predictions are printed.

    Args:
        models: Mapping of display name to estimator.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are written to standard output.
    """
    # Heading/metric pairs, printed in this order for every model.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    divider = "-" * 50

    for label, estimator in models.items():
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        print(f"\n===== {label} ===")
        for heading, metric in sections:
            print(heading, metric(y_test, predicted))
        print(divider)

In [None]:
import pickle


In [None]:
# -------------------- MAIN FUNCTION --------------------
def main():
    """Run the full workflow: ingest, clean, label, explore, build, evaluate."""

    # 1 Data Ingestion
    df = data_ingestion()

    # 2 Data Cleaning
    # The frame is re-ingested here, so the column drop done in the earlier
    # notebook cell has to be repeated; otherwise Student_ID and
    # Relationship_Status would be fed to the models as features.
    df = df.drop(columns=['Student_ID', 'Relationship_Status'], errors="ignore")

    # Create 'addiction_lavel' and 'Addicted' columns (moved from global scope)
    df["addiction_lavel"] = df["Addicted_Score"].apply(addiction_lavel)
    # Binary target: "Medium" and "High" count as addicted (1), "Low" as not (0).
    df["Addicted"] = df["addiction_lavel"].apply(lambda x: 1 if x in ["Medium", "High"] else 0)

    # 3 Data Exploration
    # data_exploration returns two objects; print them separately rather
    # than as one raw tuple.
    numerical_report, categorical_report = data_exploration(df)
    print(numerical_report)
    print(categorical_report)

    # 4 Model Build
    models, X_train, X_test, y_train, y_test = model_build(df)

    # 5 Model Evaluation
    model_evalution(models, X_train, X_test, y_train, y_test)


# Run the workflow only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()