**Social Media Addiction**

In [None]:
# import manipulation libraries
import pandas as pd
import numpy as np

# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# -------------------- DATA INGESTION --------------------
def data_ingestion(path=r'C:\SocialMediaAddiction_Analysis\data\raw\Social_Media_Addiction.csv'):
    """Load the raw social-media-addiction dataset from a CSV source.

    Args:
        path: File path (or any file-like object accepted by
            ``pandas.read_csv``) of the CSV to load. Defaults to the
            project's original raw-data location so existing callers that
            pass no argument keep working.

    Returns:
        The parsed data as a ``pandas.DataFrame``.
    """
    return pd.read_csv(path)


In [None]:
# Load the dataset into the notebook-level `df` that the following cells use;
# without this assignment `df` is undefined at this point.
df = data_ingestion()
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the DataFrame (pandas' default `describe` output).
df.describe()

In [None]:
# Preview the first rows of the DataFrame again.
df.head()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# -------------------- ADDICTION LEVEL --------------------
def addiction_lavel(score):
    """Translate a numeric addiction score into a categorical level.

    Scores up to and including 3 are "Low", scores above 3 up to and
    including 6 are "Medium", and every other value is "High".
    """
    # Ordered (inclusive upper bound, label) pairs; the first match wins.
    for upper_bound, label in ((3, "Low"), (6, "Medium")):
        if score <= upper_bound:
            return label
    return "High"
    
# Label every row with the addiction level derived from its numeric score.
df["addiction_lavel"] = df["Addicted_Score"].map(addiction_lavel)

In [None]:
# Binary target: 1 for the "Medium" and "High" levels, 0 for anything else.
df["Addicted"] = df["addiction_lavel"].map(
    lambda level: int(level in ("Medium", "High"))
)

In [None]:
# Preview the first rows, including the derived label columns.
df.head()

In [None]:
# -------------------- DATA EXPLORATION --------------------
def data_exploration(df):
    """Build a table of descriptive statistics for every numeric column.

    For each numeric column the report holds the minimum, maximum, mean,
    median, inter-quartile range, standard deviation, skewness, kurtosis
    and a flag telling whether any value lies outside the fences
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.

    Args:
        df: DataFrame to summarise.

    Returns:
        A DataFrame with one row per numeric column of ``df``; it is empty
        when ``df`` has no numeric columns.
    """
    # Select genuinely numeric dtypes only. "Everything that is not object"
    # would also pick up bool, datetime and category columns, for which the
    # statistics below are not meaningful and may raise.
    numerical_col = df.select_dtypes(include="number").columns
    stats = []

    for col in numerical_col:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr

        # A boolean mask is enough here; no need to materialise the rows.
        has_outliers = ((values < lower_fence) | (values > upper_fence)).any()

        stats.append({
            "Feature": col,
            "Minimum": values.min(),
            "Maximum": values.max(),
            "Mean": values.mean(),
            "Median": values.median(),
            "IQR": iqr,
            "Std": values.std(),
            "Skewness": values.skew(),
            "Kurtosis": values.kurt(),
            "Outliers": "Has Outliers" if has_outliers else "No Outliers",
        })

    return pd.DataFrame(stats)

In [None]:
# -------------------- PREPROCESSOR --------------------
def data_preprocessor(df):
    """Create an unfitted ColumnTransformer matching the columns of ``df``.

    Columns of dtype ``object`` go through most-frequent imputation followed
    by one-hot encoding (unknown categories are ignored at transform time).
    All remaining columns go through median imputation followed by standard
    scaling.

    Args:
        df: DataFrame whose column dtypes decide the routing.

    Returns:
        The configured, not yet fitted, ``ColumnTransformer``.
    """
    object_columns = df.select_dtypes(include='object').columns
    other_columns = df.select_dtypes(exclude='object').columns

    # Non-object columns: fill gaps with the median, then standardise.
    impute_and_scale = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # Object columns: fill gaps with the mode, then one-hot encode.
    impute_and_encode = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer(transformers=[
        ("num", impute_and_scale, other_columns),
        ("cat", impute_and_encode, object_columns),
    ])

In [None]:
# -------------------- MODEL BUILD --------------------
def model_build(df):
    """Split the data, preprocess the features and create candidate models.

    Args:
        df: DataFrame containing the binary target column ``"Addicted"``
            together with the feature columns.

    Returns:
        A tuple ``(models, X_train, X_test, y_train, y_test)`` where
        ``models`` maps a display name to an unfitted classifier and the
        feature matrices are already transformed by the preprocessor.

    Raises:
        KeyError: If ``df`` has no ``"Addicted"`` column.
    """
    # The notebook derives "Addicted" from "addiction_lavel", which is itself
    # derived from "Addicted_Score". Leaving any of them among the features
    # leaks the target and makes every model look trivially perfect.
    # NOTE(review): identifier-like columns, if the dataset has any, are not
    # removed here - confirm against the actual schema.
    target_related = ["Addicted", "addiction_lavel", "Addicted_Score"]
    X = df.drop(columns=target_related, errors='ignore')
    y = df["Addicted"]

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=0.2
    )

    # Preprocessing (handle categorical + scaling). The preprocessor is fitted
    # on the training split only and then applied to the test split.
    preprocessor = data_preprocessor(X_train)

    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        # Seeded like the Random Forest so repeated runs are comparable.
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False)
    }

    return models, X_train, X_test, y_train, y_test

In [None]:
# -------------------- MODEL EVALUATION --------------------
def model_evalution(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and print its test-set metrics.

    For each entry of ``models`` the estimator is fitted in place, used to
    predict ``X_test``, and its accuracy, classification report and
    confusion matrix are printed.

    Args:
        models: Mapping of display name to an estimator exposing ``fit`` and
            ``predict``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are written to standard output.
    """
    for model_name, model in models.items():

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Balanced banner; the original ended in a stray "===%%".
        print(f"\n===== {model_name} =====")
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("-" * 50)

In [None]:
# -------------------- MAIN FUNCTION --------------------
def main():
    """Run the full workflow: load, label, explore, build and evaluate."""

    # 1 Data Ingestion
    df = data_ingestion()

    # 2 Target Engineering
    # The freshly loaded frame has no label columns yet; model_build needs
    # "Addicted", so derive it here exactly as the notebook cells do.
    df["addiction_lavel"] = df["Addicted_Score"].apply(addiction_lavel)
    df["Addicted"] = df["addiction_lavel"].apply(
        lambda x: 1 if x in ["Medium", "High"] else 0
    )

    # 3 Data Exploration
    report = data_exploration(df)
    print(report)

    # 4 Model Build
    models, X_train, X_test, y_train, y_test = model_build(df)

    # 5 Model Evaluation
    model_evalution(models, X_train, X_test, y_train, y_test)


# Run the workflow only when executed directly, not when imported.
if __name__ == "__main__":
    main()