**Social Media Addiction**


In [None]:
# import data manipulation libraries
import pandas as pd
import numpy as np

# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


In [None]:
# data ingestion

def data_ingestion(path=r'C:\SocialMediaAddiction_Analysis\data\raw\Social_Media_Addiction.csv'):
    """Load the raw social-media-addiction dataset into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts (path string, path-like
            object or open text buffer). Defaults to the project's raw CSV
            so existing zero-argument calls keep working, while other
            machines or tests can point the loader elsewhere.

    Returns:
        pandas.DataFrame with the CSV contents, parsed with the default
        ``read_csv`` settings.
    """
    return pd.read_csv(path)

In [None]:
# Load the raw dataset and preview its first rows.
df = data_ingestion()
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
df.shape

In [None]:
# Print the column names, non-null counts and dtypes.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Show pandas' default descriptive statistics for the dataset.
df.describe()

In [None]:
# Repeats the earlier df.info() summary; df has not been modified since that call.
df.info()

In [None]:
def addiction_lavel(score):
    """Translate a numeric addiction score into a coarse label.

    Scores of 3 or less map to "Low", scores above 3 and up to 6 map to
    "Medium", and every other value maps to "High".
    """
    # Guard clauses, checked from the lowest band upwards.
    if score <= 3:
        return "Low"
    if score <= 6:
        return "Medium"
    return "High"
    
# Bucket every Addicted_Score value into a "Low" / "Medium" / "High" label column.
df["addiction_lavel"] = df["Addicted_Score"].apply(addiction_lavel) 

In [None]:
from collections import OrderedDict
# Data exploration

# Names of all columns whose dtype is not 'object'.
numerical_col = df.select_dtypes(exclude='object').columns
def data_exploration(data=None):
    """Build a per-column summary of the numeric features.

    For every numeric column the report holds the minimum, maximum, mean,
    median, quartiles, IQR, standard deviation, skewness, kurtosis and a
    flag saying whether any value lies outside the 1.5 * IQR whiskers.

    Args:
        data: DataFrame to summarise. When omitted, the module-level ``df``
            is used so existing zero-argument calls keep working.

    Returns:
        pandas.DataFrame with one row per numeric column. If there are no
        numeric columns the report is empty but still has its headers.
    """
    frame = df if data is None else data
    # Only numeric dtypes support quantile/skew/kurt, so select them explicitly.
    numeric_columns = frame.select_dtypes(include="number").columns

    report_columns = [
        "Feature", "Minimum", "Maximum", "Mean", "Median", "25%", "75%",
        "IQR", "Standard Deviation", "Skewness", "Kurtosis", "Outlier Comment",
    ]

    stats = []
    for col in numeric_columns:
        series = frame[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # Tukey's whiskers: anything beyond 1.5 * IQR from the quartiles is an outlier.
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr
        has_outliers = ((series < lower_whisker) | (series > upper_whisker)).any()

        stats.append({
            "Feature": col,
            "Minimum": series.min(),
            "Maximum": series.max(),
            "Mean": series.mean(),
            "Median": series.median(),
            "25%": q1,
            "75%": q3,
            "IQR": iqr,
            "Standard Deviation": series.std(),
            "Skewness": series.skew(),
            "Kurtosis": series.kurt(),
            "Outlier Comment": "Has Outliers" if has_outliers else "No Outliers",
        })

    # Build the report once, after the loop, so an empty column list still
    # yields a (header-only) DataFrame instead of an unbound local.
    return pd.DataFrame(stats, columns=report_columns)

In [None]:
# Data preprocessing 
# categorical columns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Split the column names by dtype: 'object' columns versus everything else.
categorical_col = df.select_dtypes(include = 'object').columns
numerical_col = df.select_dtypes(exclude='object').columns
def data_preprocessor(numerical_col, categorical_col):
    """Build an unfitted ColumnTransformer for the mixed-type feature table.

    Args:
        numerical_col: Names of the numeric columns. They are median-imputed
            and then standardised.
        categorical_col: Names of the categorical columns. They are imputed
            with the most frequent value and then one-hot encoded;
            categories unseen during fit are ignored at transform time.

    Returns:
        sklearn.compose.ColumnTransformer that still has to be fitted.
    """
    numerical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Each entry is (name, transformer, columns): the transformer is the
    # pipeline and the selector is the list of column names.
    preprocessor = ColumnTransformer([
        ("numerical", numerical_pipeline, list(numerical_col)),
        ("categorical", categorical_pipeline, list(categorical_col))
    ])

    return preprocessor


In [None]:
# List the column names currently in df.
df.columns

In [None]:
# Binary target: 1 when the addiction level is "High" or "Medium", 0 otherwise.
df["Addicted"] = df['addiction_lavel'].apply(
    lambda x: 1 if x in ['High','Medium'] else 0)


In [None]:
# Display the DataFrame, including the columns added above.
df

In [None]:
# model building
def model_build(df):
    """Split, encode, scale and train the candidate classifiers.

    Args:
        df: DataFrame that contains the binary ``"Addicted"`` target column
            plus the feature columns.
            NOTE(review): assumes the features contain no missing values;
            nothing here imputes them - confirm against the dataset.

    Returns:
        Tuple ``(models, scaler, X_train, y_train, X_test, y_test)`` where
        ``models`` maps each model name to its fitted estimator, ``scaler``
        is the StandardScaler fitted on the training features, and the
        feature arrays are already scaled.
    """
    # "Addicted" is the target; "addiction_lavel" and "Addicted_Score" are the
    # columns it was derived from, so leaving them in X would leak the label.
    leakage_columns = ["Addicted", "Addiction Level", "addiction_lavel", "Addicted_Score"]
    X = df.drop(columns=leakage_columns, errors='ignore')
    y = df["Addicted"]

    # One-hot encode string columns; StandardScaler and the models need numbers.
    X = pd.get_dummies(X, dtype=float)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                         random_state=1,
                                                         test_size=0.2)

    # Fit the scaler on the training split only, then reuse its statistics
    # on the test split so no test information reaches training.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    models = {
        "Logistic Regression": LogisticRegression(),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": XGBClassifier()
    }

    model_train = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        model_train[model_name] = model

    # Return the whole dict of fitted models, not just the last model's name.
    return model_train, sc, X_train, y_train, X_test, y_test
    

In [None]:
# model evaluation

def model_evalution(models, X_train, y_train, X_test, y_test):
    """Fit every model and print its test-set metrics.

    Args:
        models: Mapping of model name to estimator.
        X_train, y_train: Training features and labels. Each estimator is
            (re)fitted on them, so unfitted estimators are accepted too.
        X_test, y_test: Held-out features and labels used for the metrics.

    Returns:
        The arguments unchanged, as the tuple
        ``(models, X_train, y_train, X_test, y_test)``.
    """
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Name the model first so each block of metrics can be attributed.
        print("Model:", model_name)
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("-" * 50)
    return models, X_train, y_train, X_test, y_test
        


In [None]:
def main():
    """Run the pipeline end to end: load, explore, label, train, evaluate."""

    # 1 Data Ingestion
    df = data_ingestion()

    # 2 Data Exploration
    # Called without arguments: it summarises the module-level `df`.
    report = data_exploration()
    print(report)

    # 3 Preprocessing: derive the label columns model_build() needs.
    # The freshly loaded frame does not have them yet.
    df["addiction_lavel"] = df["Addicted_Score"].apply(addiction_lavel)
    df["Addicted"] = df["addiction_lavel"].apply(
        lambda x: 1 if x in ['High', 'Medium'] else 0)

    # 4 Model Build
    # model_build() returns six values in this order.
    models, scaler, X_train, y_train, X_test, y_test = model_build(df)

    # 5 Model Evaluation
    # Argument order follows model_evalution(models, X_train, y_train, X_test, y_test).
    model_evalution(models, X_train, y_train, X_test, y_test)


if __name__ == "__main__":
    main()
