In [73]:
# import manipulation lybraries
import pandas as pd
import numpy as np 
# import visualization lybraries
import seaborn as sns
import matplotlib.pyplot as plt

#import machine learning lybraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier


In [88]:
# data ingestion

def data_ingestion():
    df = pd.read_csv(r'C:\Mental_Health_Prediction_Model\data\raw\survey.csv')
    return df

In [75]:
# check null values
df.isnull().sum()

Age                           0
Gender                        0
self_employed                18
family_history                0
treatment                     0
work_interfere                0
no_employees                  0
remote_work                   0
tech_company                  0
benefits                      0
care_options                  0
wellness_program              0
seek_help                     0
anonymity                     0
leave                         0
mental_health_consequence     0
phys_health_consequence       0
coworkers                     0
supervisor                    0
mental_health_interview       0
phys_health_interview         0
mental_vs_physical            0
obs_consequence               0
dtype: int64

In [76]:
# check duplicate
df.duplicated().sum()

np.int64(4)

In [77]:
df.columns

Index(['Age', 'Gender', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [78]:
df.columns

Index(['Age', 'Gender', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [79]:
df.isnull().sum()

Age                           0
Gender                        0
self_employed                18
family_history                0
treatment                     0
work_interfere                0
no_employees                  0
remote_work                   0
tech_company                  0
benefits                      0
care_options                  0
wellness_program              0
seek_help                     0
anonymity                     0
leave                         0
mental_health_consequence     0
phys_health_consequence       0
coworkers                     0
supervisor                    0
mental_health_interview       0
phys_health_interview         0
mental_vs_physical            0
obs_consequence               0
dtype: int64

In [80]:
df['self_employed'].value_counts()

self_employed
No     1095
Yes     146
Name: count, dtype: int64

In [81]:
df['work_interfere'].fillna('N\A',inplace= True)

  df['work_interfere'].fillna('N\A',inplace= True)


In [82]:
from collections import OrderedDict
# data exploration

def data_exploration(df):

    # select numerical columns 

    numerical_col = df.select_dtypes(exclude='object').columns
    stats = []
    for i in numerical_col:
        Q1 = df[i].quantile(0.25)
        Q3 = df[i].quantile(0.75)
        IQR = Q3 - Q1
        LW = Q1 - 1.5 * IQR
        UW = Q3 + 1.5 * IQR
        outlier_flag = "Has Outliers" if df[(df[i] < LW) | (df[i] > UW)].shape[0] > 0 else "No Outliers"

        numerical_stats = OrderedDict({
            "Feature": i,
            "Minimum": df[i].min(),
            "Maximum": df[i].max(),
            "Mean": df[i].mean(),
            "Median": df[i].median(),
            "Mode": df[i].mode().iloc[0] if not df[i].mode().empty else np.nan,
            "25%": Q1,
            "75%": Q3,
            "IQR": IQR,
            "Standard Deviation": df[i].std(),
            "Skewness": df[i].skew(),
            "Kurtosis": df[i].kurt(),
            "Outlier Comment": outlier_flag
        })
        stats.append(numerical_stats)
        numerical_stats_report = pd.DataFrame(stats)
    return  numerical_stats_report   



In [83]:
# def data preprocessing 

def data_preprocessing(df):
    le = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = le.fit_transform(df[col])

    return df
 


In [84]:
# model building

def model_building(df):
    X = df.drop("treatment", axis=1)
    y = df["treatment"]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    models = {
        "Logistic Regression": LogisticRegression(),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": XGBClassifier()
    }

    model_train = {}

    for name, model in models.items():
        model.fit(X_train,y_train)
        model_train[name] = model
    return model_train, scaler, X_test, y_test

In [85]:
# model evalution

def model_evaluation(models, X_test, y_test):
    results = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        results[name] = accuracy

        print("Accuracy:", accuracy)
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("Classification Report:\n", classification_report(y_test, y_pred))


        best_model = max(results, key=results.get)
        print("Best Model:", best_model)
        print("Best Accuracy:", results[best_model])

    return best_model

In [86]:
import pickle
def save_model(models, best_model_name, scaler):

    best_model = models[best_model_name]

    pickle.dump(best_model, open("best_model.pkl", "wb"))
    pickle.dump(scaler, open("scaler.pkl", "wb"))

    print("\nBest Model Saved Successfully")

In [89]:
def main():
    df = data_ingestion() # Raw dataframe

    # Apply the column drops to prepare the dataframe for further processing
    data = df.copy()
    data.drop(columns=['Country','state','Timestamp','comments'], axis=1, inplace=True)

   
    numerical_stats_report = data_exploration(data)

    
    df_preprocessed = data_preprocessing(data)

    
    models, scaler, X_test, y_test = model_building(data)

    best_model_name = model_evaluation(models, X_test, y_test)

    save_model(models, best_model_name, scaler)

if __name__ == "__main__":
    main()

Accuracy: 0.7222222222222222
Confusion Matrix:
 [[94 35]
 [35 88]]
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.73      0.73       129
           1       0.72      0.72      0.72       123

    accuracy                           0.72       252
   macro avg       0.72      0.72      0.72       252
weighted avg       0.72      0.72      0.72       252

Best Model: Logistic Regression
Best Accuracy: 0.7222222222222222
Accuracy: 0.8015873015873016
Confusion Matrix:
 [[ 98  31]
 [ 19 104]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.76      0.80       129
           1       0.77      0.85      0.81       123

    accuracy                           0.80       252
   macro avg       0.80      0.80      0.80       252
weighted avg       0.80      0.80      0.80       252

Best Model: Random Forest
Best Accuracy: 0.8015873015873016
Accuracy: 0.8095238095238095
Confu