In [93]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [95]:
# Load the dataset
df = pd.read_csv('../data/mental_health_india.csv')

print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
print(df.info())

print("\nMissing Values:")
print(df.isnull().sum())

print("\nTarget Variable Distribution:")
print(df['treatment'].value_counts())
print("\nTarget Variable Percentages:")
print(df['treatment'].value_counts(normalize=True) * 100)

Dataset Shape: (2774, 16)

First 5 rows:
        Timestamp  Gender Occupation self_employed family_history treatment  \
0  8/28/2014 9:18  Female  Corporate            No            Yes       Yes   
1  8/28/2014 9:18  Female    Student            No            Yes       Yes   
2  8/28/2014 9:18  Female    Student            No            Yes       Yes   
3  8/28/2014 9:18  Female   Business            No            Yes       Yes   
4  8/28/2014 9:18  Female    Student            No            Yes       Yes   

         Days_Indoors Growing_Stress Changes_Habits Mental_Health_History  \
0           1-14 days            Yes             No                   Yes   
1    Go out Every day             No            Yes                    No   
2  More than 2 months            Yes            Yes                    No   
3    Go out Every day            Yes          Maybe                    No   
4           1-14 days            Yes          Maybe                 Maybe   

  Mood_Swings Coping_

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Timestamp                2774 non-null   object
 1   Gender                   2774 non-null   object
 2   Occupation               2774 non-null   object
 3   self_employed            2774 non-null   object
 4   family_history           2774 non-null   object
 5   treatment                2774 non-null   object
 6   Days_Indoors             2774 non-null   object
 7   Growing_Stress           2774 non-null   object
 8   Changes_Habits           2774 non-null   object
 9   Mental_Health_History    2774 non-null   object
 10  Mood_Swings              2774 non-null   object
 11  Coping_Struggles         2774 non-null   object
 12  Work_Interest            2774 non-null   object
 13  Social_Weakness          2774 non-null   object
 14  mental_health_interview  2774 non-null  

In [97]:
df["treatment"].value_counts()

treatment
No     1950
Yes     824
Name: count, dtype: int64

In [98]:
# Detailed Exploratory Data Analysis
print("=== DETAILED DATA ANALYSIS ===")
print(f"Dataset shape: {df.shape}")
print(f"Number of features: {df.shape[1] - 1}")
print(f"Target variable: treatment")

print("\n=== COLUMN DESCRIPTIONS ===")
for col in df.columns:
    print(f"{col}: {df[col].dtype} - {df[col].nunique()} unique values")
    if df[col].dtype == 'object':
        print(f"    Values: {df[col].unique()[:10]}")  # Show first 10 unique values
    print()

print("\n=== TARGET VARIABLE ANALYSIS ===")
target_counts = df['treatment'].value_counts()
print("Treatment distribution:")
for val, count in target_counts.items():
    pct = (count / len(df)) * 100
    print(f"  {val}: {count} ({pct:.1f}%)")

print(f"\nClass balance ratio: {target_counts.min()}/{target_counts.max()} = {target_counts.min()/target_counts.max():.3f}")

# Check for data quality issues
print("\n=== DATA QUALITY ANALYSIS ===")
print("Missing values per column:")
missing_data = df.isnull().sum()
for col, missing in missing_data.items():
    if missing > 0:
        pct = (missing / len(df)) * 100
        print(f"  {col}: {missing} ({pct:.1f}%)")

if missing_data.sum() == 0:
    print("  No missing values found!")

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# Basic statistics for numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
if len(numerical_cols) > 0:
    print(f"\n=== NUMERICAL COLUMNS STATISTICS ===")
    print(df[numerical_cols].describe())

=== DETAILED DATA ANALYSIS ===
Dataset shape: (2774, 16)
Number of features: 15
Target variable: treatment

=== COLUMN DESCRIPTIONS ===
Timestamp: object - 7 unique values
    Values: ['8/28/2014 9:18' '8/27/2014 15:31' '8/28/2014 8:23' '8/28/2014 17:31'
 '8/29/2014 9:15' '8/30/2014 6:48' '9/4/2014 8:35']

Gender: object - 2 unique values
    Values: ['Female' 'Male']

Occupation: object - 5 unique values
    Values: ['Corporate' 'Student' 'Business' 'Housewife' 'Others']

self_employed: object - 2 unique values
    Values: ['No' 'Yes']

family_history: object - 2 unique values
    Values: ['Yes' 'No']

treatment: object - 2 unique values
    Values: ['Yes' 'No']

Days_Indoors: object - 5 unique values
    Values: ['1-14 days' 'Go out Every day' 'More than 2 months' '15-30 days'
 '31-60 days']

Growing_Stress: object - 3 unique values
    Values: ['Yes' 'No' 'Maybe']

Changes_Habits: object - 3 unique values
    Values: ['No' 'Yes' 'Maybe']

Mental_Health_History: object - 3 unique val

In [99]:
# 1. Handle missing values in self_employed column
print(f"Missing values in self_employed: {df['self_employed'].isnull().sum()}")
# Fill missing values with 'No' (most common case for corporate employees)
df['self_employed'].fillna('No', inplace=True)
print(f"After filling: {df['self_employed'].isnull().sum()}")

Missing values in self_employed: 0
After filling: 0


In [100]:
# 2. Remove duplicate rows
print(f"Duplicate rows before removal: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)
print(f"Dataset shape after removing duplicates: {df.shape}")

Duplicate rows before removal: 3
Dataset shape after removing duplicates: (2771, 16)


In [101]:
#3. Timestamp
df.drop(columns=['Timestamp'], inplace=True)

In [102]:
# 4. Create ordinal mappings for ordinal features
ordinal_mappings = {
    'Days_Indoors': {
        'Go out Every day': 0,
        '1-14 days': 1,
        '15-30 days': 2,
        '31-60 days': 3,
        'More than 2 months': 4
    },
    'Mood_Swings': {
        'Low': 0,
        'Medium': 1,
        'High': 2
    }
}

# Apply ordinal mappings
for col, mapping in ordinal_mappings.items():
    df[col] = df[col].map(mapping)

In [103]:
# 5. Create binary encodings for Yes/No/Maybe columns
binary_cols = ['Growing_Stress', 'Changes_Habits', 'Mental_Health_History', 
               'Work_Interest', 'Social_Weakness', 'mental_health_interview', 'care_options']

for col in binary_cols:
    # Create binary features
    df[f'{col}_Yes'] = (df[col] == 'Yes').astype(int)
    df[f'{col}_Maybe'] = (df[col] == 'Maybe').astype(int)
    # No need for _No as it's the reference category

In [104]:
df

Unnamed: 0,Gender,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,...,Mental_Health_History_Yes,Mental_Health_History_Maybe,Work_Interest_Yes,Work_Interest_Maybe,Social_Weakness_Yes,Social_Weakness_Maybe,mental_health_interview_Yes,mental_health_interview_Maybe,care_options_Yes,care_options_Maybe
0,Female,Corporate,No,Yes,Yes,1,Yes,No,Yes,1,...,1,0,0,0,1,0,0,0,0,0
1,Female,Student,No,Yes,Yes,0,No,Yes,No,1,...,0,0,0,1,0,0,0,0,0,0
2,Female,Student,No,Yes,Yes,4,Yes,Yes,No,1,...,0,0,0,1,0,0,0,0,0,0
3,Female,Business,No,Yes,Yes,0,Yes,Maybe,No,0,...,0,0,0,1,0,1,0,0,0,0
4,Female,Student,No,Yes,Yes,1,Yes,Maybe,Maybe,2,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,Male,Business,No,No,No,2,No,Maybe,No,0,...,0,0,0,0,0,1,0,1,0,0
2770,Male,Business,Yes,No,No,2,No,Maybe,No,0,...,0,0,0,0,0,1,0,0,0,0
2771,Male,Business,No,No,Yes,2,No,Maybe,No,0,...,0,0,0,0,0,1,0,0,0,0
2772,Male,Business,No,No,No,2,No,Maybe,No,0,...,0,0,0,0,0,1,0,1,0,0


In [105]:
# 7. Create interaction features
# Mental health related interactions
df['MH_Family_History'] = ((df['family_history'] == 'Yes')).astype(int)


df['Coping_Struggles'] = (
    (df['Coping_Struggles'] == 'Yes')).astype(int)

In [106]:
# 8. Drop original timestamp and redundant columns
cols_to_drop = ['Growing_Stress', 'Changes_Habits', 'Mental_Health_History',
               'Work_Interest', 'Social_Weakness', 'mental_health_interview', 'care_options','family_history']
df.drop(columns=cols_to_drop, inplace=True)

In [107]:
df

Unnamed: 0,Gender,Occupation,self_employed,treatment,Days_Indoors,Mood_Swings,Coping_Struggles,Growing_Stress_Yes,Growing_Stress_Maybe,Changes_Habits_Yes,...,Mental_Health_History_Maybe,Work_Interest_Yes,Work_Interest_Maybe,Social_Weakness_Yes,Social_Weakness_Maybe,mental_health_interview_Yes,mental_health_interview_Maybe,care_options_Yes,care_options_Maybe,MH_Family_History
0,Female,Corporate,No,Yes,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
1,Female,Student,No,Yes,0,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,Female,Student,No,Yes,4,1,1,1,0,1,...,0,0,1,0,0,0,0,0,0,1
3,Female,Business,No,Yes,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
4,Female,Student,No,Yes,1,2,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,Male,Business,No,No,2,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2770,Male,Business,Yes,No,2,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2771,Male,Business,No,Yes,2,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2772,Male,Business,No,No,2,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [108]:
print(f"\nFinal dataset shape: {df.shape}")
print(f"New features created: {df.shape[1] - df.shape[1] + len(cols_to_drop)}")


Final dataset shape: (2771, 22)
New features created: 8


In [109]:
df.columns

Index(['Gender', 'Occupation', 'self_employed', 'treatment', 'Days_Indoors',
       'Mood_Swings', 'Coping_Struggles', 'Growing_Stress_Yes',
       'Growing_Stress_Maybe', 'Changes_Habits_Yes', 'Changes_Habits_Maybe',
       'Mental_Health_History_Yes', 'Mental_Health_History_Maybe',
       'Work_Interest_Yes', 'Work_Interest_Maybe', 'Social_Weakness_Yes',
       'Social_Weakness_Maybe', 'mental_health_interview_Yes',
       'mental_health_interview_Maybe', 'care_options_Yes',
       'care_options_Maybe', 'MH_Family_History'],
      dtype='object')

In [110]:
# Prepare Data for Machine Learning
print("=== PREPARING DATA FOR MACHINE LEARNING ===")

# Separate features and target
X = df.drop('treatment', axis=1)
y = df['treatment']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

=== PREPARING DATA FOR MACHINE LEARNING ===
Features shape: (2771, 21)
Target shape: (2771,)


In [111]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {len(numerical_cols)}")


Categorical columns (3): ['Gender', 'Occupation', 'self_employed']
Numerical columns (18): 18


In [112]:
# Encode categorical variables using OneHotEncoder for non-ordinal categories
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

# Encode target variable
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
print(f"\nTarget encoding: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

# Get feature names after preprocessing
# Get numerical feature names
num_feature_names = numerical_cols

# Get categorical feature names (after one-hot encoding)
cat_feature_names = []
if len(categorical_cols) > 0:
    # Get the OneHotEncoder from the preprocessor
    ohe = preprocessor.named_transformers_['cat']
    cat_feature_names = ohe.get_feature_names_out(categorical_cols).tolist()

# Combine all feature names
all_feature_names = num_feature_names + cat_feature_names

print(f"\nTotal features after preprocessing: {X_processed.shape[1]}")
print(f"Feature names count: {len(all_feature_names)}")


Target encoding: {'No': np.int64(0), 'Yes': np.int64(1)}

Total features after preprocessing: 24
Feature names count: 24


In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_encoded, test_size=0.2, random_state=41, stratify=y_encoded
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training set target distribution: {np.bincount(y_train)}")
print(f"Test set target distribution: {np.bincount(y_test)}")

print("\n=== DATA PREPARATION COMPLETED ===")
print(f"Ready for model training with {X_train.shape[1]} features and {X_train.shape[0]} training samples")


Training set shape: (2216, 24)
Test set shape: (555, 24)
Training set target distribution: [1559  657]
Test set target distribution: [391 164]

=== DATA PREPARATION COMPLETED ===
Ready for model training with 24 features and 2216 training samples


In [115]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Apply LDA for dimensionality reduction
lda = LinearDiscriminantAnalysis(n_components=1)  # Since we have binary classification, max components is 1
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,confusion_matrix

models={
    "Logisitic_Regression":LogisticRegression(n_jobs=-1),
    "GNB": GaussianNB(),
    "KNN": KNeighborsClassifier(n_jobs=-1),
    "Decision_Tree":DecisionTreeClassifier(),
    "Random_Forest":RandomForestClassifier(n_jobs=-1),
    "Gradient_Boost":GradientBoostingClassifier(),
    "Adaboost":AdaBoostClassifier(),
    "Xgboost":XGBClassifier(tree_method='gpu_hist',predictor='gpu_predictor'),
    "Catboost":CatBoostClassifier(task_type='GPU',verbose=0),
    'lightgbm':LGBMClassifier(device_type='gpu',verbose=-1)
}

acc={}
rec={}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    acc[list(models.keys())[i]]=model_test_accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    rec[list(models.keys())[i]]=model_test_recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc
    cm=confusion_matrix(y_test, y_test_pred)


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    print('Confusion Matrix:')
    print(cm)

    
    print('='*35)
    print('\n')

Logisitic_Regression
Model performance for Training set
- Accuracy: 0.8651
- F1 score: 0.8653
- Precision: 0.7696
- Recall: 0.7778
- Roc Auc Score: 0.8398
----------------------------------
Model performance for Test set
- Accuracy: 0.8360
- F1 score: 0.8350
- Precision: 0.7325
- Recall: 0.7012
- Roc Auc Score: 0.7969
Confusion Matrix:
[[349  42]
 [ 49 115]]


GNB
Model performance for Training set
- Accuracy: 0.8569
- F1 score: 0.8627
- Precision: 0.6756
- Recall: 0.9954
- Roc Auc Score: 0.8970
----------------------------------
Model performance for Test set
- Accuracy: 0.8685
- F1 score: 0.8736
- Precision: 0.6936
- Recall: 0.9939
- Roc Auc Score: 0.9049
Confusion Matrix:
[[319  72]
 [  1 163]]


KNN
Model performance for Training set
- Accuracy: 0.8755
- F1 score: 0.8752
- Precision: 0.7935
- Recall: 0.7839
- Roc Auc Score: 0.8490
----------------------------------
Model performance for Test set
- Accuracy: 0.7946
- F1 score: 0.7938
- Precision: 0.6562
- Recall: 0.6402
- Roc Auc Sc

In [117]:
rec

{'Logisitic_Regression': 0.7012195121951219,
 'GNB': 0.9939024390243902,
 'KNN': 0.6402439024390244,
 'Decision_Tree': 0.5609756097560976,
 'Random_Forest': 0.5975609756097561,
 'Gradient_Boost': 0.6341463414634146,
 'Adaboost': 0.7865853658536586,
 'Xgboost': 0.6585365853658537,
 'Catboost': 0.6951219512195121,
 'lightgbm': 0.6524390243902439}

In [118]:
acc

{'Logisitic_Regression': 0.836036036036036,
 'GNB': 0.8684684684684685,
 'KNN': 0.7945945945945946,
 'Decision_Tree': 0.7639639639639639,
 'Random_Forest': 0.7639639639639639,
 'Gradient_Boost': 0.790990990990991,
 'Adaboost': 0.8342342342342343,
 'Xgboost': 0.7963963963963964,
 'Catboost': 0.8288288288288288,
 'lightgbm': 0.8054054054054054}