In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer



# Load the preprocessed dataset
data = pd.read_csv("dataset/Mental_Health_Dataset.csv")

# # Separate features and target variable
# X = data.drop(columns=["classification"])
# y = data["classification"]

In [2]:
data.dtypes

Timestamp                  object
Gender                     object
Country                    object
Occupation                 object
self_employed              object
family_history             object
treatment                  object
Days_Indoors               object
Growing_Stress             object
Changes_Habits             object
Mental_Health_History      object
Mood_Swings                object
Coping_Struggles           object
Work_Interest              object
Social_Weakness            object
mental_health_interview    object
care_options               object
dtype: object

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292364 entries, 0 to 292363
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Timestamp                292364 non-null  object
 1   Gender                   292364 non-null  object
 2   Country                  292364 non-null  object
 3   Occupation               292364 non-null  object
 4   self_employed            287162 non-null  object
 5   family_history           292364 non-null  object
 6   treatment                292364 non-null  object
 7   Days_Indoors             292364 non-null  object
 8   Growing_Stress           292364 non-null  object
 9   Changes_Habits           292364 non-null  object
 10  Mental_Health_History    292364 non-null  object
 11  Mood_Swings              292364 non-null  object
 12  Coping_Struggles         292364 non-null  object
 13  Work_Interest            292364 non-null  object
 14  Social_Weakness     

In [4]:
data.shape

(292364, 17)

In [5]:
data.describe(include="all")

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
count,292364,292364,292364,292364,287162,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364
unique,580,2,35,5,2,2,2,5,3,3,3,3,2,3,3,3,3
top,8/27/2014 11:43,Male,United States,Housewife,No,No,Yes,1-14 days,Maybe,Yes,No,Medium,No,No,Maybe,No,No
freq,2384,239850,171308,66351,257994,176832,147606,63548,99985,109523,104018,101064,154328,105843,103393,232166,118886


In [6]:
data.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,8/27/2014 11:29,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,8/27/2014 11:31,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,8/27/2014 11:32,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


In [7]:
from sklearn.preprocessing import LabelEncoder


# Load the dataset
data = pd.read_csv("dataset/Mental_Health_Dataset.csv")

# Drop irrelevant columns like Timestamp
data.drop(columns=['Timestamp'], inplace=True)

# Fill missing values in 'self_employed' column with the mode
data['self_employed'].fillna(data['self_employed'].mode()[0], inplace=True)

# Convert categorical columns to numerical using LabelEncoder
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Split the data into features and target
X = data.drop(columns=['treatment'])
y = data['treatment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now you can use X_train, X_test, y_train, y_test for further modeling


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7286952952644811

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.70      0.72     28895
           1       0.72      0.76      0.74     29578

    accuracy                           0.73     58473
   macro avg       0.73      0.73      0.73     58473
weighted avg       0.73      0.73      0.73     58473



In [11]:
from sklearn.linear_model import LogisticRegression


# Logistic Regression classifier
log_reg_classifier = LogisticRegression(random_state=42)

# Train the model
log_reg_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.7032818565833804

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.69      0.70     28895
           1       0.70      0.72      0.71     29578

    accuracy                           0.70     58473
   macro avg       0.70      0.70      0.70     58473
weighted avg       0.70      0.70      0.70     58473



In [12]:
# Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model
dt_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7253775246695056

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.72      0.72     28895
           1       0.73      0.73      0.73     29578

    accuracy                           0.73     58473
   macro avg       0.73      0.73      0.73     58473
weighted avg       0.73      0.73      0.73     58473



In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for KNN Classifier
knn_y_pred = knn_classifier.predict(X_test)

# Evaluate KNN Classifier
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.6682229405024541
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.67      0.66     28895
           1       0.67      0.67      0.67     29578

    accuracy                           0.67     58473
   macro avg       0.67      0.67      0.67     58473
weighted avg       0.67      0.67      0.67     58473



In [14]:
from sklearn.neighbors import KNeighborsClassifier



# K-Nearest Neighbors classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.6682229405024541

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.67      0.66     28895
           1       0.67      0.67      0.67     29578

    accuracy                           0.67     58473
   macro avg       0.67      0.67      0.67     58473
weighted avg       0.67      0.67      0.67     58473



In [16]:
# Gradient Boosting Machines (GBM) classifier - XGBoost
xgb_classifier = XGBClassifier(random_state=42)

# Train the model
xgb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7747507396576198

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76     28895
           1       0.75      0.83      0.79     29578

    accuracy                           0.77     58473
   macro avg       0.78      0.77      0.77     58473
weighted avg       0.78      0.77      0.77     58473



In [17]:
from sklearn.svm import SVC



# Support Vector Machines (SVM) classifier
svm_classifier = SVC(kernel='rbf', random_state=42)

# Train the model
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.6787748191472988

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.79      0.71     28895
           1       0.73      0.57      0.64     29578

    accuracy                           0.68     58473
   macro avg       0.69      0.68      0.68     58473
weighted avg       0.69      0.68      0.68     58473



In [19]:
from sklearn.svm import OneClassSVM




# Filter out only the instances with the majority class (assuming 0 represents the majority class)
X_train_normal = X_train[y_train == 0]

# OneClassSVM for outlier detection
one_class_svm = OneClassSVM(kernel='rbf', gamma='auto')
one_class_svm.fit(X_train_normal)

# Predict on the test set
y_pred = one_class_svm.predict(X_test)
# Convert prediction labels (-1 for outliers, 1 for inliers) to match with actual labels
y_pred[y_pred == 1] = 0  # Inliers
y_pred[y_pred == -1] = 1  # Outliers

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5100131684709182

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50     28895
           1       0.52      0.52      0.52     29578

    accuracy                           0.51     58473
   macro avg       0.51      0.51      0.51     58473
weighted avg       0.51      0.51      0.51     58473



In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Neural Network architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Evaluate the model
y_pred_proba = model.predict(X_test)
y_pred = np.round(y_pred_proba).flatten()

# Convert predicted probabilities to binary predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - accuracy: 0.5967 - loss: 0.7370
Epoch 2/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 955us/step - accuracy: 0.6975 - loss: 0.5948
Epoch 3/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 996us/step - accuracy: 0.7014 - loss: 0.5857
Epoch 4/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 990us/step - accuracy: 0.7063 - loss: 0.5795
Epoch 5/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1000us/step - accuracy: 0.7103 - loss: 0.5742
Epoch 6/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 986us/step - accuracy: 0.7110 - loss: 0.5726
Epoch 7/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 964us/step - accuracy: 0.7107 - loss: 0.5730
Epoch 8/10
[1m7310/7310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 957us/step - accuracy: 0.7145 - loss: 0.5682
Epoch 9/

In [20]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))


KeyboardInterrupt: 

In [21]:




# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


[LightGBM] [Info] Number of positive: 118028, number of negative: 115863
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 233891, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504628 -> initscore=0.018513
[LightGBM] [Info] Start training from score 0.018513
[LightGBM] [Info] Number of positive: 118028, number of negative: 115863
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 233891, number of used features: 15
[LightGBM] [Info] 