In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib as jl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
# KNearestNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [38]:
data = pd.read_csv('Student Mental Health.csv')
data.head()

Unnamed: 0,Name,Gender,Age,Education Level,Screen Time (hrs/day),Sleep Duration (hrs),Physical Activity (hrs/week),Stress Level,Anxious Before Exams,Academic Performance Change
0,Aarav,Male,15,Class 8,7.1,8.9,9.3,Medium,No,Same
1,Meera,Female,25,MSc,3.3,5.0,0.2,Medium,No,Same
2,Ishaan,Male,20,BTech,9.5,5.4,6.2,Medium,No,Same
3,Aditya,Male,20,BA,10.8,5.6,5.5,High,Yes,Same
4,Anika,Female,17,Class 11,2.8,5.4,3.1,Medium,Yes,Same


In [39]:
# Removing Unnecessary column

data.drop(['Name'], axis=1, inplace=True)
data.head()

Unnamed: 0,Gender,Age,Education Level,Screen Time (hrs/day),Sleep Duration (hrs),Physical Activity (hrs/week),Stress Level,Anxious Before Exams,Academic Performance Change
0,Male,15,Class 8,7.1,8.9,9.3,Medium,No,Same
1,Female,25,MSc,3.3,5.0,0.2,Medium,No,Same
2,Male,20,BTech,9.5,5.4,6.2,Medium,No,Same
3,Male,20,BA,10.8,5.6,5.5,High,Yes,Same
4,Female,17,Class 11,2.8,5.4,3.1,Medium,Yes,Same


# Encoding

In [40]:
# Label Encoding
le = LabelEncoder()

columns = ["Stress Level", "Anxious Before Exams"]

for column in columns:
    data[column] = le.fit_transform(data[column])

data.head()


Unnamed: 0,Gender,Age,Education Level,Screen Time (hrs/day),Sleep Duration (hrs),Physical Activity (hrs/week),Stress Level,Anxious Before Exams,Academic Performance Change
0,Male,15,Class 8,7.1,8.9,9.3,2,0,Same
1,Female,25,MSc,3.3,5.0,0.2,2,0,Same
2,Male,20,BTech,9.5,5.4,6.2,2,0,Same
3,Male,20,BA,10.8,5.6,5.5,0,1,Same
4,Female,17,Class 11,2.8,5.4,3.1,2,1,Same


In [41]:
# One Hot Encoding

columns_2 = ["Gender", "Education Level", "Academic Performance Change"]
encoder = OneHotEncoder()

# Perform One Hot Encoding and add the new columns to the dataframe
for column_2 in columns_2:
    encoded = encoder.fit_transform(data[[column_2]]).toarray()
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([column_2]))
    data = pd.concat([data.drop(column_2, axis=1), encoded_df], axis=1)

data.head()


Unnamed: 0,Age,Screen Time (hrs/day),Sleep Duration (hrs),Physical Activity (hrs/week),Stress Level,Anxious Before Exams,Gender_Female,Gender_Male,Gender_Other,Education Level_BA,...,Education Level_Class 11,Education Level_Class 12,Education Level_Class 8,Education Level_Class 9,Education Level_MA,Education Level_MSc,Education Level_MTech,Academic Performance Change_Declined,Academic Performance Change_Improved,Academic Performance Change_Same
0,15,7.1,8.9,9.3,2,0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,25,3.3,5.0,0.2,2,0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,20,9.5,5.4,6.2,2,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,20,10.8,5.6,5.5,0,1,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,17,2.8,5.4,3.1,2,1,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Preprocessing

In [42]:
# Outlier detection

def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]

outlier_col = [col for col in data.columns if data[col].dtype in [np.int64, np.float64]]
for col in outlier_col:
    outliers = detect_outliers_iqr(data, col)
    if not outliers.empty:
        # print(f"Outliers detected in column '{col}':")
        # print(outliers)
        # remove outliers
        data = data[~data.index.isin(outliers.index)]
# Displaying the cleaned data
print("Data after removing outliers:")
data.head()
data.shape

# # Plotting Gender_Other Box plot
# plt.figure(figsize=(10, 6))
# sns.boxplot(x=data["Gender_Other"])
# plt.title("Box Plot of Gender_Other")
# plt.xlabel("Gender_Other")
# plt.show()


Data after removing outliers:


(398, 23)

# ML

In [54]:
# Spliting data into features and target variable
X = data.drop(['Stress Level'], axis=1)
y = data['Stress Level']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Creating and training the Random Forest Classifier model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
# Making predictions on the test set
y_pred = model.predict(X_test)
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Saving the model
jl.dump(model, 'mental_health_model.pkl')



Accuracy: 0.53
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        12
           1       0.47      0.31      0.37        26
           2       0.54      0.81      0.65        42

    accuracy                           0.53        80
   macro avg       0.34      0.37      0.34        80
weighted avg       0.44      0.53      0.46        80

Confusion Matrix:
[[ 0  1 11]
 [ 0  8 18]
 [ 0  8 34]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['mental_health_model.pkl']