In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the survey data and discard the 'id' column so it is not used as a feature.
df = pd.read_csv("Mental health.csv").drop(columns=['id'])
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Gender strings with integer codes.
# The same `label_encoder` object is re-fitted on other columns by later cells,
# so it only remembers the mapping of the most recently encoded column.
label_encoder = LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])


In [None]:
import category_encoders as ce

# Row count per city; the result is discarded because this is not the
# cell's last expression.
df['City'].value_counts()

# Target encoding: swap each city name for a numeric value derived from the
# Depression target, so the column reflects the city's relationship with it.
# NOTE(review): the encoder is fitted on every row, before any train/test
# split, so the encoded values carry target information into the rows later
# used for testing — confirm whether that leakage is acceptable.
encoder = ce.TargetEncoder(cols=['City'])
city_encoded = encoder.fit_transform(df['City'], df['Depression'])
df['City'] = city_encoded


In [None]:
# Collapse Profession into two categories: 'Student' and 'Non-Student'.
is_student = df['Profession'] == 'Student'
df['Profession'] = df['Profession'].where(is_student, 'Non-Student')
df['Profession'].value_counts()
# Turn the two categories into integer codes (0 / 1).
profession_codes = label_encoder.fit_transform(df['Profession'])
df['Profession'] = profession_codes

In [None]:
# Show how many rows fall into each encoded Profession value.
df['Profession'].value_counts()

In [None]:
# Sleep Duration encoding
df["Sleep Duration"].value_counts()
# Remove the rows answered 'Others', then expand the remaining categories
# into one 0/1 indicator column each.
sleep_others_idx = df.index[df["Sleep Duration"] == 'Others']
df = pd.get_dummies(
    df.drop(index=sleep_others_idx),
    columns=["Sleep Duration"],
    dtype=int,
)

In [None]:
# Dietary Habits encoding
df['Dietary Habits'].value_counts()
# Remove the rows answered 'Others', then expand the remaining categories
# into one 0/1 indicator column each.
diet_others_idx = df.index[df['Dietary Habits'] == 'Others']
df = pd.get_dummies(
    df.drop(index=diet_others_idx),
    columns=["Dietary Habits"],
    dtype=int,
)

In [None]:
# Degree encoding
df["Degree"].value_counts()

# Education levels and the raw Degree values that belong to each of them.
degree_groups = {
    'High_School': ['Class 12'],
    'Bachelors': [
        'B.Ed', 'B.Com', 'B.Arch', 'BCA', 'B.Tech', 'BHM',
        'BSc', 'B.Pharm', 'BBA', 'BA', 'BE',
    ],
    'Masters': [
        'MSc', 'MCA', 'M.Tech', 'M.Ed', 'M.Com', 'M.Pharm',
        'MA', 'ME', 'MHM',
    ],
    'Professional': ['MBBS', 'MD', 'MBA', 'LLB', 'LLM'],
    'Doctoral': ['PhD'],
    'Other': ['Others'],
}

# Flatten to the raw-value -> level lookup used by Series.map.
degree_mapping = {
    raw_degree: level
    for level, raw_degrees in degree_groups.items()
    for raw_degree in raw_degrees
}

# Any Degree value that is not a key of the mapping becomes NaN here.
# NOTE(review): confirm the raw values match these keys exactly (spacing, quoting).
df['Degree'] = df['Degree'].map(degree_mapping)

In [None]:
# Counts per grouped education level (result discarded: not the last expression).
df["Degree"].value_counts()
# One-hot encode the grouped Degree column; the new 0/1 columns are named 'edu_<level>'.
df = pd.get_dummies(df, columns=['Degree'], prefix='edu',dtype=int)

In [None]:
# Encode the suicidal-thoughts answers as integer codes.
suicidal_col = "Have you ever had suicidal thoughts ?"
df[suicidal_col].value_counts()
df[suicidal_col] = label_encoder.fit_transform(df[suicidal_col])

In [None]:
# Financial Stress encoding
df["Financial Stress"].value_counts()
# Rows holding the '?' placeholder are removed before the column is parsed as numbers.
unknown_stress_idx = df.index[df["Financial Stress"] == '?']
df = df.drop(index=unknown_stress_idx)
df['Financial Stress'] = pd.to_numeric(df['Financial Stress'])

# Report how many values are still missing after the conversion.
missing_count = df['Financial Stress'].isna().sum()
print(f"Missing after conversion: {missing_count}")

In [None]:
# Encode the family-history answers as integer codes.
family_history_col = "Family History of Mental Illness"
df[family_history_col].value_counts()
df[family_history_col] = label_encoder.fit_transform(df[family_history_col])

In [None]:
# Drop Age outliers: keep only rows whose Age lies within 1.5 * IQR of the quartiles.
Q1, Q3 = df['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# between() includes both bounds, matching a >= / <= comparison.
df = df[df['Age'].between(lower_bound, upper_bound)]

In [None]:
# Summarise the columns, dtypes and non-null counts after all preprocessing steps above.
df.info()

In [None]:
# Correlation matrix of all columns in the processed frame, drawn as an annotated heatmap.
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")

In [None]:
# chi2 feature selection: build the feature matrices and the target
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# Columns left out of the reduced feature set (the target plus the features
# that are not kept for the final models).
excluded_columns = [
    'Depression', 'Profession', 'edu_Other', 'Gender', 'Job Satisfaction',
    'Work Pressure', 'CGPA', 'edu_Doctoral',
    "Sleep Duration_'7-8 hours'", "Sleep Duration_'5-6 hours'",
    'City', 'edu_Professional', 'edu_Bachelors',
    'Dietary Habits_Moderate', 'Family History of Mental Illness',
]

y = df['Depression']
X = df.drop(columns='Depression')            # every feature
X_selected = df.drop(columns=excluded_columns)  # reduced feature set

# Min-max scale all features; the scaled matrix feeds the chi2 scoring cell.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# chi2() returns a pair; only its first element (the per-feature statistic)
# is ranked here, labelled with the column names and sorted highest first.
chi2_stats = chi2(X_scaled, y)[0]
chi2_scores = pd.Series(chi2_stats, index=X.columns).sort_values(ascending=False)
print(chi2_scores)

In [None]:
# Bar chart of the chi2 score of every feature.
chi2_scores.plot(kind='bar', figsize=(12, 6))
plt.show()

In [None]:
# Feature selection using Random Forest importances
from sklearn.ensemble import RandomForestClassifier

# The forest is randomised; a fixed seed keeps the importance ranking
# identical across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Label each importance with its column name and rank from most to least important.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
importances.plot.bar(figsize=(12, 6))
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree too: without random_state the fitted tree (and therefore the
# report below) can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree

# `dt` was fitted without a depth limit; max_depth below only restricts how
# many levels are drawn so the figure stays readable.
plt.figure(figsize=(22, 14))
plot_tree(
    dt,
    fontsize=14,
    feature_names=list(X.columns),  # plain list of str rather than a pandas Index
    max_depth=4,
)
# The title describes what is shown; it must not claim the tree was trained to depth 4.
plt.title("Decision Tree – First 4 Levels Shown", fontsize=16)
plt.show()

In [None]:
# Confusion matrix for the most recent predictions (decision tree cell above).
actual, predicted = y_test, y_pred
confusion_mtx = confusion_matrix(actual, predicted)
cm_display = ConfusionMatrixDisplay(
    confusion_mtx,
    display_labels=['no deppression', 'depression'],
)
cm_display.plot()
plt.show()

In [None]:
# Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Same 80/20 split parameters and seed as the decision-tree cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# fit() returns the estimator itself, so construction and training can be chained.
clt = GaussianNB().fit(X_train, y_train)
y_pred = clt.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the most recent predictions (Naive Bayes cell above).
actual, predicted = y_test, y_pred
confusion_mtx = confusion_matrix(actual, predicted)
cm_display = ConfusionMatrixDisplay(
    confusion_mtx,
    display_labels=['no deppression', 'depression'],
)
cm_display.plot()
plt.show()

In [None]:
# Logistic regression on the current train/test split
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 for the solver.
Lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = Lr.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the most recent predictions (logistic regression cell above).
actual, predicted = y_test, y_pred
confusion_mtx = confusion_matrix(actual, predicted)
cm_display = ConfusionMatrixDisplay(
    confusion_mtx,
    display_labels=['no deppression', 'depression'],
)
cm_display.plot()
plt.show()

In [None]:
# XGBoost on the current train/test split
import xgboost as xgb

# The classifier is trained and queried on plain NumPy arrays.
train_features, train_labels = np.array(X_train), np.array(y_train)
test_features = np.array(X_test)

xg_cl = xgb.XGBClassifier(eval_metric='logloss')
xg_cl.fit(train_features, train_labels)
y_pred = xg_cl.predict(test_features)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the most recent predictions (XGBoost cell above).
actual, predicted = y_test, y_pred
confusion_mtx = confusion_matrix(actual, predicted)
cm_display = ConfusionMatrixDisplay(
    confusion_mtx,
    display_labels=['no deppression', 'depression'],
)
cm_display.plot()
plt.show()

In [None]:
# ===============================
# ML Model Comparison: Depression Prediction
# ===============================

# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier


# ===============================
# 1. Prepare Data
# ===============================
# Target column is 'Depression'; X holds every other column.
X = df.drop('Depression', axis=1)
y = df['Depression']

# Split the reduced feature set BEFORE scaling so the evaluation scaler only
# ever sees training rows (fitting it on all rows would leak test statistics
# into the comparison below).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)
eval_scaler = StandardScaler()
X_train = eval_scaler.fit_transform(X_train_raw)
X_test = eval_scaler.transform(X_test_raw)

# Scaler and matrix fitted on ALL rows. They are not used for the evaluation
# in this cell; the deployment cells further down train and persist the final
# model with `scaler` / `X_scaled`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# ===============================
# 2. Define Models and Hyperparameters
# ===============================
# The dict is named `models` and the loop variable `clf` so the loop does not
# overwrite the dict; reusing one name for both made this cell fail when re-run.
models = {
    # liblinear solver for the binary target; C controls regularisation strength
    "Logistic Regression": LogisticRegression(solver='liblinear', C=1.0),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, criterion='gini', random_state=42),
    "Gaussian Naive Bayes": GaussianNB(),
    # n_estimators: number of trees, learning_rate: lower values reduce
    # overfitting, max_depth: depth of each tree, logloss for binary classification
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, eval_metric='logloss', random_state=42),
}

# ===============================
# 3. Train, Predict, and Evaluate
# ===============================
results = []

for name, clf in models.items():
    clf.fit(np.array(X_train), np.array(y_train))
    y_pred = clf.predict(X_test)

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0)
    })

# ===============================
# 4. Display Results
# ===============================
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df.sort_values(by="Accuracy", ascending=False).reset_index(drop=True))


In [None]:
# Train the logistic regression model used for deployment on the full scaled
# feature matrix (no hold-out here).
deployment_params = {'solver': 'liblinear', 'C': 1.0}
LogisticRegression_model = LogisticRegression(**deployment_params)
LogisticRegression_model.fit(X_scaled, y)



In [None]:
import tkinter as tk
from tkinter import messagebox
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import joblib

# =========================
# Persist, then reload, the trained model and its scaler
# =========================
joblib.dump(LogisticRegression_model, "lr_LogisticRegression_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Reloading proves the saved artefacts are usable on their own.
# NOTE: joblib.load unpickles arbitrary code; only load files written by this notebook.
LogisticRegression_model = joblib.load("lr_LogisticRegression_model.pkl")
scaler = joblib.load("scaler.pkl")

# =========================
# Input features
# =========================
# Fallback list, used only when the scaler does not record its column names.
features = [

    'Age', 'Academic Pressure', 'Study Satisfaction',
    'Have you ever had suicidal thoughts ?',
    'Work/Study Hours', 'Financial Stress',
    "Sleep Duration_'Less than 5 hours'", 
    "Sleep Duration_'More than 8 hours'",
    'Dietary Habits_Healthy', 'Dietary Habits_Unhealthy', 
    'edu_Masters'

    ]

# Prefer the exact columns (and order) the scaler was fitted on, so the form
# cannot drift out of sync with the model's expected inputs.
fitted_feature_names = getattr(scaler, "feature_names_in_", None)
if fitted_feature_names is not None:
    features = list(fitted_feature_names)

# =========================
# GUI
# =========================
def predict():
    """Read the form, scale the values and show the model's prediction.

    Shows an error dialog (instead of raising) when a field is not numeric
    or when the scaler/model rejects the input row.
    """
    # Step 1: parse the fields; only this step can fail on non-numeric text.
    try:
        inputs = [float(entry.get()) for entry in entries]
    except ValueError:
        messagebox.showerror("Error", "Please enter valid numeric values!")
        return

    # Step 2: scale and predict; failures here are model/shape problems and
    # are reported as such rather than blamed on the user's typing.
    try:
        if fitted_feature_names is not None:
            # Named columns match how the scaler was fitted.
            row = pd.DataFrame([inputs], columns=features)
        else:
            row = [inputs]
        inputs_scaled = scaler.transform(row)

        pred = LogisticRegression_model.predict(inputs_scaled)[0]
        prob = LogisticRegression_model.predict_proba(inputs_scaled)[0][1]
    except ValueError as exc:
        messagebox.showerror("Error", f"Prediction failed: {exc}")
        return

    # Display
    messagebox.showinfo("Prediction", f"Depression Status: {pred}\nProbability: {prob:.2f}")

# Create main window
root = tk.Tk()
root.title("Depression Prediction (Logistic Regression)")

entries = []

# One label + entry field per feature, in the order the model expects them.
for i, feature in enumerate(features):
    tk.Label(root, text=feature).grid(row=i, column=0, padx=10, pady=5, sticky="w")
    entry = tk.Entry(root)
    entry.grid(row=i, column=1, padx=10, pady=5)
    entries.append(entry)

# Predict button
tk.Button(root, text="Predict", command=predict).grid(row=len(features), column=0, columnspan=2, pady=10)

# Blocks this cell until the window is closed.
root.mainloop()
