In [87]:
# Import Libraries
import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp, chi2_contingency
import statsmodels.stats.weightstats as sms
import os

# Download Dataset
# kagglehub returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("salahuddinahmedshuvo/student-mental-stress-and-coping-mechanisms")
print("Path to dataset files:", path)

# Locate CSV File
# os.listdir() returns entries in arbitrary order, so sort the candidates to
# make the selected file deterministic, and fail loudly (instead of hitting a
# NameError on `dataset_path` below) when the directory contains no CSV.
csv_files = sorted(entry for entry in os.listdir(path) if entry.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
dataset_path = os.path.join(path, csv_files[0])

# Load Dataset
df = pd.read_csv(dataset_path)
print(df.head())

# Data Cleaning
# The student identifier is dropped in place; it is not used as a feature.
df.drop(columns=["Student ID"], inplace=True)

# Convert "Mental Stress Level" to Binary Classification (0 = Low, 1 = High)
# Levels strictly above 5 are labelled 1, everything else 0.
df["Stress Category"] = (df["Mental Stress Level"] > 5).astype("int64")

# Encode Categorical Features
# One LabelEncoder per column is kept in `label_encoders` so the integer codes
# can be mapped back to the original labels later.
categorical_cols = [
    "Gender",
    "Counseling Attendance",
    "Stress Coping Mechanisms",
    "Family Mental Health History",
    "Medical Condition",
]
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    encoded_values = label_encoders[col].fit_transform(df[col])
    df[col] = encoded_values

# Feature Scaling
# Build the feature frame first and reuse ITS column names for the scaled
# frame. Slicing `df.columns[:-2]` is only correct when "Mental Stress Level"
# happens to be the second-to-last column; otherwise every feature name after
# it is shifted and downstream outputs (e.g. feature importances) are
# mislabelled.
feature_frame = df.drop(columns=["Mental Stress Level", "Stress Category"])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)
# Keep df's index so X stays row-aligned with y.
df_scaled = pd.DataFrame(scaled_features, columns=feature_frame.columns, index=df.index)

# Split Data
# NOTE(review): the scaler is fit on the full dataset before splitting, so
# test-set statistics leak into the training features; consider fitting the
# scaler on X_train only.
X = df_scaled
y = df["Stress Category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Machine Learning Models
# random_state is pinned on the Random Forest so that its metrics and feature
# importances are reproducible from run to run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear')
}

# Evaluate Models
# Each model is fit on the training split and scored on the held-out split;
# `results` maps model name -> {metric name -> score}.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }

# Display Model Performance
# A distinct loop variable is used so `model` keeps referring to an estimator
# rather than being rebound to a name string.
print("\nModel Performance:")
for model_name, metrics in results.items():
    print(f"{model_name}: {metrics}")

# Feature Importance (Random Forest)
# Pair each column of X with the fitted forest's importance score and rank
# them from most to least important.
feature_names = X.columns
importances = models["Random Forest"].feature_importances_
feature_importance = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values(by="Importance", ascending=False)
print("\nTop 3 Most Important Features:")
print(feature_importance.head(3))

# PCA for Dimensionality Reduction
# Project the scaled features onto two principal components and report the
# share of variance each component explains.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
explained_variance = pca.explained_variance_ratio_
print(f"\nPCA Explained Variance: {explained_variance}")

# One-Sample T-Test (Hypothesized Mean = 5)
# Tests whether the mean of "Mental Stress Level" differs from 5, using a
# 0.05 significance threshold for the printed verdict.
t_stat, p_value = ttest_1samp(df["Mental Stress Level"], popmean=5)
print(f"\nOne-Sample T-Test:\nT-Statistic: {t_stat:.4f}, P-Value: {p_value:.4f}")
print(
    "Significant difference in stress levels from mean = 5."
    if p_value < 0.05
    else "No significant difference in stress levels."
)

# Z-Test (Study Hours & Counseling Attendance)
# Compares study hours between rows whose encoded "Counseling Attendance" is 1
# and rows where it is 0.
# NOTE(review): presumably 1 = attended and 0 = did not attend after label
# encoding — confirm against the encoder's classes.
study_hours_col = "Study Hours Per Week"
attended_counseling = df.loc[df["Counseling Attendance"] == 1, study_hours_col]
not_attended = df.loc[df["Counseling Attendance"] == 0, study_hours_col]
z_stat, z_p_value = sms.ztest(attended_counseling, not_attended)
print(f"\nZ-Test:\nZ-Statistic: {z_stat:.4f}, P-Value: {z_p_value:.4f}")
print(
    "Significant difference in study hours between counseling attendees and non-attendees."
    if z_p_value < 0.05
    else "No significant difference in study hours."
)

# Chi-Square Test (Gender & Counseling Attendance)
# Cross-tabulates the two encoded columns and tests them for independence,
# using a 0.05 significance threshold for the printed verdict.
contingency_table = pd.crosstab(index=df["Gender"], columns=df["Counseling Attendance"])
chi2_stat, chi_p_value, dof, expected = chi2_contingency(contingency_table)
print(f"\nChi-Square Test:\nChi-Square Statistic: {chi2_stat:.4f}, P-Value: {chi_p_value:.4f}")
print(
    "Gender significantly influences counseling attendance."
    if chi_p_value < 0.05
    else "No significant relationship between gender and counseling attendance."
)


Path to dataset files: /root/.cache/kagglehub/datasets/salahuddinahmedshuvo/student-mental-stress-and-coping-mechanisms/versions/1
    Student ID  Age  Gender  Academic Performance (GPA)  Study Hours Per Week  \
0  802-17-3671   22  Female                           2                     9   
1  871-12-8572   25  Female                           0                    28   
2  495-13-2672   24  Female                           0                    45   
3  365-77-2496   20    Male                           2                     8   
4  664-76-5622   28    Male                           0                    14   

   Social Media Usage (Hours per day)  Sleep Duration (Hours per night)  \
0                                   2                                12   
1                                   0                                 6   
2                                   3                                12   
3                                   7                                 7   
4      