<a href="https://colab.research.google.com/github/AkulaTejdeep/DAUP37/blob/main/PE1_1_T_P_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_1samp
from statsmodels.stats.weightstats import ztest

file_path = "/content/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

df = df.drop(columns=["Student ID"])

categorical_cols = ["Gender", "Counseling Attendance", "Stress Coping Mechanisms",
                    "Family Mental Health History", "Medical Condition"]
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col in categorical_cols:
    df[col] = label_encoders[col].fit_transform(df[col])

scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=["int64"]).columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

X = df.drop(columns=["Mental Stress Level"])
y = df["Mental Stress Level"]

y_binary = (y > y.median()).astype(int)

X_train, X_test, y_train_bin, y_test_bin = train_test_split(X, y_binary, test_size=0.2, random_state=42)

log_reg = LogisticRegression()
rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

log_reg.fit(X_train, y_train_bin)
rf_clf.fit(X_train, y_train_bin)
svm_clf.fit(X_train, y_train_bin)

y_pred_log = log_reg.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)
y_pred_svm = svm_clf.predict(X_test)

def evaluate_model(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-score": f1_score(y_true, y_pred)
    }

log_reg_results = evaluate_model(y_test_bin, y_pred_log)
rf_results = evaluate_model(y_test_bin, y_pred_rf)
svm_results = evaluate_model(y_test_bin, y_pred_svm)

feature_importances = pd.Series(rf_clf.feature_importances_, index=X.columns).sort_values(ascending=False)

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

t_stat, p_value = ttest_1samp(df["Mental Stress Level"], popmean=5)

df["Counseling Attendance"] = (df["Counseling Attendance"] > 0).astype(int)

study_hours_yes = df[df["Counseling Attendance"] == 1]["Study Hours Per Week"]
study_hours_no = df[df["Counseling Attendance"] == 0]["Study Hours Per Week"]
z_stat, p_value_z = ztest(study_hours_yes, study_hours_no)

results = {
    "Logistic Regression": log_reg_results,
    "Random Forest": rf_results,
    "SVM": svm_results,
    "Top 3 Features": feature_importances.head(3).to_dict(),
    "T-test": {"T-statistic": t_stat, "P-value": p_value},
    "Z-test": {"Z-statistic": z_stat, "P-value": p_value_z}
}

results

{'Logistic Regression': {'Accuracy': 0.48026315789473684,
  'Precision': 0.43283582089552236,
  'Recall': 0.4142857142857143,
  'F1-score': 0.4233576642335766},
 'Random Forest': {'Accuracy': 0.5197368421052632,
  'Precision': 0.4716981132075472,
  'Recall': 0.35714285714285715,
  'F1-score': 0.4065040650406504},
 'SVM': {'Accuracy': 0.5263157894736842,
  'Precision': 0.48214285714285715,
  'Recall': 0.38571428571428573,
  'F1-score': 0.42857142857142855},
 'Top 3 Features': {'Study Hours Per Week': 0.12611619914486588,
  'Physical Exercise (Hours per week)': 0.08200985231079133,
  'Age': 0.0770955199977506},
 'T-test': {'T-statistic': np.float64(-137.74977313955912),
  'P-value': np.float64(0.0)},
 'Z-test': {'Z-statistic': np.float64(1.1762342138083057),
  'P-value': np.float64(0.2395012959973013)}}