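# Architecture 1: Ensemble Baseline

Baseline text classifiers for the binary label: TF-IDF features fed to two tree ensembles, a Random Forest and an XGBoost model, each evaluated on a stratified 20% hold-out split.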

In [2]:
import pandas as pd

# Relative location of the processed, combined dataset CSV.
path = "../data/processed/combined_hybrid_student_reddit_data.csv"

# Load the full file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows as the cell output.
df.head(5)


Unnamed: 0,binary_label,clean_text
0,1,how much do you currently spend per month on y...
1,1,the first time i had my youth ahead of me.the ...
2,0,demonyita talaga hisu nyo —
3,1,i am in a place in life where i feel so numb. ...
4,1,just took 25 pills idk if imma be here when i ...


In [3]:
# Model input is the cleaned text column; the target is the binary label cast to int.
X, y = df["clean_text"], df["binary_label"].astype(int)

##### RANDOM FOREST PIPELINE

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features followed by a Random Forest classifier.
rf_pipeline = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                max_features=3000,     # keep at most 3000 vocabulary terms
                ngram_range=(1, 2),    # unigrams and bigrams
                stop_words="english",
                min_df=10,             # drop terms found in fewer than 10 documents
                max_df=0.8,            # drop terms found in more than 80% of documents
            ),
        ),
        (
            "rf",
            RandomForestClassifier(
                n_estimators=100,
                max_depth=20,
                min_samples_split=10,
                min_samples_leaf=5,
                max_features="sqrt",
                bootstrap=True,
                class_weight="balanced",  # weight classes inversely to their frequency
                random_state=42,
                # Build trees on all CPU cores, matching the XGBoost pipeline's n_jobs=-1.
                # With random_state fixed, the fitted forest does not depend on n_jobs.
                n_jobs=-1,
            ),
        ),
    ]
)

# Train
rf_pipeline.fit(X_train, y_train)

# Predict on the held-out split
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluate: per-class precision/recall/F1, then overall accuracy
print("Random Forest Baseline Results")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Baseline Results
              precision    recall  f1-score   support

           0       0.67      0.84      0.74      5582
           1       0.87      0.73      0.79      8478

    accuracy                           0.77     14060
   macro avg       0.77      0.78      0.77     14060
weighted avg       0.79      0.77      0.77     14060

Accuracy: 0.7702702702702703


In [32]:
from xgboost import XGBClassifier

# Same TF-IDF settings as the Random Forest pipeline, with an XGBoost classifier on top.
xgb_pipeline = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                ngram_range=(1, 2),
                stop_words="english",
                max_features=3000,
                min_df=10,
                max_df=0.8,
            ),
        ),
        (
            "xgb",
            XGBClassifier(
                n_estimators=150,
                learning_rate=0.1,
                max_depth=6,
                subsample=0.8,
                colsample_bytree=0.8,
                tree_method="hist",
                eval_metric="logloss",
                random_state=42,
                n_jobs=-1,
                verbosity=1,
            ),
        ),
    ]
)

# Fit on the training split, then label the held-out split.
y_pred_xgb = xgb_pipeline.fit(X_train, y_train).predict(X_test)

# Report per-class metrics followed by overall accuracy.
print("XGBoost Baseline Results")
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_xgb))


XGBoost Baseline Results
              precision    recall  f1-score   support

           0       0.84      0.65      0.73      5582
           1       0.80      0.92      0.86      8478

    accuracy                           0.81     14060
   macro avg       0.82      0.79      0.80     14060
weighted avg       0.82      0.81      0.81     14060

Accuracy: 0.8133001422475107


In [34]:
# Save both fitted pipelines (vectorizer + classifier) to disk.
from pathlib import Path

import joblib

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
Path("../models").mkdir(parents=True, exist_ok=True)

joblib.dump(xgb_pipeline, "../models/baseline_xgb_model.pkl")
joblib.dump(rf_pipeline, "../models/baseline_rf_model.pkl")


['../models/baseline_rf_model.pkl']

In [30]:
import sys
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
     ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
     --------------------------------------- 0.0/72.0 MB 640.0 kB/s eta 0:01:53
     --------------------------------------- 0.0/72.0 MB 435.7 kB/s eta 0:02:46
     --------------------------------------- 0.0/72.0 MB 245.8 kB/s eta 0:04:53
     --------------------------------------- 0.1/72.0 MB 297.7 kB/s eta 0:04:02
     --------------------------------------- 0.1/72.0 MB 326.8 kB/s eta 0:03:41
     --------------------------------------- 0.1/72.0 MB 327.7 kB/s eta 0:03:40
     --------------------------------------- 0.1/72.0 MB 343.4 kB/s eta 0:03:30
     --------------------------------------- 0.1/72.0 MB 355.0 kB/s eta 0:03:23
     --------------------------------------- 0.2/72.0 MB 339.7 kB/s eta 0:03:32
     --------------------------------------- 0.2/72.0 MB 338.5 kB/s eta 0:03:33
     --------------------------------------- 0.2


[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: C:\Users\NUGGET\mental_health_prediction\venv\Scripts\python.exe -m pip install --upgrade pip


     ------------------------------ --------- 54.0/72.0 MB 2.1 MB/s eta 0:00:09
     ------------------------------ --------- 54.3/72.0 MB 2.2 MB/s eta 0:00:09
     ------------------------------ --------- 54.4/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 54.5/72.0 MB 2.2 MB/s eta 0:00:09
     ------------------------------ --------- 54.7/72.0 MB 2.2 MB/s eta 0:00:09
     ------------------------------ --------- 54.8/72.0 MB 2.1 MB/s eta 0:00:09
     ------------------------------ --------- 55.0/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 55.2/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 55.3/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 55.4/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 55.6/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------ --------- 55.8/72.0 MB 2.2 MB/s eta 0:00:08
     ------------------------------- ---