In [14]:
# 💻 STEP 1: Imports
import ast
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

# 💾 STEP 2: Load Data
# Reads Dataset.csv relative to the current working directory.
df = pd.read_csv("Dataset.csv")

# 🧹 STEP 3: Preprocess
# Convert stringified lists into actual lists (stored like "['anxiety', 'depression']").
# ast.literal_eval only accepts Python literals, so a crafted cell in the CSV
# cannot run arbitrary code the way eval() would.
df["diagnosis_tags"] = df["diagnosis_tags"].apply(ast.literal_eval)

# Rename for consistency: only the columns whose raw header differs from the
# snake_case name used below are listed. Columns already in that form
# (age, gender, sociability, anxiety, motivation, diagnosis_tags) are left as-is.
df.rename(columns={
    "stress level": "stress_level",
    "sleep hours": "sleep_hours",
    "emotional stability (Mood)": "emotional_stability",
    "self-esteem": "self_esteem",
    "eating habits": "eating_habits",
    "substance abuse (Including Self-Harm)": "substance_abuse",
}, inplace=True)

# Encode Gender → Male: 0, Female: 1, Other: 2
# NOTE(review): any value not in this map (different casing, blanks, ...) becomes NaN.
df["gender"] = df["gender"].map({"Male": 0, "Female": 1, "Other": 2})

# Encode substance abuse (Yes/No or 1/0).
# The numeric keys keep an already-encoded 1/0 column intact; without them every
# 1 would map to NaN and then be silently turned into 0 by fillna.
# Anything unrecognised still falls back to 0.
substance_abuse_codes = {"Yes": 1, "No": 0, 1: 1, 0: 0}
df["substance_abuse"] = df["substance_abuse"].map(substance_abuse_codes).fillna(0)

# 🎯 STEP 4: Define Features & Labels
# The order of this list is the column order of the training matrix.
features = [
    "age", "gender",
    "stress_level", "sleep_hours",
    "sociability", "anxiety",
    "emotional_stability", "self_esteem", "motivation",
    "eating_habits", "substance_abuse",
]

# Feature matrix (selected columns only) and the raw per-row tag collections.
X = df.loc[:, features]
y_raw = df["diagnosis_tags"]

# 🎯 STEP 5: Binarize Tags
# One indicator column per distinct tag; mlb.classes_ holds the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

# ✨ STEP 6: Train-Test Split
# 80/20 split with a fixed seed. X_test / y_test are produced here but not
# used anywhere in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 🧠 STEP 7: Create Pipeline
# Scale the features, then fit one random forest per tag column.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
steps = [
    ("scaler", StandardScaler()),
    ("classifier", MultiOutputClassifier(forest)),
]
pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

# ✅ STEP 8: Save Artifacts
# Persist the fitted pipeline and the label binarizer next to each other.
for artifact, target_path in (
    (pipeline, "mindmentor_model.joblib"),
    (mlb, "mindmentor_mlb.joblib"),
):
    joblib.dump(artifact, target_path)

# Also save feature order & tag names for future reference
metadata = dict(
    features=features,
    tag_names=mlb.classes_.tolist(),
)

with open("model_metadata.json", "w") as handle:
    json.dump(metadata, handle)

# Summary of what was written, one line per print call.
summary_lines = (
    "🎉 Model training complete. Files saved:",
    "• mindmentor_model.joblib",
    "• mindmentor_mlb.joblib",
    "• model_metadata.json",
)
for line in summary_lines:
    print(line)


🎉 Model training complete. Files saved:
• mindmentor_model.joblib
• mindmentor_mlb.joblib
• model_metadata.json
