# 🌲 Random Forest Classifier - Music Effects Prediction

This notebook focuses on building and evaluating the **Random Forest Classifier** for predicting music effects on mental wellbeing.

## 🎯 Objective
Train a Random Forest model to predict the effect of music on users' mental health — one of three classes (Improve / No effect / Worsen) — based on their psychological profile and listening habits.

## 📋 Workflow
1. Load and prepare data
2. Feature engineering
3. Train Random Forest model
4. Evaluate performance
5. Analyze feature importance
6. Save model artifacts

---

## 📦 Import Libraries

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    precision_recall_fscore_support
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model persistence
import joblib
import os

# Plot styling: matplotlib's bundled seaborn-like style sheet plus the 'husl' colour cycle
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

# Confirmation banner so a fresh "Run All" shows that the setup cell finished
for banner_line in (
    '✅ Libraries imported successfully!',
    '🌲 Ready to build Random Forest model...',
):
    print(banner_line)

## 📊 Load Data

In [None]:
# Load the MXMH survey dataset.
# The cleaned export is preferred; the raw survey file is only used as a fallback
# when the cleaned file does not exist.
processed_csv = '../data/processed/dataset_clean.csv'
raw_csv = '../data/raw/mxmh_survey_results.csv'

try:
    df = pd.read_csv(processed_csv)
except FileNotFoundError:
    print("⚠️ Processed dataset not found, loading raw data...")
    df = pd.read_csv(raw_csv)
    print("✅ Loaded raw dataset")
else:
    print("✅ Loaded processed dataset")

n_rows, n_cols = df.shape
print(f"\n📋 Dataset Shape: {df.shape}")
print(f"👥 Samples: {n_rows}")
print(f"📊 Features: {n_cols}")

# Last expression of the cell: rich display of the first few rows
df.head()

## 🧹 Data Preprocessing

In [None]:
# Handle BPM outliers.
# Values outside [BPM_MIN, BPM_MAX] (and missing values) are treated as invalid and
# replaced by the median of the *valid* readings, so the implausible values being
# replaced do not influence the replacement itself.
BPM_MIN, BPM_MAX = 40, 300
if 'BPM' in df.columns:
    bpm_is_valid = df['BPM'].between(BPM_MIN, BPM_MAX)  # NaN compares as False
    median_bpm = df.loc[bpm_is_valid, 'BPM'].median()
    df['BPM'] = df['BPM'].where(bpm_is_valid, median_bpm)
    print(f"✅ BPM outliers handled (median: {median_bpm})")

# The target column is mandatory for everything that follows; fail early with a
# clear message instead of a bare KeyError further down in this cell.
if 'Music effects' not in df.columns:
    raise KeyError("Required target column 'Music effects' is missing from the dataset")

# Remove rows with missing target
original_len = len(df)
df = df.dropna(subset=['Music effects'])
print(f"✅ Dropped {original_len - len(df)} rows with missing target")

# Check target distribution (absolute counts, then proportions)
print(f"\n🎯 Target Distribution:")
print(df['Music effects'].value_counts())
print(f"\n📊 Target Proportions:")
print(df['Music effects'].value_counts(normalize=True))

## 🔧 Feature Engineering

In [None]:
# Candidate predictors, grouped by how they are handled downstream:
# numeric columns are used as-is, categorical columns are one-hot encoded later.
numeric_features = [
    'Age', 'Hours per day', 'BPM',
    'Anxiety', 'Depression', 'Insomnia', 'OCD',
]
categorical_features = [
    'Fav genre', 'Primary streaming service', 'While working',
    'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages',
]

# Keep only the candidates that actually exist in the loaded frame
# (the processed and raw files may not carry the same columns).
present_columns = set(df.columns)
available_numeric = [name for name in numeric_features if name in present_columns]
available_categorical = [name for name in categorical_features if name in present_columns]

print(f"📊 Numeric features ({len(available_numeric)}): {available_numeric}")
print(f"📊 Categorical features ({len(available_categorical)}): {available_categorical}")

# Extract features and impute missing values:
#   numeric     -> per-column median
#   categorical -> explicit 'Unknown' level
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
X_numeric = df[available_numeric].copy()
numeric_medians = X_numeric.median()
X_numeric = X_numeric.fillna(numeric_medians)

X_categorical = df[available_categorical].copy()
X_categorical = X_categorical.fillna('Unknown')

print("\n✅ Missing values handled")

In [None]:
# One-hot encoding for categorical variables.
# `columns=` is passed explicitly: without it, get_dummies only encodes
# object/string/category columns, so a categorical feature stored as bool or as a
# number would be skipped and the `prefix` list would no longer match the number
# of encoded columns (ValueError). drop_first=True drops one level per feature.
print("🔄 Encoding categorical variables...")
X_categorical_encoded = pd.get_dummies(
    X_categorical,
    columns=available_categorical,
    prefix=available_categorical,
    drop_first=True,
)

# Combine numeric and encoded categorical features column-wise
# (both frames were derived from `df`, so they share its index)
X = pd.concat([X_numeric, X_categorical_encoded], axis=1)

print(f"\n📊 Final feature matrix: {X.shape}")
print(f"📋 Total features: {len(X.columns)}")

# Store feature names: the column order seen at training time is persisted later
feature_names = list(X.columns)
print(f"\n✅ Feature engineering complete!")

## 🎯 Prepare Target Variable

In [None]:
# Encode the string target into integer class codes.
# The fitted encoder is kept so predictions can be mapped back to class names.
y = df['Music effects'].copy()
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Report the code assigned to each class together with its frequency
print("🎯 Target Encoding:")
n_targets = len(y_encoded)
for code, label in enumerate(label_encoder.classes_):
    n_label = (y_encoded == code).sum()
    share_pct = n_label / n_targets * 100
    print(f"  {label} -> {code} ({n_label} samples, {share_pct:.1f}%)")

print("\n✅ Target variable encoded!")

## ⚙️ Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on the encoded target keeps
# the class proportions similar in both partitions. Fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

print("🔄 Data split complete!")
print(f"\n📊 Training set: {X_train.shape}")
print(f"📊 Test set: {X_test.shape}")

# Verify stratification: print per-class counts and shares for each partition
for header, split_targets in (
    ("\n📈 Training set distribution:", y_train),
    ("\n📈 Test set distribution:", y_test),
):
    print(header)
    class_codes, class_counts = np.unique(split_targets, return_counts=True)
    n_split = len(split_targets)
    for class_idx, count in zip(class_codes, class_counts):
        print(f"  {label_encoder.classes_[class_idx]}: {count} ({count/n_split*100:.1f}%)")

## 🌲 Train Random Forest Model

In [None]:
print("🌲 Training Random Forest Classifier...\n")

# Hyperparameters gathered in one place so they are easy to read and tune
rf_params = dict(
    n_estimators=200,       # number of trees in the forest
    max_depth=10,           # cap on the depth of each tree
    min_samples_split=5,    # minimum samples a node needs before it may split
    random_state=42,        # fixed seed for reproducible results
    n_jobs=-1,              # use all available cores
)

# Build and fit the model on the training partition
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

print("✅ Model training complete!")
print(f"\n🎯 Model Configuration:")
for config_line in (
    f"  • Trees: {rf_model.n_estimators}",
    f"  • Max depth: {rf_model.max_depth}",
    f"  • Features: {len(feature_names)}",
    f"  • Classes: {len(label_encoder.classes_)}",
):
    print(config_line)

## 📊 Model Evaluation

In [None]:
# Predict on both partitions so training and held-out accuracy can be compared
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
accuracy_gap = train_accuracy - test_accuracy

print("📊 MODEL PERFORMANCE")
print("="*50)
print(f"Training Accuracy:   {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:    {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"Accuracy Difference: {accuracy_gap:.4f}")

# Heuristic overfitting check: flag a train/test gap above 10 percentage points
gap_is_large = accuracy_gap > 0.1
print("\n⚠️ Potential overfitting detected" if gap_is_large else "\n✅ Good generalization")

In [None]:
# Detailed classification report (per-class precision / recall / F1 on the test set)
print("\n📋 CLASSIFICATION REPORT")
print("="*70)
class_names = label_encoder.classes_
report = classification_report(
    y_test,
    y_test_pred,
    # Pin the label set to the encoder's codes: if labels were inferred from the data
    # and a class were absent from both y_test and the predictions, `target_names`
    # would no longer line up with the reported classes and the call would fail.
    labels=np.arange(len(class_names)),
    target_names=class_names,
    # A class that is never predicted has undefined precision; report it as 0
    # explicitly instead of relying on the (globally silenced) warning path.
    zero_division=0,
)
print(report)

## 🔥 Confusion Matrix

In [None]:
# Generate confusion matrix.
# `labels` is pinned to one code per entry of `class_names` so the matrix is always
# len(class_names) x len(class_names); with inferred labels, a class missing from
# both y_test and the predictions would shrink the matrix, mislabel the heatmap
# ticks and make the cm[i, j] loop below index out of bounds.
cm = confusion_matrix(y_test, y_test_pred, labels=np.arange(len(class_names)))

# Visualize: rows are true classes, columns are predicted classes
plt.figure(figsize=(10, 8))
sns.heatmap(cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Random Forest Classifier', fontsize=14, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.show()

# Analysis: list every non-empty cell, diagonal = correct, off-diagonal = errors
print("\n🔍 Confusion Matrix Analysis:")
print("="*50)
for i, true_class in enumerate(class_names):
    for j, pred_class in enumerate(class_names):
        count = cm[i, j]
        if count <= 0:
            continue  # nothing to report for empty cells
        if i == j:
            print(f"✅ {true_class:12} correctly predicted: {count:3} samples")
        else:
            print(f"❌ {true_class:12} → {pred_class:12}: {count:3} misclassifications")

## 📊 Feature Importance Analysis

In [None]:
# Pair every feature with the importance reported by the fitted forest and
# rank them from most to least important
feature_importance = rf_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
)

# Text ranking with a simple bar (one block per 0.005 of importance)
print("🔍 TOP 15 MOST IMPORTANT FEATURES")
print("="*60)
top_15 = feature_importance_df.head(15)
for rank, (name, score) in enumerate(zip(top_15['feature'], top_15['importance']), start=1):
    bar = '█' * int(score * 200)
    print(f"{rank:2d}. {name:30} {score:.4f} {bar}")

# Visualize top 10 as a horizontal bar chart
top_features = feature_importance_df.head(10)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis', ax=ax)
ax.set_title('Top 10 Feature Importances - Random Forest', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 🏷️ Feature Category Analysis

In [None]:
# Group features into coarse categories. The two encoded groups are picked up by
# substring match on the one-hot column names.
mental_health_features = ['Anxiety', 'Depression', 'Insomnia', 'OCD']
music_features = ['Hours per day', 'BPM', 'Age']
genre_features = [name for name in feature_names if 'Fav genre' in name]
platform_features = [name for name in feature_names if 'Primary streaming service' in name]

categories = {
    'Mental Health': mental_health_features,
    'Music Habits': music_features,
    'Genre Preferences': genre_features,
    'Platform Usage': platform_features,
}

print("\n📊 FEATURE IMPORTANCE BY CATEGORY")
print("="*60)

# Mean importance of the features that belong to each category; categories with
# no feature in the trained model are skipped entirely.
known_features = set(feature_names)
category_importance = {}
for category, members in categories.items():
    category_features = [name for name in members if name in known_features]
    if not category_features:
        continue
    in_category = feature_importance_df['feature'].isin(category_features)
    avg_importance = feature_importance_df.loc[in_category, 'importance'].mean()
    category_importance[category] = avg_importance
    print(f"{category:20} {avg_importance:.4f} ({len(category_features)} features)")

# Visualize category importance, highest average first
if category_importance:
    cats = sorted(category_importance, key=category_importance.get, reverse=True)
    imps = [category_importance[cat] for cat in cats]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=imps, y=cats, palette='mako', ax=ax)
    ax.set_title('Average Feature Importance by Category', fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel('Average Importance', fontsize=12)
    ax.set_ylabel('Feature Category', fontsize=12)
    fig.tight_layout()
    plt.show()

## 💾 Save Model Artifacts

In [None]:
# Create models directory (no error if it already exists)
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

print("💾 SAVING MODEL ARTIFACTS")
print("="*60)

# Save trained model
model_path = os.path.join(models_dir, 'music_effect_model.pkl')
joblib.dump(rf_model, model_path)
print(f"✅ Model saved: {model_path}")

# Save label encoder (needed to map predicted codes back to class names)
encoder_path = os.path.join(models_dir, 'label_encoder.pkl')
joblib.dump(label_encoder, encoder_path)
print(f"✅ Encoder saved: {encoder_path}")

# Save feature names (the exact column order the model was trained on)
features_path = os.path.join(models_dir, 'feature_columns.pkl')
joblib.dump(feature_names, features_path)
print(f"✅ Features saved: {features_path}")

# Save feature importance as a plain CSV
importance_path = os.path.join(models_dir, 'feature_importance.csv')
feature_importance_df.to_csv(importance_path, index=False)
print(f"✅ Feature importance saved: {importance_path}")

# Create model summary.
# Every hyperparameter is read back from the fitted estimator (including the seed,
# which used to be hard-coded here) so the summary cannot drift from the model
# that was actually trained.
model_summary = {
    'model_type': 'RandomForestClassifier',
    'n_estimators': rf_model.n_estimators,
    'max_depth': rf_model.max_depth,
    'random_state': rf_model.random_state,
    'test_accuracy': float(test_accuracy),
    'train_accuracy': float(train_accuracy),
    'n_features': len(feature_names),
    'n_samples_train': len(X_train),
    'n_samples_test': len(X_test),
    'classes': list(label_encoder.classes_),
    'top_5_features': feature_importance_df.head(5)['feature'].tolist()
}

summary_path = os.path.join(models_dir, 'model_summary.pkl')
joblib.dump(model_summary, summary_path)
print(f"✅ Model summary saved: {summary_path}")

print("\n🎉 All artifacts saved successfully!")

## ✅ Test Model Loading

In [None]:
# Verify model can be loaded.
# NOTE: these are pickle-based files; only load artifacts from a trusted location.
print("🔄 Testing model loading...\n")

loaded_model = joblib.load(model_path)
loaded_encoder = joblib.load(encoder_path)
loaded_features = joblib.load(features_path)

# The persisted column list must describe the matrix the model was trained on;
# check it here instead of loading it and never looking at it.
assert list(loaded_features) == list(X_test.columns), (
    "Saved feature_columns.pkl does not match the columns of the test matrix"
)

# Make test prediction on the first held-out row.
# .iloc[:1] is an explicit positional row slice that keeps the 2-D frame shape.
test_prediction = loaded_model.predict(X_test.iloc[:1])
predicted_class = loaded_encoder.inverse_transform(test_prediction)[0]
actual_class = loaded_encoder.inverse_transform([y_test[0]])[0]

print(f"✅ Model loaded successfully!")
print(f"\n🧪 Test Prediction:")
print(f"   Predicted: {predicted_class}")
print(f"   Actual: {actual_class}")
print(f"   Match: {'✅ Yes' if predicted_class == actual_class else '❌ No'}")

## 📝 Model Summary

In [None]:
# Final recap of the run; reads only values computed in earlier cells
rule = "=" * 70
print("\n" + rule)
print("🌲 RANDOM FOREST MODEL - FINAL SUMMARY")
print(rule)

# Accuracy on both partitions and the gap between them, in percent
generalization_gap = train_accuracy - test_accuracy
print(f"\n📊 Performance Metrics:")
print(f"   • Training Accuracy:  {train_accuracy*100:.2f}%")
print(f"   • Testing Accuracy:   {test_accuracy*100:.2f}%")
print(f"   • Generalization Gap: {generalization_gap*100:.2f}%")

print(f"\n🔧 Model Configuration:")
for config_line in (
    "   • Algorithm:          Random Forest Classifier",
    f"   • Number of Trees:    {rf_model.n_estimators}",
    f"   • Max Tree Depth:     {rf_model.max_depth}",
    f"   • Total Features:     {len(feature_names)}",
    f"   • Training Samples:   {len(X_train)}",
    f"   • Test Samples:       {len(X_test)}",
):
    print(config_line)

print(f"\n🎯 Prediction Classes:")
for class_name in label_encoder.classes_:
    print(f"   • {class_name}")

# The importance table is already sorted in descending order, so head(3) is the top 3
print(f"\n🌟 Top 3 Important Features:")
top_3 = feature_importance_df.head(3)
for rank, (name, score) in enumerate(zip(top_3['feature'], top_3['importance']), start=1):
    print(f"   {rank}. {name} ({score:.4f})")

print(f"\n💾 Saved Artifacts:")
for artifact_name in (
    "music_effect_model.pkl",
    "label_encoder.pkl",
    "feature_columns.pkl",
    "feature_importance.csv",
    "model_summary.pkl",
):
    print(f"   • {artifact_name}")

print(f"\n✅ Model ready for deployment!")
print(rule)