# Early Academic Burnout Signal Detection
## Using Multi-Source Student Behaviour Data

**Project Overview:**
This project aims to detect early signals of academic burnout by analyzing multiple sources of student behavioral data, including:
- Academic performance (GPA, assignments)
- Attendance patterns
- Online learning activity (LMS)
- Library usage
- Social engagement
- Health & wellbeing metrics
- Help-seeking behavior

## 1. Import Libraries and Load Data

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, f1_score
)

# Feature importance
from sklearn.feature_selection import mutual_info_classif

# Warnings
import warnings
warnings.filterwarnings('ignore')  # silences every warning category for the rest of the session

# Plot defaults applied to all figures created after this point
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

In [None]:
# Read the student records into the notebook-wide DataFrame `df`
DATA_PATH = '../data/student_behavior_data.csv'
df = pd.read_csv(DATA_PATH)

# Report the size and column layout of what was loaded
loaded_shape = df.shape
loaded_columns = df.columns.tolist()
print(f"Dataset Shape: {loaded_shape}")
print(f"\nColumns: {loaded_columns}")
print("\nFirst 5 rows:")
df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Basic statistics: dataset size, burnout class balance, missing values, describe()
banner = "=" * 50

# Distinct students per burnout class, computed once and reused for the
# distribution printout and the rate below.
students_per_status = df.groupby('burnout_status')['student_id'].nunique()
total_students = df['student_id'].nunique()
# .get() instead of [1]: a dataset in which no student carries label 1 then
# reports a 0.0% rate instead of raising KeyError.
burnout_students = students_per_status.get(1, 0)

print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Total Students: {total_students}")
print(f"Total Weeks Tracked: {df['week'].nunique()}")
print(f"Total Records: {len(df)}")
print("\nBurnout Distribution:")
print(students_per_status)
print(f"\nBurnout Rate: {burnout_students / total_students * 100:.1f}%")

print("\n" + banner)
print("MISSING VALUES")
print(banner)
print(df.isnull().sum())

print("\n" + banner)
print("BASIC STATISTICS")
print(banner)
# Last expression of the cell so the notebook renders the summary table
df.describe()

In [None]:
# Burnout distribution: overall (left panel) and per major (right panel)
class_labels = ['No Burnout', 'Burnout']
class_colors = ['green', 'red']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
overall_ax, major_ax = axes

# Left panel: number of distinct students in each burnout class
burnout_counts = df.groupby('burnout_status')['student_id'].nunique()
overall_ax.bar(class_labels, burnout_counts.values, color=class_colors, alpha=0.7)
overall_ax.set_ylabel('Number of Students')
overall_ax.set_title('Student Burnout Distribution')
overall_ax.grid(axis='y', alpha=0.3)

# Right panel: the same count split by major, one bar per burnout class
students_by_major = df.groupby(['major', 'burnout_status'])['student_id'].nunique()
major_burnout = students_by_major.unstack(fill_value=0)
major_burnout.plot(kind='bar', ax=major_ax, color=class_colors, alpha=0.7)
major_ax.set_title('Burnout Distribution by Major')
major_ax.set_xlabel('Major')
major_ax.set_ylabel('Number of Students')
major_ax.legend(class_labels)
major_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../results/burnout_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Weekly trajectories of key metrics, averaged separately per burnout class
metrics = ['current_gpa', 'attendance_rate', 'lms_logins', 'stress_level', 'sleep_quality']
titles = ['GPA Over Time', 'Attendance Rate Over Time', 'LMS Logins Over Time',
          'Stress Level Over Time', 'Sleep Quality Over Time']

# Mean of every tracked metric for each (week, burnout_status) pair
weekly_groups = df.groupby(['week', 'burnout_status'])
temporal_data = weekly_groups.agg({metric: 'mean' for metric in metrics}).reset_index()

# (status value, legend label, line colour), drawn in this order on each panel
status_styles = ((0, 'No Burnout', 'green'), (1, 'Burnout', 'red'))

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# axes.flat walks the grid row by row, so the five metrics fill the first five panels
for ax, metric, title in zip(axes.flat, metrics, titles):
    for status, label, color in status_styles:
        status_rows = temporal_data[temporal_data['burnout_status'] == status]
        ax.plot(status_rows['week'], status_rows[metric],
                marker='o', label=label, color=color, alpha=0.7)

    ax.set_xlabel('Week')
    ax.set_ylabel(metric.replace('_', ' ').title())
    ax.set_title(title)
    ax.legend()
    ax.grid(alpha=0.3)

# Five metrics on a 2x3 grid leave the last panel unused
axes[1, 2].remove()

plt.tight_layout()
plt.savefig('../results/temporal_patterns.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pairwise correlation heatmap over the numeric columns
# These numeric columns are left out of the matrix
excluded_from_corr = {'student_id', 'week', 'year', 'burnout_severity'}

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_cols = [col for col in numeric_cols if col not in excluded_from_corr]
correlation_matrix = df[correlation_cols].corr()

plt.figure(figsize=(16, 14))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Heatmap of Student Behavioral Features', fontsize=16, pad=20)
plt.tight_layout()
plt.savefig('../results/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Feature Engineering

In [None]:
# Collapse the per-week records into one row per student
print("Creating student-level aggregated features...")

# Statistic sets shared by several columns
MEAN_STD_MIN = ['mean', 'std', 'min']
SUM_MEAN = ['sum', 'mean']
SUM_MEAN_STD = ['sum', 'mean', 'std']

# Column -> statistics. Key order determines the column order of the result.
aggregation_spec = {
    # Academic performance
    'current_gpa': MEAN_STD_MIN,
    'assignment_score': MEAN_STD_MIN,

    # Attendance
    'attendance_rate': MEAN_STD_MIN,
    'classes_missed': ['sum', 'mean', 'max'],

    # Assignment patterns
    'assignments_on_time': SUM_MEAN,
    'assignments_late': SUM_MEAN,
    'assignments_missing': SUM_MEAN,

    # LMS activity
    'lms_logins': SUM_MEAN_STD,
    'time_on_lms_hours': SUM_MEAN_STD,
    'video_completion_rate': MEAN_STD_MIN,
    'forum_posts': SUM_MEAN,
    'days_since_last_login': ['mean', 'max'],

    # Library
    'library_visits': SUM_MEAN,
    'library_study_hours': SUM_MEAN_STD,

    # Social
    'campus_activities': SUM_MEAN,
    'peer_interactions': SUM_MEAN_STD,

    # Health
    'sleep_quality': MEAN_STD_MIN,
    'sleep_hours': MEAN_STD_MIN,
    'stress_level': ['mean', 'max', 'std'],
    'exercise_frequency': SUM_MEAN,

    # Help-seeking
    'office_hours_visits': SUM_MEAN,
    'tutoring_sessions': SUM_MEAN,
    'counseling_visits': ['sum'],

    # Target and static attributes: the first value seen for each student
    'burnout_status': 'first',
    'year': 'first',
    'major': 'first',
}

student_features = df.groupby('student_id').agg(aggregation_spec).reset_index()

# Flatten the two-level (column, statistic) header into "<column>_<statistic>".
# The id column has an empty second level, so the trailing "_" is stripped.
student_features.columns = [
    f"{column}_{statistic}".strip('_')
    for column, statistic in student_features.columns
]
student_features.rename(columns={'student_id_': 'student_id'}, inplace=True)

n_rows, n_cols = student_features.shape
print(f"Student-level features created: {(n_rows, n_cols)}")
print(f"Number of features: {n_cols - 1}")

student_features.head()

In [None]:
# Add composite features built from the per-student aggregates
print("Creating derived features...")

sf = student_features  # short alias; assignments below modify student_features itself

# Gap between a student's average GPA and their lowest recorded GPA
sf['gpa_decline'] = sf['current_gpa_mean'] - sf['current_gpa_min']

# Share of assignments handed in (on time or late); the epsilon avoids division by zero
submitted_assignments = sf['assignments_on_time_sum'] + sf['assignments_late_sum']
total_assignments = submitted_assignments + sf['assignments_missing_sum']
sf['completion_ratio'] = submitted_assignments / (total_assignments + 1e-6)

# Engagement: unweighted average of LMS, library and campus activity means
engagement_total = (
    sf['lms_logins_mean']
    + sf['library_visits_mean']
    + sf['campus_activities_mean']
)
sf['engagement_score'] = engagement_total / 3

# Academic distress: number of the three threshold flags raised (0 to 3)
low_gpa_flag = (sf['current_gpa_mean'] < 3.0).astype(int)
many_missing_flag = (sf['assignments_missing_sum'] > 5).astype(int)
low_attendance_flag = (sf['attendance_rate_mean'] < 0.8).astype(int)
sf['academic_distress'] = low_gpa_flag + many_missing_flag + low_attendance_flag

# Wellbeing: sleep quality and exercise count for, stress counts against
wellbeing_total = (
    sf['sleep_quality_mean']
    - sf['stress_level_mean']
    + sf['exercise_frequency_mean']
)
sf['wellbeing_score'] = wellbeing_total / 3

# Total visits across all three support channels
sf['total_help_seeking'] = (
    sf['office_hours_visits_sum']
    + sf['tutoring_sessions_sum']
    + sf['counseling_visits_sum']
)

derived_feature_names = ['gpa_decline', 'completion_ratio', 'engagement_score',
                         'academic_distress', 'wellbeing_score', 'total_help_seeking']

print(f"Derived features added. New shape: {student_features.shape}")
print("\nNew derived features:")
print(derived_feature_names)

## 4. Feature Selection and Importance Analysis

In [None]:
# Build the model inputs: feature matrix X and target vector y
# The major is a string column, so it is mapped to integer codes first
le_major = LabelEncoder()
student_features['major_encoded'] = le_major.fit_transform(student_features['major_first'])

# Columns that must not be used as model inputs
non_feature_cols = {'student_id', 'burnout_status_first', 'major_first', 'year_first'}
feature_cols = [
    col for col in student_features.columns
    if col not in non_feature_cols
]

y = student_features['burnout_status_first']
X = student_features[feature_cols]

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"\nFeature columns: {len(feature_cols)}")

In [None]:
# Rank features by mutual information with the burnout label
raw_mi = mutual_info_classif(X, y, random_state=42)
mi_scores = pd.Series(raw_mi, index=X.columns).sort_values(ascending=False)

# Horizontal bar chart of the 20 highest-scoring features, best at the top
plt.figure(figsize=(12, 8))
top_twenty = mi_scores.iloc[:20]
top_twenty.plot(kind='barh', color='steelblue')
plt.xlabel('Mutual Information Score')
plt.title('Top 20 Most Important Features for Burnout Detection')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('../results/feature_importance_mi.png', dpi=300, bbox_inches='tight')
plt.show()

print("Top 10 most important features:")
print(mi_scores.iloc[:10])

## 5. Model Training and Evaluation

In [None]:
# Hold out 20% of the students for testing, keeping the class ratio in both parts
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Class counts of each partition, training first
for partition_name, partition_target in (("Training", y_train), ("Test", y_test)):
    print(f"\n{partition_name} set class distribution:")
    print(partition_target.value_counts())

In [None]:
# Standardise the features; statistics are learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")

In [None]:
# Train multiple models and collect their hold-out and cross-validated scores
from sklearn.pipeline import make_pipeline

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42)
}

# name -> fitted model, test-set predictions/probabilities and metrics
results = {}

print("Training models...\n")
print("=" * 80)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Predictions on the held-out test split (probability of class 1 for AUC)
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics
    auc_score = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    # Cross-validation on the UNSCALED training data with the scaler inside the
    # pipeline: the scaler is then refit on each fold's training part only.
    # Scoring X_train_scaled directly would let every validation fold influence
    # the scaling statistics (preprocessing leakage). cross_val_score clones the
    # pipeline, so the fitted `model` stored below is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')

    results[name] = {
        'model': model,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
        'auc': auc_score,
        'f1': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }

    print(f"  AUC-ROC: {auc_score:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print("-" * 80)

print("\nAll models trained successfully!")

In [None]:
# Tabulate every model's scores, best test AUC first
score_rows = [
    {
        'Model': model_name,
        'AUC-ROC': scores['auc'],
        'F1-Score': scores['f1'],
        'CV AUC (mean)': scores['cv_mean'],
        'CV AUC (std)': scores['cv_std'],
    }
    for model_name, scores in results.items()
]
comparison_df = pd.DataFrame(score_rows).sort_values('AUC-ROC', ascending=False)

print("\nModel Comparison:")
print(comparison_df)

# One bar chart per metric, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (metric column, bar colour, y-axis label, panel title, y-axis limits)
panel_specs = [
    ('AUC-ROC', 'steelblue', 'AUC-ROC Score',
     'Model Performance Comparison (AUC-ROC)', [0.7, 1.0]),
    ('F1-Score', 'coral', 'F1 Score',
     'Model Performance Comparison (F1-Score)', [0.5, 1.0]),
]

for panel, (metric_col, bar_color, y_label, panel_title, y_limits) in zip(axes, panel_specs):
    comparison_df.plot(x='Model', y=metric_col, kind='bar', ax=panel,
                       color=bar_color, legend=False)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_ylim(y_limits)
    panel.tick_params(axis='x', rotation=45)
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# comparison_df is sorted by test AUC (descending), so its first row is the winner
best_model_name = comparison_df['Model'].iloc[0]
best_entry = results[best_model_name]
best_model = best_entry['model']

print(f"\nBest Model: {best_model_name}")
print(f"AUC-ROC: {best_entry['auc']:.4f}")
print(f"F1-Score: {best_entry['f1']:.4f}")
print("\nClassification Report:")
best_report = classification_report(
    y_test,
    best_entry['y_pred'],
    target_names=['No Burnout', 'Burnout'],
)
print(best_report)

In [None]:
# Confusion matrix of the best model on the test split
class_names = ['No Burnout', 'Burnout']
best_predictions = results[best_model_name]['y_pred']
cm = confusion_matrix(y_test, best_predictions)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title(f'Confusion Matrix - {best_model_name}')
plt.tight_layout()
plt.savefig('../results/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Flattening the 2x2 matrix row by row gives TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix Breakdown:")
for cell_name, cell_count in (
    ("True Negatives", tn),
    ("False Positives", fp),
    ("False Negatives", fn),
    ("True Positives", tp),
):
    print(f"{cell_name}: {cell_count}")

# Rates derived from the four cells
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
positive_precision = tp / (tp + fp)
print(f"\nSensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {positive_precision:.3f}")

In [None]:
# Overlay the test-set ROC curve of every trained model
plt.figure(figsize=(10, 8))

for model_name, scores in results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores['y_pred_proba'])
    curve_label = f"{model_name} (AUC = {scores['auc']:.3f})"
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label, linewidth=2)

# Diagonal reference line for a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random (AUC = 0.500)', linewidth=1)

plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - All Models', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('../results/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Precision/recall trade-off of the best model across decision thresholds
best_probabilities = results[best_model_name]['y_pred_proba']
precision, recall, thresholds = precision_recall_curve(y_test, best_probabilities)

plt.figure(figsize=(10, 6))
plt.plot(recall, precision, linewidth=2, color='steelblue')
plt.xlabel('Recall', fontsize=12)
plt.ylabel('Precision', fontsize=12)
plt.title(f'Precision-Recall Curve - {best_model_name}', fontsize=14)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('../results/precision_recall_curve.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Feature Importance from Best Model

In [None]:
# Feature importance of the best model (only estimators exposing
# `feature_importances_`, e.g. the tree ensembles, support this)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # DataFrame.plot opens its own figure unless it is handed an Axes, so a
    # preceding bare plt.figure(...) would be left behind as an empty figure
    # and its figsize would never reach the chart. Create the Axes explicitly
    # and pass it in so the 12x8 figure is the one drawn on and saved.
    fig, ax = plt.subplots(figsize=(12, 8))
    feature_importance.head(20).plot(
        x='feature', y='importance', kind='barh', color='forestgreen', ax=ax
    )
    ax.set_xlabel('Feature Importance')
    ax.set_title(f'Top 20 Feature Importances - {best_model_name}')
    ax.invert_yaxis()  # most important feature at the top
    fig.tight_layout()
    fig.savefig('../results/feature_importance_model.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
else:
    print(f"Feature importance not available for {best_model_name}")

## 7. Save the Best Model

In [None]:
import joblib

# Persist everything needed to score new students later: the winning model,
# the fitted scaler and the major encoder, each with its target path.
artifacts = [
    (best_model, '../models/best_burnout_model.pkl'),
    (scaler, '../models/scaler.pkl'),
    (le_major, '../models/label_encoder_major.pkl'),
]
for artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)

# Feature names, one per line, in the column order the model was trained on
feature_name_lines = '\n'.join(X.columns.tolist())
with open('../models/feature_names.txt', 'w') as f:
    f.write(feature_name_lines)

print(f"\nBest model ({best_model_name}) saved successfully!")
print("Saved files:")
for saved_file in ('best_burnout_model.pkl', 'scaler.pkl',
                   'label_encoder_major.pkl', 'feature_names.txt'):
    print(f"  - {saved_file}")

## 8. Key Insights and Recommendations

In [None]:
# Final summary: model quality, strongest signals and fixed recommendations
rule = "="*80
best_auc = results[best_model_name]['auc']

# Verbal rating of the best test AUC
if best_auc > 0.9:
    performance_label = 'excellent'
elif best_auc > 0.8:
    performance_label = 'good'
else:
    performance_label = 'moderate'

print(rule)
print("KEY INSIGHTS FROM BURNOUT DETECTION ANALYSIS")
print(rule)

print("\n1. MODEL PERFORMANCE:")
print(f"   - Best Model: {best_model_name}")
print(f"   - AUC-ROC Score: {best_auc:.4f}")
print(f"   - This indicates {performance_label} predictive performance")

print("\n2. MOST PREDICTIVE SIGNALS (Top 5):")
# Use the model's own importances when it has them, else the mutual-information ranking
if hasattr(best_model, 'feature_importances_'):
    top_rows = feature_importance.head(5)
    top_signals = list(zip(top_rows['feature'], top_rows['importance']))
else:
    top_signals = list(mi_scores.head(5).items())
for signal_name, signal_score in top_signals:
    print(f"   - {signal_name}: {signal_score:.4f}")

# Static text, printed one line at a time
closing_lines = (
    "\n3. EARLY WARNING INDICATORS:",
    "   - Declining GPA trends",
    "   - Reduced LMS engagement",
    "   - Increasing stress levels",
    "   - Deteriorating sleep quality",
    "   - Missing assignments",
    "\n4. RECOMMENDATIONS FOR INTERVENTION:",
    "   a) Monitor students with:",
    "      - GPA decline > 0.3 points",
    "      - Attendance rate < 80%",
    "      - 3+ missing assignments",
    "      - Stress level > 7/10",
    "\n   b) Proactive support:",
    "      - Early outreach to at-risk students",
    "      - Counseling services promotion",
    "      - Academic support programs",
    "      - Peer mentorship programs",
    "\n   c) System-level changes:",
    "      - Implement early warning dashboard",
    "      - Regular check-ins for flagged students",
    "      - Workload assessment and adjustment",
)
for text_line in closing_lines:
    print(text_line)

print("\n" + rule)

## Conclusion

This project successfully demonstrates the use of multi-source behavioral data to detect early signals of academic burnout. The model can help institutions:

1. **Identify at-risk students early** - before burnout becomes severe
2. **Understand key risk factors** - through feature importance analysis
3. **Implement targeted interventions** - based on data-driven insights
4. **Monitor effectiveness** - through continuous model evaluation

The comprehensive data sources (academic, behavioral, social, health) provide a holistic view of student wellbeing, enabling more accurate and timely interventions.