In [1]:
# Risk_Predictor2.ipynb - Complete Model Training and Saving

import os
import pickle
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation or numerical warnings raised by later cells will
# not be shown. Consider narrowing it to the specific category being hidden.
warnings.filterwarnings('ignore')


In [2]:
# Notebook title banner: a 60-character rule above and below the heading.
banner_rule = "=" * 60
print(banner_rule)
print("COMPREHENSIVE STUDENT RISK PREDICTION MODEL")
print(banner_rule)

COMPREHENSIVE STUDENT RISK PREDICTION MODEL


In [3]:
# Make sure the folder that receives the saved artifacts exists.
# exist_ok=True turns a re-run of this cell into a no-op.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

In [5]:
# 1. Load and explore data
print("\n1. LOADING AND EXPLORING DATA")
print("-" * 40)

# Read the CSV and keep only rows that have no missing value in any column.
df = pd.read_csv('../data/StudentPerformanceFactors.csv').dropna()

print(f"Dataset shape: {df.shape}")
print(f"Features: {list(df.columns)}")

# Numeric columns summarised with describe() below.
numeric_features = [
    'Hours_Studied',
    'Attendance',
    'Sleep_Hours',
    'Previous_Scores',
    'Tutoring_Sessions',
    'Physical_Activity',
    'Exam_Score',
]
print("\nBasic statistics for key numeric features:")
print(df.loc[:, numeric_features].describe())



1. LOADING AND EXPLORING DATA
----------------------------------------
Dataset shape: (6378, 21)
Features: ['Student_ID', 'Hours_Studied', 'Attendance', 'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours', 'Previous_Scores', 'Motivation_Level', 'Internet_Access', 'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender', 'Exam_Score']

Basic statistics for key numeric features:
       Hours_Studied   Attendance  Sleep_Hours  Previous_Scores  \
count    6378.000000  6378.000000  6378.000000      6378.000000   
mean       19.977109    80.020853     7.034964        75.066165   
std         5.985460    11.550723     1.468033        14.400389   
min         1.000000    60.000000     4.000000        50.000000   
25%        16.000000    70.000000     6.000000        63.000000   
50%        20.000000    80.000000     7.

In [6]:
def create_comprehensive_risk_label(df):
    """
    Derive a binary at-risk label from a weighted blend of four risk groups.

    Every group score is a weighted sum of 0/1 flags. The groups are then
    combined as 40% academic, 25% engagement, 20% support system and 15%
    personal factors. A student is labelled 1 when their composite score is
    at or above the 70th percentile of all composite scores in ``df``.

    Args:
        df: DataFrame providing Exam_Score, Previous_Scores, Attendance,
            Hours_Studied, Sleep_Hours (compared numerically) and
            Parental_Involvement, Access_to_Resources, Motivation_Level,
            Peer_Influence, Learning_Disabilities (compared as lower-cased
            strings).

    Returns:
        Tuple ``(at_risk, composite_risk_score, risk_threshold)``: the 0/1
        label Series, the per-row composite score Series, and the score
        cut-off used for the label.
    """

    def below_quantile(column, q):
        # 0/1 flag: value is strictly below the column's own q-quantile.
        values = df[column]
        return (values < values.quantile(q)).astype(int)

    def has_level(column, level):
        # 0/1 flag: categorical value equals `level`, ignoring case.
        return (df[column].str.lower() == level).astype(int)

    # 1. Academic Performance Risk (40% weight): bottom quartile of scores.
    academic = (below_quantile('Exam_Score', 0.25) * 0.6
                + below_quantile('Previous_Scores', 0.25) * 0.4)

    # 2. Engagement Risk (25% weight): bottom 30% of attendance / study hours.
    engagement = (below_quantile('Attendance', 0.3) * 0.6
                  + below_quantile('Hours_Studied', 0.3) * 0.4)

    # 3. Support System Risk (20% weight)
    support = (has_level('Parental_Involvement', 'low') * 0.6
               + has_level('Access_to_Resources', 'low') * 0.4)

    # 4. Personal Factors Risk (15% weight)
    # Sleep counts as poor when it is under 6 or over 9 hours.
    sleep_hours = df['Sleep_Hours']
    poor_sleep = ((sleep_hours < 6) | (sleep_hours > 9)).astype(int)
    personal = (has_level('Motivation_Level', 'low') * 0.4
                + poor_sleep * 0.2
                + has_level('Peer_Influence', 'negative') * 0.2
                + has_level('Learning_Disabilities', 'yes') * 0.2)

    # Weighted blend of the four group scores.
    composite = (academic * 0.40
                 + engagement * 0.25
                 + support * 0.20
                 + personal * 0.15)

    # Rows at or above the 70th percentile of the composite are "at risk".
    cutoff = np.percentile(composite, 70)
    labels = (composite >= cutoff).astype(int)

    return labels, composite, cutoff

In [7]:
# 2. Create risk labels
# One call yields the 0/1 label, the underlying composite score and the cut-off.
risk_labels, risk_scores, risk_threshold = create_comprehensive_risk_label(df)
df['AtRisk'] = risk_labels
df['RiskScore'] = risk_scores

print("Risk Distribution:")
print(df['AtRisk'].value_counts())
at_risk_share = df['AtRisk'].mean()
print(f"Percentage at risk: {at_risk_share*100:.1f}%")
print(f"Risk threshold: {risk_threshold:.3f}")

Risk Distribution:
AtRisk
0    4441
1    1937
Name: count, dtype: int64
Percentage at risk: 30.4%
Risk threshold: 0.290


In [8]:
# 3. Prepare features for modeling
print("\n3. PREPARING FEATURES FOR MODELING")
print("-" * 40)

# Every column is a feature except the identifier and the two derived label columns.
# NOTE(review): AtRisk is computed by create_comprehensive_risk_label from
# Exam_Score, Previous_Scores, Attendance, Hours_Studied and several of the
# categorical columns, all of which remain in the feature set here. The model
# can therefore recover the labelling rule from its own inputs, so reported
# scores are likely optimistic. Confirm this is intended before changing the
# column list; predict_student_risk rebuilds the same list and must stay in sync.
excluded_cols = ['Student_ID', 'AtRisk', 'RiskScore']
feature_cols = [name for name in df.columns if name not in excluded_cols]
X = df.loc[:, feature_cols].copy()
y = df['AtRisk']

print(f"Features used in modeling ({len(feature_cols)}):")
for position, name in enumerate(feature_cols, start=1):
    print(f"  {position:2d}. {name}")


3. PREPARING FEATURES FOR MODELING
----------------------------------------
Features used in modeling (20):
   1. Hours_Studied
   2. Attendance
   3. Parental_Involvement
   4. Access_to_Resources
   5. Extracurricular_Activities
   6. Sleep_Hours
   7. Previous_Scores
   8. Motivation_Level
   9. Internet_Access
  10. Tutoring_Sessions
  11. Family_Income
  12. Teacher_Quality
  13. School_Type
  14. Peer_Influence
  15. Physical_Activity
  16. Learning_Disabilities
  17. Parental_Education_Level
  18. Distance_from_Home
  19. Gender
  20. Exam_Score


In [9]:
# 4. Encode categorical variables
print("\n4. ENCODING CATEGORICAL VARIABLES")
print("-" * 40)

# Object-dtype columns are treated as categorical. Each gets its own fitted
# LabelEncoder, kept in le_dict so the same mapping can be reused at
# prediction time.
cat_cols = list(X.select_dtypes(include=['object']).columns)
X_encoded = X.copy()
le_dict = {}

print(f"Categorical features to encode: {len(cat_cols)}")

for cat_name in cat_cols:
    encoder = LabelEncoder()
    encoder.fit(X[cat_name])
    # Integer codes follow the order of encoder.classes_ (printed below),
    # not any High/Medium/Low ranking.
    X_encoded[cat_name] = encoder.transform(X[cat_name])
    le_dict[cat_name] = encoder
    print(f"  {cat_name}: {len(encoder.classes_)} categories -> {encoder.classes_}")


4. ENCODING CATEGORICAL VARIABLES
----------------------------------------
Categorical features to encode: 13
  Parental_Involvement: 3 categories -> ['High' 'Low' 'Medium']
  Access_to_Resources: 3 categories -> ['High' 'Low' 'Medium']
  Extracurricular_Activities: 2 categories -> ['No' 'Yes']
  Motivation_Level: 3 categories -> ['High' 'Low' 'Medium']
  Internet_Access: 2 categories -> ['No' 'Yes']
  Family_Income: 3 categories -> ['High' 'Low' 'Medium']
  Teacher_Quality: 3 categories -> ['High' 'Low' 'Medium']
  School_Type: 2 categories -> ['Private' 'Public']
  Peer_Influence: 3 categories -> ['Negative' 'Neutral' 'Positive']
  Learning_Disabilities: 2 categories -> ['No' 'Yes']
  Parental_Education_Level: 3 categories -> ['College' 'High School' 'Postgraduate']
  Distance_from_Home: 3 categories -> ['Far' 'Moderate' 'Near']
  Gender: 2 categories -> ['Female' 'Male']


In [10]:
# 5. Feature scaling
print("\n5. FEATURE SCALING")
print("-" * 40)

# NOTE(review): the scaler is fit on every row of X_encoded, i.e. before any
# train/test split has been made.
scaler = StandardScaler()
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

# Column order of the scaled matrix, kept for importance tables and prediction.
feature_names = list(X_encoded.columns)

print(f"Features scaled: {X_scaled.shape}")


5. FEATURE SCALING
----------------------------------------
Features scaled: (6378, 20)


In [11]:
# 6. Train/test split
print("\n6. CREATING TRAIN/TEST SPLIT")
print("-" * 40)

# Split the encoded but unscaled features, so the held-out rows are not seen
# by any fitted preprocessing step. The partition depends only on the row
# count, the stratification labels and random_state, so it selects the same
# rows as splitting an already-scaled matrix with these arguments.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both parts.
# Fitting on the full dataset would let test-set means and variances leak into
# preprocessing. This `scaler` is the one later saved and used for prediction.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set: {X_train.shape[0]} samples ({y_train.mean()*100:.1f}% at risk)")
print(f"Test set: {X_test.shape[0]} samples ({y_test.mean()*100:.1f}% at risk)")



6. CREATING TRAIN/TEST SPLIT
----------------------------------------
Training set: 5102 samples (30.4% at risk)
Test set: 1276 samples (30.4% at risk)


In [12]:
# 7. Model training with hyperparameter tuning
print("\n7. TRAINING MODEL WITH HYPERPARAMETER TUNING")
print("-" * 40)

# Search space: 2 * 4 * 2 * 2 * 2 = 64 candidates, each scored with 5-fold CV.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 15, 20, None],
    min_samples_split=[5, 10],
    min_samples_leaf=[2, 4],
    max_features=['sqrt', 'log2'],
)

print("Starting hyperparameter tuning...")
# class_weight='balanced' reweights the classes; F1 is the selection metric.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_:.3f}")

# Keep the winning estimator for evaluation, saving and prediction.
best_model = grid_search.best_estimator_


7. TRAINING MODEL WITH HYPERPARAMETER TUNING
----------------------------------------
Starting hyperparameter tuning...
Fitting 5 folds for each of 64 candidates, totalling 320 fits

Best parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation F1 score: 0.931


In [None]:
# 8. Model evaluation
print("\n8. MODEL EVALUATION")
print("-" * 40)

# Cross-validation for model stability.
# NOTE(review): this re-scores the configuration that was already selected on
# these same training rows, so it is an optimistic estimate compared with the
# held-out test set below.
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1')
print(f"Cross-validation F1 scores: {[f'{score:.3f}' for score in cv_scores]}")
print(f"Mean CV F1 score: {cv_scores.mean():.3f} (±{cv_scores.std():.3f})")

# Test set evaluation
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # probability of class 1 (at risk)

test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)  # same metric as the CV scores above
test_auc = roc_auc_score(y_test, y_pred_proba)

print("\nTest Set Results:")
print(f"  Accuracy: {test_accuracy:.3f}")
print(f"  F1 (at-risk class): {test_f1:.3f}")
print(f"  AUC-ROC: {test_auc:.3f}")
# The gap compares like with like: CV F1 against test F1. Subtracting test
# accuracy from CV F1, as before, mixes two different metrics.
print(f"  Generalization Gap (CV F1 vs test F1): {abs(cv_scores.mean() - test_f1):.3f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


In [None]:
# 9. Feature importance analysis
print("\n9. FEATURE IMPORTANCE ANALYSIS")
print("-" * 40)

# Pair each feature name with the forest's importance value, largest first.
importance_table = pd.DataFrame(
    {'feature': feature_names, 'importance': best_model.feature_importances_}
)
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
for entry in feature_importance.head(15).itertuples(index=False):
    print(f"  {entry.feature:25} {entry.importance:.4f}")

# 10. Save model and preprocessing components

In [None]:
# 10. Save model and preprocessing components
print("\n10. SAVING MODEL AND PREPROCESSING COMPONENTS")
print("-" * 40)

# Estimator and scaler are written with joblib.
model_path = '../models/student_risk_model.pkl'
joblib.dump(best_model, model_path)
print(f"✅ Model saved to: {model_path}")

scaler_path = '../models/feature_scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"✅ Scaler saved to: {scaler_path}")

# The encoder dictionary and the metadata are written with plain pickle, so
# whatever loads them must use the matching loader for each file.
le_path = '../models/label_encoders.pkl'
with open(le_path, 'wb') as encoders_file:
    pickle.dump(le_dict, encoders_file)
print(f"✅ Label encoders saved to: {le_path}")

# Metadata: feature lists, the label cut-off, headline metrics and importances.
performance_summary = {
    'cv_f1_mean': cv_scores.mean(),
    'cv_f1_std': cv_scores.std(),
    'test_accuracy': test_accuracy,
    'test_auc': test_auc,
}
metadata = {
    'feature_names': feature_names,
    'feature_cols': feature_cols,
    'cat_cols': cat_cols,
    'risk_threshold': risk_threshold,
    'model_performance': performance_summary,
    'feature_importance': feature_importance.to_dict('records'),
}

metadata_path = '../models/model_metadata.pkl'
with open(metadata_path, 'wb') as metadata_file:
    pickle.dump(metadata, metadata_file)
print(f"✅ Metadata saved to: {metadata_path}")

# Save up to 100 rows of the working dataframe for testing. By this point it
# also carries the AtRisk and RiskScore columns.
sample_size = min(100, len(df))
sample_df = df.sample(n=sample_size, random_state=42)
sample_path = '../models/sample_data.csv'
sample_df.to_csv(sample_path, index=False)
print(f"✅ Sample data saved to: {sample_path}")


In [None]:
# 11. Create prediction function for deployment
print("\n11. CREATING PREDICTION FUNCTION")
print("-" * 40)

def predict_student_risk(student_id, df, model, scaler, le_dict, feature_names, detailed=True):
    """
    Predict risk for a single student looked up by Student_ID.

    The feature row is built from exactly the columns in ``feature_names``, in
    that order, so extra or reordered columns in ``df`` cannot shift the
    inputs the scaler and model receive.

    Args:
        student_id: Student ID to look up in ``df['Student_ID']``.
        df: DataFrame holding the student rows (unencoded, unscaled).
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``feature_importances_``.
        scaler: Fitted scaler exposing ``transform``.
        le_dict: Mapping of column name -> fitted encoder exposing ``transform``.
        feature_names: Ordered list of the feature columns the model was
            trained on.
        detailed: If True, print a human-readable assessment.

    Returns:
        ``(prediction, risk_probability, student_info)`` where
        ``risk_probability`` is the predicted probability of class 1 and
        ``student_info`` is the matched row as a dict. Returns
        ``(None, None, None)`` when the ID is not present.

    Raises:
        ValueError: if ``df`` lacks a column listed in ``feature_names``.
            scikit-learn's ``LabelEncoder.transform`` also raises ValueError
            for a category it did not see during fitting.
    """

    matches = df[df['Student_ID'] == student_id]
    if matches.empty:
        print(f"Student_ID {student_id} not found.")
        return None, None, None

    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        raise ValueError(f"df is missing feature columns required by the model: {missing}")

    # If several rows share the ID, only the first one is assessed.
    student_row = matches.iloc[[0]]
    student_info = student_row.iloc[0].to_dict()

    # Select features by the training-time names and order, rather than by
    # re-deriving the list from whatever columns df currently has.
    features_encoded = student_row[list(feature_names)].copy()

    # Encode categorical features with the encoders fitted during training.
    for col, encoder in le_dict.items():
        if col in features_encoded.columns:
            features_encoded[col] = encoder.transform(features_encoded[col])

    # Scale features
    features_scaled = scaler.transform(features_encoded)

    # Make prediction
    pred = model.predict(features_scaled)[0]
    prob = model.predict_proba(features_scaled)[0]

    if detailed:
        print(f"\n" + "="*40)
        print(f"RISK ASSESSMENT FOR STUDENT ID: {student_id}")
        print("="*40)

        print(f"📊 PREDICTION: {'🚨 AT RISK' if pred == 1 else '✅ NOT AT RISK'}")
        print(f"📈 Risk Probability: {prob[1]:.1%}")
        print(f"📈 Safe Probability: {prob[0]:.1%}")

        print(f"\n📋 STUDENT PROFILE:")
        print(f"   Exam Score: {student_info['Exam_Score']}")
        print(f"   Attendance: {student_info['Attendance']}%")
        print(f"   Hours Studied: {student_info['Hours_Studied']}")
        print(f"   Previous Scores: {student_info['Previous_Scores']}")
        print(f"   Motivation Level: {student_info['Motivation_Level']}")
        print(f"   Parental Involvement: {student_info['Parental_Involvement']}")

        # Heuristic ranking: global feature importance times this student's
        # scaled value. It is not a per-prediction attribution.
        feature_contributions = model.feature_importances_ * features_scaled[0]
        top_factors = pd.DataFrame({
            'factor': feature_names,
            'contribution': np.abs(feature_contributions)
        }).sort_values('contribution', ascending=False).head(5)

        print(f"\n🔍 TOP 5 INFLUENTIAL FACTORS FOR THIS PREDICTION:")
        # A distinct loop name avoids shadowing the matched student row.
        for _, factor_row in top_factors.iterrows():
            print(f"   {factor_row['factor']:25} {factor_row['contribution']:.4f}")

    return pred, prob[1], student_info



In [None]:
# 12. Test the prediction function
print("\n12. TESTING PREDICTION FUNCTION")
print("-" * 40)

# Three student IDs drawn reproducibly from the dataset.
test_students = df['Student_ID'].sample(n=3, random_state=42).tolist()
print(f"Testing on students: {test_students}")

for sampled_id in test_students:
    predict_student_risk(
        sampled_id,
        df,
        best_model,
        scaler,
        le_dict,
        feature_names,
        detailed=True,
    )
    print()

In [None]:
# 13. Final summary
summary_rule = "=" * 60
print("\n" + summary_rule)
print("MODEL TRAINING COMPLETED SUCCESSFULLY!")
print(summary_rule)

# Headline numbers from the variables produced by the cells above.
at_risk_total = df['AtRisk'].sum()
print(f"✅ Model trained and saved with {test_accuracy:.1%} accuracy")
print(f"✅ {len(feature_cols)} features used for prediction")
print(f"✅ {at_risk_total} out of {len(df)} students identified as at-risk")
print("✅ All model components saved to '../models/' directory")

# Artifacts written by the saving cell.
print("\nModel Files Created:")
for artifact_line in (
    "  - student_risk_model.pkl (main model)",
    "  - feature_scaler.pkl (preprocessing)",
    "  - label_encoders.pkl (categorical encoding)",
    "  - model_metadata.pkl (feature names & performance)",
    "  - sample_data.csv (test data)",
):
    print(artifact_line)

print("\n🎉 Ready for Streamlit deployment!")