In [2]:
"""
HIJAB JOURNEY - CLEAN ML PIPELINE
==================================
Step-by-Step: Data Generation → Cleaning → Training → Testing → Evaluation
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

print("="*70)
print("HIJAB JOURNEY ML PIPELINE")
print("="*70)

# ============================================================
# STEP 1: GENERATE REALISTIC DATA
# ============================================================
# Builds a synthetic lesson-level table (one row per student per lesson)
# and stores it in `df_raw`.

print("\n" + "="*70)
print("STEP 1: GENERATING DATA")
print("="*70)

import random

# Two generators are in play: the stdlib `random` module picks the
# performance tier and the number of lessons, NumPy's global RNG draws
# everything else. Both are seeded so the run is reproducible.
np.random.seed(42)
random.seed(42)

# Platform structure
LEVELS = {
    1: {'topic': 'Introduction to Hijab', 'lessons': 5},
    2: {'topic': 'Why We Wear Hijab', 'lessons': 4},
    3: {'topic': 'Proper Hijab Requirements', 'lessons': 5},
    4: {'topic': 'Hijab in Daily Life', 'lessons': 3},
    5: {'topic': 'Being Confident in Hijab', 'lessons': 4},
    6: {'topic': 'Role Model & Inspiration', 'lessons': 4}
}

# Students per class. The roster size is derived from this table so the
# two values can never drift apart.
CLASS_SIZES = {'Class A': 10, 'Class B': 12, 'Class C': 8}
CLASSES = list(CLASS_SIZES)
NUM_STUDENTS = sum(CLASS_SIZES.values())

# Sampling ranges (low, high) for each student's baseline, per tier.
PERFORMANCE_PROFILES = {
    'high':   {'watch': (80, 95), 'quiz': (70, 90), 'inactive': (1, 5)},
    'medium': {'watch': (60, 80), 'quiz': (50, 70), 'inactive': (4, 10)},
    'low':    {'watch': (30, 60), 'quiz': (25, 50), 'inactive': (8, 20)},
}
# Repeated entries weight the tier draw 2 : 3 : 2 (high : medium : low).
PERFORMANCE_POOL = ['high', 'high', 'medium', 'medium', 'medium', 'low', 'low']

data = []
student_id = 1

for class_name, students_in_class in CLASS_SIZES.items():
    for _ in range(students_in_class):
        # Random performance level and the baselines that go with it.
        # NOTE: the order of the random draws below is significant; it
        # determines the generated dataset for a given seed.
        performance_type = random.choice(PERFORMANCE_POOL)
        profile = PERFORMANCE_PROFILES[performance_type]
        base_watch = np.random.uniform(*profile['watch'])
        base_quiz = np.random.uniform(*profile['quiz'])
        base_inactive = np.random.uniform(*profile['inactive'])

        # Current level (weighted toward early levels)
        current_level = np.random.choice([1, 2, 3], p=[0.5, 0.35, 0.15])

        # Number of lessons completed in current level (at least 2,
        # never more than the level offers, capped at 5)
        max_lessons = LEVELS[current_level]['lessons']
        num_completed = random.randint(2, min(5, max_lessons))

        # Generate lesson data
        for lesson_num in range(num_completed):
            # Per-lesson noise around the student's baseline
            watch_pct = np.clip(base_watch + np.random.uniform(-15, 15), 0, 100)
            quiz_score = np.clip(base_quiz + np.random.uniform(-15, 15), 0, 100)
            days_inactive = max(0, int(base_inactive + np.random.uniform(-3, 3)))

            # A video counts as completed once 80% of it was watched
            video_completed = 1 if watch_pct >= 80 else 0

            data.append({
                'student_id': student_id,
                'class_name': class_name,
                'current_level': current_level,
                'lesson_id': lesson_num + 1,
                'watched_percentage': round(watch_pct, 2),
                'video_completed': video_completed,
                'quiz_score': round(quiz_score, 2),
                'last_activity_days': days_inactive
            })

        student_id += 1

df_raw = pd.DataFrame(data)

# Report the number of students that actually have records instead of a
# constant that merely happens to match.
students_generated = len({record['student_id'] for record in data})
print(f"‚úÖ Generated {len(df_raw)} raw records from {students_generated} students")

# ============================================================
# STEP 2: CLEAN DATA
# ============================================================
# Reports data-quality counts for `df_raw`, then builds `df_clean` from
# the rows that pass the range checks, with exact duplicates removed.

print("\n" + "="*70)
print("STEP 2: CLEANING DATA")
print("="*70)

# Check for missing values
missing = df_raw.isnull().sum().sum()
print(f"Missing values: {missing}")

# Check for duplicates
duplicates = df_raw.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Range checks. Comparisons against NaN evaluate to False, so rows with a
# missing value in any of these three columns are dropped as well.
valid_rows = (
    df_raw['watched_percentage'].between(0, 100)
    & df_raw['quiz_score'].between(0, 100)
    & (df_raw['last_activity_days'] >= 0)
)

# Remove invalid rows AND the exact duplicates counted above; previously
# duplicates were only reported and would have skewed per-student means.
df_clean = df_raw[valid_rows].drop_duplicates().copy()

print(f"Records after cleaning: {len(df_clean)}")

# ============================================================
# STEP 3: FEATURE ENGINEERING (AGGREGATE BY STUDENT)
# ============================================================

print("\n" + "="*70)
print("STEP 3: FEATURE ENGINEERING")
print("="*70)

# Collapse the lesson-level rows into one row per student. Each keyword
# names the output column and gives (source column, aggregation).
student_features = (
    df_clean
    .groupby('student_id')
    .agg(
        avg_watch_pct=('watched_percentage', 'mean'),
        completion_rate=('video_completed', 'mean'),
        avg_quiz_score=('quiz_score', 'mean'),
        days_inactive=('last_activity_days', 'max'),
        lessons_completed=('lesson_id', 'count'),
        current_level=('current_level', 'first'),
        class_name=('class_name', 'first'),
    )
    .reset_index()
)

# Risk label (TARGET VARIABLE)
# A weighted composite of the student's features, jittered so that the
# classes overlap instead of being perfectly separable.
def calculate_risk(row):
    """Return the risk class for one aggregated student row.

    Reads ``avg_watch_pct``, ``avg_quiz_score``, ``completion_rate`` and
    ``days_inactive`` from *row* (any mapping-like object, e.g. a pandas
    Series). Draws one value from NumPy's global RNG per call.

    Returns:
        0 (Will Pass), 1 (May Struggle) or 2 (Needs Help).
    """
    watch_part = (row['avg_watch_pct'] / 100) * 0.35
    quiz_part = (row['avg_quiz_score'] / 100) * 0.35
    completion_part = (row['completion_rate']) * 0.15
    # Recency credit falls linearly to zero at 20 inactive days.
    recency_part = (max(0, 20 - row['days_inactive']) / 20) * 0.15

    # Composite score on a 0-1 scale
    composite = watch_part + quiz_part + completion_part + recency_part

    # Small uniform jitter, then clamp back into [0, 1]
    composite += np.random.uniform(-0.05, 0.05)
    composite = np.clip(composite, 0, 1)

    if composite < 0.40:
        return 2  # Needs Help
    if composite < 0.60:
        return 1  # May Struggle
    return 0  # Will Pass

# Attach the target label to every student row.
student_features['risk_level'] = student_features.apply(calculate_risk, axis=1)

print(f"Total students: {len(student_features)}")
print(f"\nRisk Distribution:")
labels = {0: 'Will Pass', 1: 'May Struggle', 2: 'Needs Help'}
for risk in labels:
    in_class = student_features['risk_level'] == risk
    count = in_class.sum()
    pct = (count / len(student_features)) * 100
    print(f"   {labels[risk]:15} : {count:2} students ({pct:.1f}%)")

# Persist both the lesson-level and the student-level tables.
for frame, csv_path in (
    (df_clean, 'hijab_journey_clean_data.csv'),
    (student_features, 'student_features.csv'),
):
    frame.to_csv(csv_path, index=False)
print(f"\nüíæ Saved cleaned data")

# ============================================================
# STEP 4: PREPARE TRAIN/TEST SPLIT
# ============================================================

print("\n" + "="*70)
print("STEP 4: TRAIN/TEST SPLIT")
print("="*70)

# Model inputs (X) and target (y)
X = student_features[[
    'avg_watch_pct',
    'completion_rate',
    'avg_quiz_score',
    'days_inactive',
    'lessons_completed'
]]

y = student_features['risk_level']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Stratification is only attempted when the rarest class has at least
# two members; otherwise fall back to a plain random split.
min_class_count = y.value_counts().min()
can_stratify = min_class_count >= 2

split_options = {'test_size': 0.30, 'random_state': 42}
if not can_stratify:
    print(f"\n‚ö†Ô∏è Using random split (min class: {min_class_count})")
else:
    print(f"\n‚úÖ Using stratified split (min class: {min_class_count})")
    split_options['stratify'] = y

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set: {len(X_train)} students")
print(f"Testing set: {len(X_test)} students")

# Per-class head counts on each side of the split
print(f"\nTraining set risk distribution:")
for risk in [0, 1, 2]:
    count = (y_train == risk).sum()
    print(f"   Risk {risk}: {count}")

print(f"\nTesting set risk distribution:")
for risk in [0, 1, 2]:
    count = (y_test == risk).sum()
    print(f"   Risk {risk}: {count}")

# ============================================================
# STEP 5: TRAIN MODEL
# ============================================================

print("\n" + "="*70)
print("STEP 5: TRAINING MODEL")
print("="*70)

# Hyper-parameters for the L2-regularised logistic regression
lr_params = {
    'penalty': 'l2',
    'C': 0.5,                    # inverse strength: lower = more regularization
    'solver': 'lbfgs',
    'class_weight': 'balanced',  # reweight classes to offset imbalance
    'max_iter': 1000,
    'random_state': 42,
}
model = LogisticRegression(**lr_params)

# Fit on the training split only
print("Training Logistic Regression model...")
model.fit(X_train, y_train)
print("‚úÖ Training complete!")

# ============================================================
# STEP 6: TEST MODEL & EVALUATE
# ============================================================
# Scores the fitted `model` on both splits and prints a per-class report
# for the test split.

print("\n" + "="*70)
print("STEP 6: MODEL EVALUATION")
print("="*70)

# Predictions on training set
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Predictions on test set
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nüìä Accuracy Results:")
print(f"   Training Accuracy: {train_accuracy:.1%}")
print(f"   Testing Accuracy: {test_accuracy:.1%}")
print(f"   Difference: {abs(train_accuracy - test_accuracy):.1%}")

# Sanity checks, most specific first; only the first matching message is
# printed.
if train_accuracy - test_accuracy > 0.20:
    print(f"\n‚ö†Ô∏è WARNING: Large train-test gap suggests overfitting!")
elif test_accuracy > 0.95:
    print(f"\n‚ö†Ô∏è WARNING: Suspiciously high accuracy - check for data leakage!")
elif test_accuracy < 0.50:
    print(f"\n‚ö†Ô∏è WARNING: Low accuracy - model not learning properly!")
else:
    print(f"\n‚úÖ Model performance looks realistic!")

# Detailed classification report.
# `labels` is passed explicitly so the three target names always line up
# with classes 0/1/2. Without it, a class that is missing from both
# y_test and the predictions (possible with the non-stratified fallback
# split) makes classification_report raise a ValueError about the
# number of target names.
risk_class_ids = [0, 1, 2]
print(f"\nüìã Classification Report (Test Set):")
print(classification_report(
    y_test,
    y_test_pred,
    labels=risk_class_ids,
    target_names=['Will Pass', 'May Struggle', 'Needs Help'],
    zero_division=0
))

# ============================================================
# STEP 7: CONFUSION MATRIX
# ============================================================
# Prints the test-set confusion matrix and saves it as a heatmap.

print("\n" + "="*70)
print("STEP 7: CONFUSION MATRIX")
print("="*70)

# Force a 3x3 matrix in class order 0, 1, 2. Without `labels`, a class
# absent from both y_test and the predictions shrinks the matrix, which
# breaks the cm[2][...] lookups below and misaligns the heatmap ticks.
risk_class_ids = [0, 1, 2]
cm = confusion_matrix(y_test, y_test_pred, labels=risk_class_ids)

print(f"\nConfusion Matrix:")
print(f"                Predicted")
print(f"              Pass  Struggle  Help")
print(f"Actual Pass   [{cm[0][0]:3}     {cm[0][1]:3}     {cm[0][2]:3}]")
print(f"      Strug   [{cm[1][0]:3}     {cm[1][1]:3}     {cm[1][2]:3}]")
print(f"      Help    [{cm[2][0]:3}     {cm[2][1]:3}     {cm[2][2]:3}]")

# Visualize confusion matrix (rows = actual, columns = predicted)
risk_tick_labels = ['Will Pass', 'May Struggle', 'Needs Help']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=risk_tick_labels,
    yticklabels=risk_tick_labels
)
plt.title('Confusion Matrix - Student Risk Prediction')
plt.ylabel('Actual Risk Level')
plt.xlabel('Predicted Risk Level')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print(f"\nüíæ Saved confusion matrix visualization: confusion_matrix.png")
# Close the figure so it is not kept open after saving.
plt.close()

# ============================================================
# STEP 8: FEATURE IMPORTANCE
# ============================================================
# Ranks features by how strongly they push a student from the lowest
# risk class toward the highest one.

print("\n" + "="*70)
print("STEP 8: FEATURE IMPORTANCE")
print("="*70)

# Get feature coefficients: one row per class, rows ordered like
# model.classes_ (ascending risk id); a single row in the binary case.
feature_names = X.columns
coefficients = model.coef_

print(f"\nüéØ Feature Impact on Risk Prediction:")
print(f"(Positive = increases risk, Negative = decreases risk)\n")

# Averaging the rows across classes is not meaningful: for a penalised
# multinomial fit the per-class coefficients of a feature cancel out, so
# the mean is ~0 and has no "risk" direction. Use the contrast between
# the highest-risk and lowest-risk class instead, i.e. the change in
# log-odds of (highest vs lowest) per unit of the feature.
if coefficients.shape[0] == 1:
    # Binary fit: the single row already describes the higher class id.
    risk_coef = coefficients[0]
else:
    risk_coef = coefficients[-1] - coefficients[0]

# NOTE(review): features are on different scales (percentages vs. rates
# vs. day counts), so magnitudes are per raw unit and only roughly
# comparable across features.
feature_importance = sorted(
    zip(feature_names, risk_coef),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

for feature, coef in feature_importance:
    direction = "‚Üë Increases risk" if coef > 0 else "‚Üì Decreases risk"
    print(f"   {feature:25} : {coef:+.4f}  {direction}")

# ============================================================
# STEP 9: SAVE MODEL
# ============================================================

import json

print("\n" + "="*70)
print("STEP 9: SAVING MODEL")
print("="*70)

# Persist the fitted estimator
joblib.dump(model, 'student_risk_model.pkl')
print(f"‚úÖ Model saved: student_risk_model.pkl")

# Metadata stored next to the model: the feature order it expects plus
# the headline metrics and split sizes of this run.
feature_info = {}
feature_info['feature_names'] = list(feature_names)
feature_info['training_accuracy'] = train_accuracy
feature_info['testing_accuracy'] = test_accuracy
feature_info['n_students_train'] = len(X_train)
feature_info['n_students_test'] = len(X_test)

with open('model_info.json', 'w') as f:
    f.write(json.dumps(feature_info, indent=2))
print(f"‚úÖ Model info saved: model_info.json")

# ============================================================
# STEP 10: TEST SAVED MODEL (VERIFICATION)
# ============================================================
# Reloads the model file written in the previous step and runs it on one
# hand-written student to confirm the round trip works.

print("\n" + "="*70)
print("STEP 10: TESTING SAVED MODEL")
print("="*70)

# Load model from disk
loaded_model = joblib.load('student_risk_model.pkl')
print("‚úÖ Model loaded successfully")

# Test on example student (columns must match the training features)
example_student = pd.DataFrame({
    'avg_watch_pct': [72.5],
    'completion_rate': [0.75],
    'avg_quiz_score': [65.0],
    'days_inactive': [6],
    'lessons_completed': [3]
})

prediction = loaded_model.predict(example_student)[0]
probabilities = loaded_model.predict_proba(example_student)[0]

risk_labels = {0: 'Will Pass ‚úÖ', 1: 'May Struggle ‚ö†Ô∏è', 2: 'Needs Help üÜò'}

print(f"\nüëß Example Student:")
print(f"   Watch %: {example_student['avg_watch_pct'].values[0]:.1f}%")
print(f"   Completion: {example_student['completion_rate'].values[0]:.1%}")
print(f"   Avg Quiz: {example_student['avg_quiz_score'].values[0]:.1f}")
print(f"   Days Inactive: {int(example_student['days_inactive'].values[0])}")
print(f"   Lessons Done: {int(example_student['lessons_completed'].values[0])}")

print(f"\nüéØ Prediction: {risk_labels[prediction]}")
print(f"\nüìä Confidence:")
# predict_proba columns follow loaded_model.classes_, not necessarily
# 0, 1, 2: pair each probability with its real class id so the labels
# stay correct even if a class was absent from the training split.
for class_id, prob in zip(loaded_model.classes_, probabilities):
    print(f"   {risk_labels[class_id]:20} : {prob:.1%}")

# ============================================================
# FINAL SUMMARY
# ============================================================
# Print-only recap. Reads train_accuracy, test_accuracy, feature_names,
# X_train and X_test from the earlier steps; computes nothing new.

print("\n" + "="*70)
print("PIPELINE COMPLETE - SUMMARY")
print("="*70)

# The list of generated files below is static text, not derived from
# the filesystem.
print(f"""
‚úÖ ALL STEPS COMPLETED SUCCESSFULLY

üìä Model Performance:
   ‚Ä¢ Training Accuracy: {train_accuracy:.1%}
   ‚Ä¢ Testing Accuracy: {test_accuracy:.1%}
   ‚Ä¢ Model Type: Logistic Regression
   ‚Ä¢ Features Used: {len(feature_names)}
   ‚Ä¢ Training Students: {len(X_train)}
   ‚Ä¢ Testing Students: {len(X_test)}

üìÅ Files Generated:
   1. hijab_journey_clean_data.csv - Cleaned lesson data
   2. student_features.csv - Aggregated student features
   3. student_risk_model.pkl - Trained ML model
   4. model_info.json - Model metadata
   5. confusion_matrix.png - Visualization

üéì Ready for Senior Project Presentation!

üí° Key Points to Remember:
   ‚Ä¢ {test_accuracy:.0%} accuracy is realistic for education
   ‚Ä¢ Model identifies at-risk students early
   ‚Ä¢ Teachers can intervene before level quiz
   ‚Ä¢ Scalable to more students in production
""")

print("="*70)
print("üéâ SUCCESS!")
print("="*70)

HIJAB JOURNEY ML PIPELINE

STEP 1: GENERATING DATA
‚úÖ Generated 91 raw records from 30 students

STEP 2: CLEANING DATA
Missing values: 0
Duplicate rows: 0
Records after cleaning: 91

STEP 3: FEATURE ENGINEERING
Total students: 30

Risk Distribution:
   Will Pass       : 13 students (43.3%)
   May Struggle    :  8 students (26.7%)
   Needs Help      :  9 students (30.0%)

üíæ Saved cleaned data

STEP 4: TRAIN/TEST SPLIT
Features shape: (30, 5)
Target shape: (30,)

‚úÖ Using stratified split (min class: 8)

Training set: 21 students
Testing set: 9 students

Training set risk distribution:
   Risk 0: 9
   Risk 1: 6
   Risk 2: 6

Testing set risk distribution:
   Risk 0: 4
   Risk 1: 2
   Risk 2: 3

STEP 5: TRAINING MODEL
Training Logistic Regression model...




‚úÖ Training complete!

STEP 6: MODEL EVALUATION

üìä Accuracy Results:
   Training Accuracy: 100.0%
   Testing Accuracy: 77.8%
   Difference: 22.2%


üìã Classification Report (Test Set):
              precision    recall  f1-score   support

   Will Pass       1.00      0.75      0.86         4
May Struggle       0.50      0.50      0.50         2
  Needs Help       0.75      1.00      0.86         3

    accuracy                           0.78         9
   macro avg       0.75      0.75      0.74         9
weighted avg       0.81      0.78      0.78         9


STEP 7: CONFUSION MATRIX

Confusion Matrix:
                Predicted
              Pass  Struggle  Help
Actual Pass   [  3       1       0]
      Strug   [  0       1       1]
      Help    [  0       0       3]

üíæ Saved confusion matrix visualization: confusion_matrix.png

STEP 8: FEATURE IMPORTANCE

üéØ Feature Impact on Risk Prediction:
(Positive = increases risk, Negative = decreases risk)

   avg_watch_pct        

In [3]:


# ============================================================
# STEP 11: BATCH TESTING (Multiple Students)
# ============================================================
# Scores several hand-written students at once. Relies on `model` and
# `risk_labels` defined by the pipeline cell above.

print(f"\n{'='*70}")
print("BATCH TESTING - MULTIPLE STUDENTS")
print(f"{'='*70}")

# Create a batch of students to test (same columns as the training features)
batch_students = pd.DataFrame({
    'avg_watch_pct': [85.0, 70.0, 45.0, 78.0, 92.0, 38.0],
    'completion_rate': [0.9, 0.7, 0.4, 0.8, 1.0, 0.3],
    'avg_quiz_score': [80.0, 65.0, 40.0, 72.0, 88.0, 35.0],
    'days_inactive': [3, 7, 12, 5, 2, 18],
    'lessons_completed': [4, 3, 2, 4, 5, 2]
})

# Predict for all students
predictions = model.predict(batch_students)
probabilities = model.predict_proba(batch_students)

# predict_proba columns follow model.classes_; map class id -> column so
# the confidence lookup does not assume the ids are exactly 0, 1, 2.
class_column = {class_id: col for col, class_id in enumerate(model.classes_)}

print(f"\nüìä Batch Prediction Results:")
print(f"\n{'Student':<10} {'Watch%':<10} {'Quiz':<8} {'Inactive':<10} {'Prediction':<20} {'Confidence'}")
print("-" * 80)

batch_rows = zip(
    batch_students['avg_watch_pct'],
    batch_students['avg_quiz_score'],
    batch_students['days_inactive'],
    predictions,
    probabilities,
)
for i, (watch, quiz, inactive, pred, probs) in enumerate(batch_rows, 1):
    # Probability the model assigned to the class it predicted
    confidence = probs[class_column[pred]]

    print(f"Student {i:<3} {watch:>5.1f}%{'':<4} {quiz:>5.1f}{'':<3} "
          f"{int(inactive):>2} days{'':<3} {risk_labels[pred]:<20} {confidence:>6.1%}")

# Summary statistics
print(f"\nüìà Batch Summary:")
print(f"   Total Students: {len(predictions)}")
print(f"   Will Pass: {(predictions == 0).sum()} students")
print(f"   May Struggle: {(predictions == 1).sum()} students")
print(f"   Needs Help: {(predictions == 2).sum()} students")


BATCH TESTING - MULTIPLE STUDENTS

üìä Batch Prediction Results:

Student    Watch%     Quiz     Inactive   Prediction           Confidence
--------------------------------------------------------------------------------
Student 1    85.0%      80.0     3 days    Will Pass ‚úÖ          100.0%
Student 2    70.0%      65.0     7 days    May Struggle ‚ö†Ô∏è       94.3%
Student 3    45.0%      40.0    12 days    Needs Help üÜò          95.3%
Student 4    78.0%      72.0     5 days    Will Pass ‚úÖ           95.3%
Student 5    92.0%      88.0     2 days    Will Pass ‚úÖ          100.0%
Student 6    38.0%      35.0    18 days    Needs Help üÜò         100.0%

üìà Batch Summary:
   Total Students: 6
   Will Pass: 3 students
   May Struggle: 1 students
   Needs Help: 2 students
