In [110]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# --- Preprocessing artefacts -------------------------------------------------
# The loaded object is indexed by string key (presumably a dict — confirm against the
# notebook that wrote it). Only the three entries unpacked below are read here.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
preprocessing_objects = joblib.load("../outputs/preprocessing_all.pkl")
df_final_encoded, binary_encoder, multi_encoder = (
    preprocessing_objects[key]
    for key in ("data", "binary_encoder", "multi_encoder")
)
print("Preprocessed data loaded successfully!")

# --- Feature-selected training set -------------------------------------------
# "X_final" is the feature-selected matrix and "y" the target stored alongside it.
training_data = joblib.load("../outputs/model_training_data_with_features.pkl")
X_final, y = training_data["X_final"], training_data["y"]
print("Feature-selected data loaded successfully!")

# Show which columns survived feature selection.
print("Selected features:", X_final.columns.tolist())

Preprocessed data loaded successfully!
Feature-selected data loaded successfully!
Selected features: ['Mental_Health_Score', 'Sleep_Hours_Per_Night', 'Avg_Daily_Usage_Hours', 'Affects_Academic_Performance_Yes']


In [111]:
# 1. Logistic Regression
#
# Fits a binary classifier that predicts whether social-media use affects a student's
# academic performance, then reports accuracy, the per-class classification report and
# the confusion matrix on a held-out 20% test split.
from sklearn.linear_model import LogisticRegression
# The metric helpers used at the bottom of this cell are not brought in by the import
# cell above (it only imports mean_squared_error and r2_score), so import them here;
# otherwise the cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define features and target
# Target: Affects_Academic_Performance_Yes (1 = Yes, 0 = No)
# NOTE: this rebinds `y`, replacing the target loaded earlier from
# model_training_data_with_features.pkl. Later cells see this binary target.
y = df_final_encoded['Affects_Academic_Performance_Yes']

# Features: numeric predictors first, then the encoded categorical indicators.
# NOTE(review): the reported test accuracy is ~0.99; check whether a predictor such as
# Addicted_Score almost determines the target before trusting this score.
features = [
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Addicted_Score',
    'Gender_Male',
    'Academic_Level_High School',
    'Academic_Level_Undergraduate',
    'Relationship_Status_In Relationship',
    'Relationship_Status_Single'
]

X = df_final_encoded[features]


# Train/Test Split (80% train / 20% test; fixed seed so the split is reproducible)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Logistic Regression

logreg = LogisticRegression(max_iter=1000)  # Increase max_iter if it doesn't converge
logreg.fit(X_train, y_train)

# Predictions & Evaluation

y_pred = logreg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9929078014184397

Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        54
         1.0       1.00      0.99      0.99        87

    accuracy                           0.99       141
   macro avg       0.99      0.99      0.99       141
weighted avg       0.99      0.99      0.99       141


Confusion Matrix:
 [[54  0]
 [ 1 86]]


### **1️⃣ What the code does**

1. **Define target (y)**:
    - Affects_Academic_Performance_Yes — binary (1 = Yes, 0 = No).
2. **Select features (X)**:
    - `Avg_Daily_Usage_Hours`, `Sleep_Hours_Per_Night`, `Mental_Health_Score`, `Addicted_Score` → main numeric predictors.
    - `Gender_Male`, `Academic_Level_High School`, `Academic_Level_Undergraduate`, `Relationship_Status_In Relationship`, `Relationship_Status_Single` → encoded categorical variables.
3. **Split data**:
    - 80% for training, 20% for testing (train_test_split).
4. **Train model**:
    - `LogisticRegression` is fitted on the training set to estimate the probability that a student's academic performance is affected.
5. **Predict & evaluate**:
    - y_pred = logreg.predict(X_test) predicts 0 or 1 for the test set.
    - Metrics used:
        - **Accuracy** – overall correctness.
        - **Classification report** – precision, recall, f1-score for each class.
        - **Confusion matrix** – actual vs predicted counts.

### **2️⃣ Interpreting your output**

**Accuracy: 0.9929**

- The model correctly classifies **~99%** of the students in the test set (140 of 141).
- **Precision**: When the model predicts “Yes”, how often is it correct?
- **Recall**: Of all actual “Yes” cases, how many did it catch?
- **F1-score**: Harmonic mean of precision & recall (balance).
- **Support**: Number of actual samples in each class.
- **Confusion Matrix**

Rows = actual class, columns = predicted class:

- 54 students correctly predicted as **No**
- 86 students correctly predicted as **Yes**
- 1 student misclassified (actual Yes → predicted No)