In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
 
# ---------------------------
# 1. Load data
# ---------------------------

data = pd.read_excel('processed_student_data.xlsx', sheet_name='in')
 
# ---------------------------
# 2. Prepare features
# ---------------------------
X = data[['traveltime','studytime','failures','schoolsup','famsup','paid',
          'activities','internet','freetime','goout','Dalc','Walc','health',
          'absences','G1','G2','sub','sex']]
 
# Binary yes/no → 0/1
binary_cols = ['schoolsup','famsup','paid','activities','internet']
for col in binary_cols:
    X[col] = X[col].map({'yes':1, 'no':0})
 
# One-hot encode categorical features
X = pd.get_dummies(X, columns=['sub','sex'], drop_first=True)
 
# Targets
y_reg = data['G3']                # Regression
y_cls = (data['G3'] >= 7.2).astype(int)  # Classification (Pass/Fail)
 
# ---------------------------
# 3. Random Forest Regressor
# ---------------------------
rf_reg = RandomForestRegressor(random_state=42)
 
param_grid_reg = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
search_reg = RandomizedSearchCV(rf_reg, param_grid_reg, n_iter=20, cv=kf, scoring='r2', random_state=42)
search_reg.fit(X, y_reg)
best_rf_reg = search_reg.best_estimator_
 
# Evaluate regression
y_pred_reg = best_rf_reg.predict(X)
print("\n===== Random Forest Regressor Metrics =====")
print("R²:", r2_score(y_reg, y_pred_reg))
print("RMSE:", np.sqrt(mean_squared_error(y_reg, y_pred_reg)))
print("MAE:", mean_absolute_error(y_reg, y_pred_reg))
 
# ---------------------------
# 4. Random Forest Classifier
# ---------------------------
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')
 
param_grid_clf = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}
 
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search_clf = RandomizedSearchCV(rf_clf, param_grid_clf, n_iter=20, cv=skf, scoring='f1', random_state=42)
search_clf.fit(X, y_cls)
best_rf_clf = search_clf.best_estimator_
 
# Evaluate classification
y_pred_cls = best_rf_clf.predict(X)
y_proba_cls = best_rf_clf.predict_proba(X)[:,1]
cm = confusion_matrix(y_cls, y_pred_cls)
 
print("\n===== Random Forest Classifier Metrics =====")
print("Accuracy:", accuracy_score(y_cls, y_pred_cls))
print("Precision:", precision_score(y_cls, y_pred_cls))
print("Recall:", recall_score(y_cls, y_pred_cls))
print("F1 Score:", f1_score(y_cls, y_pred_cls))
print("ROC-AUC:", roc_auc_score(y_cls, y_proba_cls))
print("Confusion Matrix:\n", cm)
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map({'yes':1, 'no':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map({'yes':1, 'no':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map({'yes':1, 'no':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in


===== Random Forest Regressor Metrics =====
R²: 0.9315350990814619
RMSE: 0.7169161621163695
MAE: 0.5576768305785602

===== Random Forest Classifier Metrics =====
Accuracy: 0.9968051118210862
Precision: 1.0
Recall: 0.9967320261437909
F1 Score: 0.9983633387888707
ROC-AUC: 0.9995331465919701
Confusion Matrix:
 [[ 14   0]
 [  2 610]]


In [6]:
# Updated code for your .ipynb file
# Replace the bias-variance section with this

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

# Function for bias-variance calculation (Regression)
def calculate_bias_variance_reg(X, y, depths, n_bootstraps=20):
    bias_list = []
    var_list = []
    error_list = []
    for d in depths:
        preds = []
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        for i in range(n_bootstraps):
            X_boot, y_boot = resample(X_train, y_train, random_state=i)
            model = RandomForestRegressor(max_depth=d, n_estimators=100, random_state=42)
            model.fit(X_boot, y_boot)
            preds.append(model.predict(X_test))
        preds = np.array(preds)
        avg_pred = np.mean(preds, axis=0)
        bias = np.mean((avg_pred - y_test) ** 2)
        variance = np.mean(np.var(preds, axis=0))
        error = bias + variance  # Corrected: total error = bias^2 + variance
        bias_list.append(bias)
        var_list.append(variance)
        error_list.append(error)
    return bias_list, var_list, error_list

# Function for bias-variance calculation (Classification)
def calculate_bias_variance_clf(X, y, depths, n_bootstraps=20):
    bias_list = []
    var_list = []
    error_list = []
    for d in depths:
        preds = []
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        for i in range(n_bootstraps):
            X_boot, y_boot = resample(X_train, y_train, random_state=i)
            model = RandomForestClassifier(max_depth=d, n_estimators=100, random_state=42, class_weight='balanced')
            model.fit(X_boot, y_boot)
            preds.append(model.predict_proba(X_test)[:, 1])
        preds = np.array(preds)
        avg_pred = np.mean(preds, axis=0)
        bias = np.mean((avg_pred - y_test) ** 2)
        variance = np.mean(np.var(preds, axis=0))
        error = bias + variance  # Corrected: total error = bias^2 + variance (Brier score decomposition)
        bias_list.append(bias)
        var_list.append(variance)
        error_list.append(error)
    return bias_list, var_list, error_list

# Compute bias-variance data
depths = [1, 3, 5, 10, 20]  # Changed None to 20 to avoid plotting issues with None
bias_reg, var_reg, error_reg = calculate_bias_variance_reg(X, y_reg, depths)
bias_clf, var_clf, error_clf = calculate_bias_variance_clf(X, y_cls, depths)

# Save to a dictionary and dump to pkl
bias_var_data = {
    'depths': depths,
    'reg': {'bias': bias_reg, 'var': var_reg, 'error': error_reg},
    'clf': {'bias': bias_clf, 'var': var_clf, 'error': error_clf}
}
joblib.dump(bias_var_data, 'bias_var_data.pkl')

# Also save your models if not already done
joblib.dump(best_rf_reg, 'best_rf_reg.pkl')
joblib.dump(best_rf_clf, 'best_rf_clf.pkl')

print("Bias-variance data and models saved successfully!")

Bias-variance data and models saved successfully!


In [3]:
import joblib

# Save the Random Forest Regressor model
joblib.dump(best_rf_reg, 'best_rf_reg.pkl')

# Save the Random Forest Classifier model
joblib.dump(best_rf_clf, 'best_rf_clf.pkl')

['best_rf_clf.pkl']

In [None]:
print("gdfd")