In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,root_mean_squared_error
from joblib import dump

# Columns fed to the regressor, and the column it learns to predict.
important_features = [
    'G1', 'G2', 'failures', 'absences', 'higher',
    'studytime', 'age', 'Dalc', 'goout',
]
target = 'G3'

# Read the CSV, encode 'higher' (yes/no) as 1/0, then discard every row that
# lacks a value in any model column. A 'higher' entry other than yes/no maps
# to NaN and is therefore dropped by the same step.
df = (
    pd.read_csv("data/student_por (1).csv")
    .assign(higher=lambda frame: frame['higher'].map({'yes': 1, 'no': 0}))
    .dropna(subset=[*important_features, target])
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X = df[important_features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training rows. The bare fit() call stays
# the last expression so the notebook still displays the fitted estimator.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)




In [4]:
# Largest observed value in each of the two columns.
df.loc[:, ['Dalc', 'goout']].max()

Dalc     5
goout    5
dtype: int64

In [7]:
# Largest observed value of the studytime column.
df.loc[:, 'studytime'].max()

np.int64(4)

In [9]:
# Sanity check on the data the model was fitted on: score the training rows
# themselves. The model has already seen these rows, so this number must be
# reported as a training-set error, not as a test-set error.
y_train_pred = model.predict(X_train)
# Performance measure: root mean squared error between true and predicted G3.
rmse = root_mean_squared_error(y_train, y_train_pred)
print(f" RMSE on training set: {rmse:.2f}")

 RMSE on test set: 0.53


In [11]:
# Score the regressor on the held-out rows.
y_pred = model.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"RMSE on test set: {rmse:.2f}")

# Persist the fitted regressor to disk with joblib so it can be reloaded later.
dump(value=model, filename="light_student_performance_predictor_model.joblib")
print(" Model saved as light_student_performance_predictor_model.joblib")

RMSE on test set: 1.24
 Model saved as light_student_performance_predictor_model.joblib


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn .linear_model import LogisticRegression


# Binary label: 1 when the final grade G3 is below 7.5, otherwise 0.
df["at_risk"] = (df["G3"] < 7.5).astype("int64")

# Predictor columns for the classifier.
features = [
    'G1', 'G2', 'failures', 'absences', 'higher',
    'studytime', 'age', 'Dalc', 'goout',
]
X = df[features]
y = df["at_risk"]

# 80/20 split with a fixed seed. This rebinds X, y and the four split
# variables that the regression cells created earlier.
# NOTE(review): the positive class looks rare; the split is not stratified, so
# the number of at-risk rows in the test part is left to chance — consider
# passing stratify=y and re-running the evaluation cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)




In [14]:
# Fit a random forest classifier with default settings and a fixed seed
# (fit() returns the fitted estimator, so it can be bound directly).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Per-class precision / recall / F1 on the same rows the classifier was
# trained on.
y_train_pred = clf.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_train_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       492
           1       1.00      1.00      1.00        27

    accuracy                           1.00       519
   macro avg       1.00      1.00      1.00       519
weighted avg       1.00      1.00      1.00       519



In [15]:
from sklearn.metrics import  confusion_matrix
# Tabulate the classifier's training-set predictions against the true labels
# (scikit-learn puts true classes on rows and predicted classes on columns).
confusion_mrx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
print(confusion_mrx)

[[492   0]
 [  0  27]]


In [16]:
# Logistic regression (default settings) as a second model for the at-risk
# classification task.
logicr_reg = LogisticRegression()
logicr_reg.fit(X_train, y_train)
# Evaluate on the rows the model was fitted on.
y_train_pred = logicr_reg.predict(X_train)
# classification_report prints per-class precision / recall / F1, not a
# confusion matrix, so the header names it accordingly.
print("the classification report of the logistic reg:")
print(classification_report(y_train, y_train_pred))


the confusion matrix report of the logistic reg:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       492
           1       0.85      0.63      0.72        27

    accuracy                           0.97       519
   macro avg       0.91      0.81      0.86       519
weighted avg       0.97      0.97      0.97       519



In [17]:
# Confusion matrix for the logistic regression's training-set predictions;
# y_train_pred holds whatever the most recently run prediction cell stored.
confusion_mrx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
print("the confusion matrix of the logistic reg:")
print(confusion_mrx)

the confusion matrix of the logistic reg:
[[489   3]
 [ 10  17]]


In [19]:
# Held-out performance of the random forest classifier.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Persist the fitted random forest classifier. joblib.dump returns the list of
# written file names, which the notebook shows as this cell's result.
# NOTE(review): nothing in this cell establishes that clf is the better of the
# two classifiers — compare against the logistic regression before relying on
# this file.
import joblib
joblib.dump(value=clf, filename="light_at_risk_classifier_model.joblib")

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       127
           1       0.50      0.33      0.40         3

    accuracy                           0.98       130
   macro avg       0.74      0.66      0.69       130
weighted avg       0.97      0.98      0.97       130



['light_at_risk_classifier_model.joblib']

In [20]:
# Confusion matrix of the held-out labels against the predictions currently
# stored in y_pred.
confusion_mrx = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion_mrx)

[[126   1]
 [  2   1]]


In [22]:
# Held-out performance of the logistic regression; this rebinds y_pred.
y_pred = logicr_reg.predict(X_test)
# classification_report prints per-class precision / recall / F1, not a
# confusion matrix, so the header names it accordingly.
print("the classification report of the logistic reg:")
print(classification_report(y_test, y_pred))
# Confusion matrix, left as the cell's last expression so the notebook
# displays the returned array.
print('confusion matrix of the logistic on test set')
confusion_matrix(y_test, y_pred)

the confusion matrix report of the logistic reg:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       127
           1       0.67      0.67      0.67         3

    accuracy                           0.98       130
   macro avg       0.83      0.83      0.83       130
weighted avg       0.98      0.98      0.98       130

confusion matrix of the logistic on test set


array([[126,   1],
       [  1,   2]])