In [71]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [72]:
df = pd.read_csv('student_performance_dataset.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group E,high school,free/reduced,none,95,67,91
1,female,group E,master's degree,standard,completed,88,90,67
2,male,group A,bachelor's degree,free/reduced,completed,81,82,53
3,male,group C,some college,standard,none,81,97,66
4,male,group B,high school,standard,completed,95,44,80


In [73]:
# types of columns we have
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [74]:
# Any dupes?
df = df.drop_duplicates()

In [75]:
# missing values?
df.notnull().count()

gender                         500
race/ethnicity                 500
parental level of education    500
lunch                          500
test preparation course        500
math score                     500
reading score                  500
writing score                  500
dtype: int64

In [76]:
# Create a Final_Score column
sum_of_of_scores = df['math score'] + df['reading score'] + df['writing score']
df['Final_Score'] = sum_of_of_scores / 3
df['Final_Score']

0      84.333333
1      81.666667
2      72.000000
3      81.333333
4      73.000000
         ...    
495    79.000000
496    76.666667
497    69.333333
498    51.333333
499    77.000000
Name: Final_Score, Length: 500, dtype: float64

In [77]:
# drop
df = df.drop(columns=['math score', 'reading score', 'writing score'])
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'Final_Score'],
      dtype='object')

Features

In [78]:
features = df.loc[:,df.columns != 'Final_Score']

labels

In [79]:
labels = df['Final_Score']

In [80]:
# One hot encode categorical features
features = pd.get_dummies(features)
features.columns

Index(['gender_female', 'gender_male', 'race/ethnicity_group A',
       'race/ethnicity_group B', 'race/ethnicity_group C',
       'race/ethnicity_group D', 'race/ethnicity_group E',
       'parental level of education_associate's degree',
       'parental level of education_bachelor's degree',
       'parental level of education_high school',
       'parental level of education_master's degree',
       'parental level of education_some college', 'lunch_free/reduced',
       'lunch_standard', 'test preparation course_completed',
       'test preparation course_none'],
      dtype='object')

train, test, split

In [81]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3)

Create untrained model

In [82]:
model = DecisionTreeRegressor()

Train model on the Training Set

In [83]:
model.fit(features_train, labels_train)

In [84]:
# Compute Training Accuracy
train_predictions = model.predict(features_train)
print('Training Score :', mean_squared_error(labels_train, train_predictions))

Training Score : 49.40503275384227


In [85]:
# Compute Test Accuracy
test_predictions = model.predict(features_test)
print('Testing Score:', mean_squared_error(labels_test, test_predictions))

Testing Score: 177.93400055009658


In [None]:
# So my training score is 57.765
# While my testing score is 156.635
# This clearly demonstrates that the model was not generalizing well to unseen data and its overfitting the training data.

In [86]:
                    # Model 2(Random Forest)

In [87]:
# import libraries
from sklearn.ensemble import RandomForestRegressor

In [88]:
# Random Forest Model
model = RandomForestRegressor(n_estimators= 100, max_depth= 5, random_state= 42)

In [89]:
# Train model on the training set
model.fit(features_train, labels_train)

In [90]:
# Compute training Accuracy 
train_predictions = model.predict(features_train)
print('Training Score:', mean_squared_error(labels_train, train_predictions))

Training Score: 74.25171968262326


Compute Test Accuracy

In [91]:
test_predictions = model.predict(features_test)
print('Testing Score:', mean_squared_error(labels_test, test_predictions))

Testing Score: 109.59186660386852


Training MSE is now 75.988
Testing MSE is 105.408
For this Random Forest model, my generalization is way better. The test error has really improved over the original decision tree.
The training error is a bit higher but since the testing error is lower now and similar to the training error, now I feel that the model is good.
I say this because additionally, since my MSE is 105.408, then that means that my RMSE is 10.27. So my predictions are going to be about 10 points off
out of 100. I feel like for this model where we are determining student scores, this should be good. 

Now im going to deploy my model with joblip

In [92]:
import joblib
joblib.dump(model, "final_rf_model.pkl")

['final_rf_model.pkl']

In [93]:
import streamlit as st
import pandas as pd
import joblib

In [94]:
# Load your trained model
model = joblib.load("final_rf_model.pkl")

In [95]:
st.title("Student Final Score Predictor")

DeltaGenerator()

In [96]:
# Input form
gender = st.selectbox("Gender", ["male", "female"])
lunch = st.selectbox("Lunch Type", ["standard", "free/reduced"])
test_prep = st.selectbox("Test Preparation Course", ["none", "completed"])
parent_edu = st.selectbox("Parental Education Level", [
    "high school", "some college", "associate's degree",
    "bachelor's degree", "master's degree"
])
race_ethnicity = st.selectbox("Race/Ethnicity Group", ["group A", "group B", "group C", "group D", "group E"])

In [97]:
# Prepare input as a DataFrame
user_input = pd.DataFrame({
    "gender": [gender],
    "lunch": [lunch],
    "test preparation course": [test_prep],
    "parental level of education": [parent_edu],
    "race/ethnicity": [race_ethnicity]
})

In [98]:
# One-hot encode to match training
user_input_encoded = pd.get_dummies(user_input)

In [99]:
# Align with training features
model_features = model.feature_names_in_  # only available in sklearn >= 1.0
for col in model_features:
    if col not in user_input_encoded.columns:
        user_input_encoded[col] = 0
user_input_encoded = user_input_encoded[model_features]

In [100]:
# Predict
if st.button("Predict Final Score"):
    prediction = model.predict(user_input_encoded)[0]
    st.success(f"Estimated Final Score: **{round(prediction, 2)}** / 100")