In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw student-performance records and preview the first rows.
DATA_PATH = 'student_performance_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
deduplicated = df.drop_duplicates(keep='first')
df = deduplicated

In [None]:
# Count the missing values in each column.
# `df.notnull().count()` cannot answer this question: `count()` tallies the
# non-NA cells of the boolean mask, and that mask never contains NA, so it
# always returns the number of rows. Summing the `isnull()` mask gives the
# actual number of missing entries per column (0 means the column is complete).
df.isnull().sum()

In [None]:
# Build the regression target: the plain average of the three subject scores.
math_score = df['math score']
reading_score = df['reading score']
writing_score = df['writing score']
total_score = math_score + reading_score + writing_score
df['Final_Score'] = total_score / 3
df['Final_Score']

In [None]:
# Final_Score is derived from the three subject scores, so remove them;
# otherwise they would end up among the model's input features.
raw_score_columns = ['math score', 'reading score', 'writing score']
df = df.drop(raw_score_columns, axis=1)
df.columns

Features

In [None]:
# Every column except the target is used as a model input.
is_feature_column = df.columns != 'Final_Score'
features = df.loc[:, is_feature_column]

labels

In [None]:
# The regression target: the averaged score computed earlier.
labels = df.loc[:, 'Final_Score']

In [None]:
# One-hot encode the categorical features so the tree models receive
# numeric inputs; show the resulting indicator column names.
encoded_features = pd.get_dummies(data=features)
features = encoded_features
features.columns

Train/test split

In [None]:
# Hold out 30% of the rows for testing.
# `random_state` is fixed so that the same rows land in the train and test
# sets on every run; without it each kernel restart produces a different
# split and the MSE values of the models are not comparable or reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

Create untrained model

In [None]:
# Baseline model: an unconstrained decision tree.
# scikit-learn permutes the candidate features at every split, so ties
# between equally good splits can be broken differently on each fit;
# fixing `random_state` makes the fitted tree deterministic.
model = DecisionTreeRegressor(random_state=42)

Train model on the Training Set

In [None]:
# Fit the current model on the training portion only.
model.fit(X=features_train, y=labels_train)

In [None]:
# Training error: mean squared error on the rows the model was fitted on
# (this is an error metric, not an accuracy - lower is better).
train_predictions = model.predict(features_train)
train_mse = mean_squared_error(y_true=labels_train, y_pred=train_predictions)
print('Training Score :', train_mse)

In [None]:
# Test error: mean squared error on the held-out rows
# (this is an error metric, not an accuracy - lower is better).
test_predictions = model.predict(features_test)
test_mse = mean_squared_error(y_true=labels_test, y_pred=test_predictions)
print('Testing Score:', test_mse)

So my training score (MSE) is 57.765,
while my testing score (MSE) is 156.635.
This clearly demonstrates that the model is not generalizing well to unseen data and is overfitting the training data.

In [None]:
                        # MODEL 2

In [None]:
# Regularize the tree: limit it to 5 levels of depth, require at least
# 10 samples before a node may be split, and at least 5 samples in every leaf.
# `random_state` is fixed so the fitted tree is deterministic across runs.
model = DecisionTreeRegressor(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)

In [None]:
# Fit the regularized tree on the training portion only.
model.fit(X=features_train, y=labels_train)

In [None]:
# Training error: mean squared error on the rows the model was fitted on
# (this is an error metric, not an accuracy - lower is better).
train_predictions = model.predict(features_train)
train_mse = mean_squared_error(y_true=labels_train, y_pred=train_predictions)
print('Training Score:', train_mse)

In [None]:
# Test error: mean squared error on the held-out rows
# (this is an error metric, not an accuracy - lower is better).
test_predictions = model.predict(features_test)
# The label previously read 'Testing ScoreL' (typo for the colon).
print('Testing Score:', mean_squared_error(labels_test, test_predictions))

Training MSE is now 57.77
Testing MSE is 156.63
Nothing really changed when I regularized the tree and limited its depth. The gap between the two error scores is still too large.
I don't think that a single decision tree is going to give me my desired generalization. I'm going to go ahead and see if a Random Forest is better.

In [None]:
                    # Model 3(Random Forest)

In [None]:
# import libraries
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest: an ensemble of 100 trees, each limited to depth 5,
# seeded so the fitted forest is the same on every run.
forest_params = {"n_estimators": 100, "max_depth": 5, "random_state": 42}
model = RandomForestRegressor(**forest_params)

In [None]:
# Fit the random forest on the training portion only.
model.fit(X=features_train, y=labels_train)

In [None]:
# Training error: mean squared error on the rows the model was fitted on
# (this is an error metric, not an accuracy - lower is better).
train_predictions = model.predict(features_train)
train_mse = mean_squared_error(y_true=labels_train, y_pred=train_predictions)
print('Training Score:', train_mse)

Compute test MSE

In [None]:
# Test error: mean squared error of the forest on the held-out rows
# (lower is better).
test_predictions = model.predict(features_test)
test_mse = mean_squared_error(y_true=labels_test, y_pred=test_predictions)
print('Testing Score:', test_mse)

Training MSE is now 75.988
Testing MSE is 105.408
For this Random Forest model, my generalization is much better. The test error has improved considerably over the original decision tree.
The training error is a bit higher, but since the testing error is now lower and closer to the training error, I feel that the model is good.
I say this because, since my test MSE is 105.408, my RMSE is about 10.27 (the square root of the MSE). So my predictions are going to be about 10 points off
out of 100. For a model that estimates student scores, I feel this is good enough.

Now I'm going to deploy my model with joblib

In [None]:
import joblib

# Persist the fitted model to disk so the prediction app can load it later.
MODEL_PATH = "final_rf_model.pkl"
joblib.dump(model, MODEL_PATH)

In [None]:
import streamlit as st
import pandas as pd
import joblib

In [None]:
# Load the trained model that was saved with joblib.
# joblib files are pickles: only load files you created or otherwise trust.
SAVED_MODEL_FILE = "final_rf_model.pkl"
model = joblib.load(SAVED_MODEL_FILE)

In [None]:
# Page heading of the Streamlit app.
APP_TITLE = "Student Final Score Predictor"
st.title(APP_TITLE)

In [None]:
# Input form: one drop-down per categorical feature the model was trained on.
# NOTE(review): the option lists are hard-coded - confirm they cover every
# category present in the training data; a category missing here can never
# be selected, so its one-hot column would always be 0 at prediction time.
GENDER_OPTIONS = ["male", "female"]
LUNCH_OPTIONS = ["standard", "free/reduced"]
TEST_PREP_OPTIONS = ["none", "completed"]
PARENT_EDUCATION_OPTIONS = [
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
RACE_ETHNICITY_OPTIONS = ["group A", "group B", "group C", "group D", "group E"]

# Widgets are created in display order.
gender = st.selectbox("Gender", GENDER_OPTIONS)
lunch = st.selectbox("Lunch Type", LUNCH_OPTIONS)
test_prep = st.selectbox("Test Preparation Course", TEST_PREP_OPTIONS)
parent_edu = st.selectbox("Parental Education Level", PARENT_EDUCATION_OPTIONS)
race_ethnicity = st.selectbox("Race/Ethnicity Group", RACE_ETHNICITY_OPTIONS)

In [None]:
# Collect the form selections into a single-row DataFrame whose column
# names match the raw (pre-encoding) training columns.
selected_values = {
    "gender": gender,
    "lunch": lunch,
    "test preparation course": test_prep,
    "parental level of education": parent_edu,
    "race/ethnicity": race_ethnicity,
}
user_input = pd.DataFrame([selected_values])

In [None]:
# One-hot encode the single input row the same way the training features
# were encoded; only the selected categories get an indicator column here.
user_input_encoded = pd.get_dummies(data=user_input)

In [None]:
# Align the encoded input with the columns the model was trained on.
# A single-row input only produces indicator columns for the selected
# categories, so every other training column has to be added as 0 and the
# columns put in the training order. `reindex` does both in one step,
# replacing the per-column insertion loop that mutated the frame in place.
model_features = model.feature_names_in_  # only available in sklearn >= 1.0
user_input_encoded = user_input_encoded.reindex(columns=model_features, fill_value=0)

In [None]:
# Run the model only when the user presses the button, then show the
# predicted final score rounded to two decimals.
predict_clicked = st.button("Predict Final Score")
if predict_clicked:
    predicted_scores = model.predict(user_input_encoded)
    prediction = predicted_scores[0]
    st.success(f"Estimated Final Score: **{round(prediction, 2)}** / 100")