In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
df = pd.read_csv("test_estandarizado_percentiles.csv")
target = "TestScore_Math"

In [33]:

y = df[target]
x = df.drop(columns = [target])


x_train_val, x_test, y_train_val, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

x_train, x_val, y_train, y_val = train_test_split(x_train_val,y_train_val, test_size = 0.2, random_state = 42)


print(f"Train : {len(x_train_val)}")
print(f"Validation : {len(x_val)}")
print(f"Test : {len(x_test)}")

Train : 799997
Validation : 160000
Test : 200000


In [34]:
x_train.to_csv("X_train.csv", index=False)
y_train.to_csv("Y_train.csv", index=False)
x_val.to_csv("X_val.csv", index=False)
y_val.to_csv("Y_val.csv", index=False)
x_test.to_csv("X_test.csv", index=False)
y_test.to_csv("Y_test.csv", index=False)



In [35]:
trainX = pd.read_csv("X_train.csv")
trainY = np.ravel(pd.read_csv("Y_train.csv"))
testX = pd.read_csv("X_test.csv")
testY = np.ravel(pd.read_csv("Y_test.csv"))
valX = pd.read_csv("X_val.csv")
valY = np.ravel(pd.read_csv("Y_val.csv"))

In [36]:
train_sample = trainX.sample(frac=0.1, random_state=42)
trainY_sample = trainY[:len(train_sample)]

if "PercentileRange" in train_sample.columns:
    train_sample = train_sample.drop(columns=["PercentileRange"])
    valX = valX.drop(columns=["PercentileRange"])


In [37]:
results = []
n_estimators_values = [50,100]
max_depth_values = [5,10]
min_samples_split_values = [2,5]

In [38]:

for n_estimators in n_estimators_values:
    for max_depth in max_depth_values:
        for min_samples_split in min_samples_split_values:

            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42,
                n_jobs=-1  )


            model.fit(train_sample, trainY_sample)
            val_pred = model.predict(valX)
            mse = mean_squared_error(valY, val_pred)
            r2 = r2_score(valY, val_pred)

            results.append({
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "min_samples_split": min_samples_split,
                "MSE": mse,
                "R2": r2
            })

results_df = pd.DataFrame(results)
print(results_df.sort_values(by="R2", ascending=False))

   n_estimators  max_depth  min_samples_split        MSE        R2
0            50          5                  2  98.025817  0.006016
1            50          5                  5  98.027839  0.005995
3            50         10                  5  98.200424  0.004245
2            50         10                  2  98.215591  0.004092
4           100          5                  2  98.223889  0.004007
5           100          5                  5  98.227461  0.003971
7           100         10                  5  98.347262  0.002756
6           100         10                  2  98.358120  0.002646
