In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data.csv')

In [3]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
columns_to_label_encode = ['NOMBRE_UNIDAD_EDUC', 'NOMBRE_COMUNA_EGRESO',
                            'PAGO_MATRICULA2020', 'PAGO_MENSUAL2020', 'PAGO_MATRICULA2021', 'PAGO_MENSUAL2021',
                            'PAGO_MATRICULA2022', 'PAGO_MENSUAL2022', 'PAGO_MATRICULA2023', 'PAGO_MENSUAL2023']

label_encoder = LabelEncoder()
for column in columns_to_label_encode:
    df.loc[:, column] = label_encoder.fit_transform(df.loc[:, column])

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [5]:
data = df_normalized.drop(['MRUN', 'CLEC_REG_ACTUAL', 'MATE1_REG_ACTUAL', 'MATE2_REG_ACTUAL', 'PROMEDIO_CM_MAX', ], axis=1).values
columns = df_normalized.drop(['MRUN', 'CLEC_REG_ACTUAL', 'MATE1_REG_ACTUAL', 'MATE2_REG_ACTUAL', 'PROMEDIO_CM_MAX', ], axis=1).columns
target = df_normalized['PROMEDIO_CM_MAX'].values

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)

### Variación de hiperparámetros

In [7]:
# Define the hyperparameter grid
param_grid = {
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [50,100, 200],
    'min_samples_split': [2, 5, 10],
    'max_depth': [None, 10,20],
    'max_features': ['sqrt', 'log2', None],
    'oob_score': [True]
}
# Create a random forest regressor
rf = RandomForestRegressor()

# Perform grid search with cross-validation
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=3,verbose=3)
grid_search.fit(X_train[:5000], y_train[:5000])

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
rf_best = RandomForestRegressor(**best_params)
rf_best.fit(X_train, y_train)

# Evaluate the model on the test set
predictions = rf_best.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Testing Mean Squared Error: {mse}')
mae = mean_absolute_error(y_test, predictions)
print(f'Testing Mean Absolute Error: {mae}')
r2 = r2_score(y_test, predictions)
print(f'Test R-squared: {r2}')
print(best_params)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV 1/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50, oob_score=True;, score=0.465 total time=   0.6s
[CV 2/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50, oob_score=True;, score=0.481 total time=   0.8s
[CV 3/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50, oob_score=True;, score=0.491 total time=   0.8s
[CV 1/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True;, score=0.480 total time=   2.7s
[CV 2/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True;, score=0.490 total time=   2.1s
[CV 3/3] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100, oob_score=True;, score=0.490 total time=   2.1s
[CV 1/3] E

In [8]:
importance=np.array(rf_best.feature_importances_)

In [9]:
for a in range(len(columns)):
    print(f'{columns[a]}: {importance[a]:.3f}')

COD_SEXO: 0.044
RBD: 0.020
NOMBRE_UNIDAD_EDUC: 0.010
DEPENDENCIA: 0.021
NOMBRE_COMUNA_EGRESO: 0.008
ANYO_DE_EGRESO: 0.164
PROMEDIO_NOTAS: 0.337
PTJE_NEM: 0.174
PORC_SUP_NOTAS: 0.006
PTJE_RANKING: 0.013
PROM_GRAL2020: 0.002
ASISTENCIA2020: 0.001
PROM_GRAL2021: 0.002
ASISTENCIA2021: 0.002
PROM_GRAL2022: 0.002
ASISTENCIA2022: 0.003
GEN_ALU2023: 0.000
EDAD_ALU: 0.035
COD_COM_ALU2023: 0.002
PROM_GRAL2023: 0.002
ASISTENCIA2023: 0.003
COD_JOR: 0.000
COD_COM_RBD: 0.002
MismaCom: 0.000
PAGO_MATRICULA2020: 0.001
PAGO_MENSUAL2020: 0.001
PAGO_MATRICULA2021: 0.001
PAGO_MENSUAL2021: 0.001
PAGO_MATRICULA2022: 0.001
PAGO_MENSUAL2022: 0.001
PAGO_MATRICULA2023: 0.004
PAGO_MENSUAL2023: 0.137
RURAL: 0.000
