# Modelado: Random Forest


### Carga de librerías

In [1]:
import pandas as pd
import numpy as np
# Import mlflow for models tracking
import mlflow
# Libraries for models and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


### Carga de ficheros

In [2]:
# Read the final training table from disk and preview its first five rows.
# NOTE(review): the rendered preview lists an 'Unnamed: 0' column. If that is a
# row index saved into the CSV (and not just a display artifact), it will be
# treated as a feature downstream; confirm and consider index_col=0.
df = pd.read_csv("../../data/final/df_train_final.csv")
df.head()

Unnamed: 0,TARGET,DAYS_BIRTH,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,QTY_APP_LAST_YEAR,RATIO_NONAPPROVED
0,1,-9461,0.139376,0.083037,0.262949,1.0,0.0
1,0,-16765,0.507626,0.311267,0.622246,0.0,0.0
2,0,-13778,0.49206,0.774761,0.724,1.0,0.0
3,0,-10197,0.363945,0.31976,0.651862,0.0,0.0
4,0,-13439,0.176653,0.464831,0.715042,4.0,0.0


In [3]:
# Read the wider training table (the "_all" variant of the final dataset) and
# preview its first five rows.
# NOTE(review): the rendered preview also lists an 'Unnamed: 0' column; confirm
# whether it is a saved row index before using this frame for modelling.
df_all = pd.read_csv("../../data/final/df_train_final_all.csv")
df_all.head()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637.0,-3648.0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188.0,-1186.0,...,0,1,0,0,0,0,0,0,1,0
2,0,1,171000.0,1560726.0,41301.0,1395000.0,0.035792,-13778,-3130.0,-1213.0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,112500.0,652500.0,21177.0,652500.0,0.0228,-10197,-679.0,-4427.0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,67500.0,80865.0,5881.5,67500.0,0.031329,-13439,-2717.0,-311.0,...,0,0,0,0,0,0,0,0,1,0


### Dataset final

#### Preparación de los datasets de train y test

In [4]:
# Separate the TARGET label (dependent variable) from the predictor columns
X = df.drop('TARGET', axis=1)
y = df['TARGET']

# Divide the dataset into train (70%) and test (30%); the fixed seed keeps the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Set up experiment tracking in MLflow.
# mlflow.create_experiment raises when an experiment with this name already
# exists, which would break every re-run of this cell (e.g. Restart & Run All),
# so reuse the existing experiment and only create it the first time.
exp_name = 'Model RandomForest'
existing_experiment = mlflow.get_experiment_by_name(exp_name)
if existing_experiment is not None:
    exp_id = existing_experiment.experiment_id
else:
    exp_id = mlflow.create_experiment(name=exp_name)

#### Prueba Nro. 1

Usamos GridSearchCV (validación cruzada con ROC AUC como métrica) para encontrar los mejores hiperparámetros (`n_estimators` y `max_depth`) de Random Forest.

In [5]:
# Candidate hyper-parameters: every (n_estimators, max_depth) combination is tried
param_grid = dict(
    n_estimators=[10, 20, 50, 100, 150, 200],
    max_depth=[None, 5, 10, 15, 20],
)

# Base estimator; the fixed seed makes each fitted forest reproducible
model = RandomForestClassifier(random_state=42)

# Cross-validated grid search ranked by ROC AUC, parallelised over all cores
# (cv is left at the library default)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best configuration found by the search
best_model = grid_search.best_estimator_

# Report the winning hyper-parameters and their mean cross-validated AUC
print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best max_depth:", grid_search.best_params_['max_depth'])
print(f"Best AUC: {grid_search.best_score_:.4f}")

# Score the held-out test split using the predicted probability of class 1
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f"AUC in test dataset: {auc_score:.4f}")

Parameters: { "use_label_encoder" } are not used.



AUC: 0.5074
Accuracy: 92.58%
Precision: 0.4500
Recall: 0.0164
F1-Score: 0.0317
Confusion Matrix:
[[54739    88]
 [ 4306    72]]
