# **Final Project on Introduction to Machine Learning**
## Andrés Peña Montalvo
## 23-05-2025

In [None]:
# Install (or upgrade) PyCaret straight from the master branch of its GitHub repository.
# NOTE(review): a bare `pip install` line only runs inside IPython/Jupyter — confirm the notebook environment.
pip install git+https://github.com/pycaret/pycaret.git@master --upgrade

In [None]:
# Check which PyCaret version was installed; as the last expression of the
# cell, the version string is displayed as the cell output.
import pycaret
pycaret.__version__

In [None]:
# Import PyCaret's regression API. The star import is relied upon below:
# setup, compare_models, plot_model, tune_model, etc. are called unqualified.
from pycaret.regression import *

In [None]:
# Load the data table hosted on GitHub and preview its first rows
import pandas as pd

github_url = (
    "https://raw.githubusercontent.com/APDataSc/mli_colmex_2025/"
    "refs/heads/main/cpv_geo_2022.csv"
)
df = pd.read_csv(github_url)
print(df.head())

In [None]:
# Dimensions of the loaded table, printed as (rows, columns)
print(df.shape)

In [None]:
# Preprocessing: recode the hazard level, build the feature matrix and
# target, and hold out a test split.
from sklearn.model_selection import train_test_split

# Reverse the hazard-level coding: 1→3, 2→2, 3→1.
# NOTE: this overwrites the column in place, so re-running the cell flips
# the coding back again.
df['nivel_peligrosidad'] = 4 - df['nivel_peligrosidad']

# Columns excluded from the feature matrix: the target itself plus the
# identifier and count columns listed here.
dropped_columns = [
    "tot_person", 'id_prov', 'tot_viv_vopp', 'id_can', 'id_parr', 'id_sector', 'tot_viv',
    'nivel_urbanizacion', 'num_cuartos', 'num_hogares', 'tot_viv_colectivas', 'id_zona',
    'tot_otra_particular', 'tot_viv_otra', 'tot_viv_col',
]
X = df.drop(columns=dropped_columns)
y = df["tot_person"]

# Split the data into training (80%) and test (20%) sets. The seed is fixed
# so the split is identical on every run; without it the rows sent to the
# test set would change each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)

In [None]:
import pandas as pd

# Glue the training features and the training target back into one table.
# Both pieces are re-indexed from 0 first so the rows line up positionally.
train_features = X_train.reset_index(drop=True)
train_target = y_train.reset_index(drop=True)
df_train = pd.concat([train_features, train_target], axis=1)

# Declaring categorical columns is currently disabled:
# categorical_cols = ['id_zona']

In [None]:
# Display the first five rows of the training table
df_train.head(n=5)

In [None]:
# Initialise the PyCaret regression experiment on the training table
s = setup(
    data=df_train,
    target="tot_person",  # numerical target variable
    train_size=0.8,       # 80% of df_train for training, 20% for validation
    normalize=True,       # optional: scales the numeric features
    session_id=123,       # fixed seed for reproducibility
    # categorical_features=categorical_cols,
)

In [None]:
# Compare the baseline models (leaving out 'rf' and 'et'), ranked by RMSE,
# and keep the result as best_model
best_model = compare_models(exclude=["rf", "et"], sort="RMSE")

In [None]:
# Importance of the covariates
plot_model(estimator=best_model, plot="feature_all")

In [None]:
# SHAP plot for the best model
interpret_model(estimator=best_model)

In [None]:
# Residuals plot
plot_model(estimator=best_model, plot="residuals")

In [None]:
# Prediction-error plot
plot_model(estimator=best_model, plot="error")

In [None]:
# Cook's distance, used to spot outliers
plot_model(estimator=best_model, plot="cooks")

In [None]:
# Number of selected covariates (RFE plot)
plot_model(estimator=best_model, plot="rfe")

In [None]:
# Hyperparameter tuning of the best model: 10 folds, optimising RMSE
tuned_model = tune_model(best_model, fold=10, optimize="RMSE")

In [None]:
# Finalize the tuned model so it is ready to be saved
final_model = finalize_model(estimator=tuned_model)

In [None]:
# Persist the finalized model under the name 'my_regression_model'
save_model(model=final_model, model_name="my_regression_model")

In [None]:
# Prediction error of the tuned model.
# The plot uses tuned_model rather than final_model: finalize_model refits
# on the whole dataset, hold-out included, so an error plot of final_model
# would be scored on rows it was trained on and look better than it is.
plot_model(tuned_model, plot='error')