<a href="https://colab.research.google.com/github/DahianaRH/Project_ModYSim/blob/main/04_modelos_con_codificaci%C3%B3n_ordinal_y_%C3%A1rboles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Árboles y Codificación Ordinal
**Introducción**

Este notebook implementa una estrategia distinta basada en:

- Codificación ordinal para variables categóricas.
- Imputación simple para ambos tipos de variables.
- Modelos basados en árboles (Decision Tree y Random Forest liviano).
- Sin one-hot encoding ni escalado.

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

from google.colab import drive
from google.colab import files

## Carga de datos

In [11]:
# Mount Google Drive so the CSV files stored there can be read from the Colab runtime.
drive.mount('/content/drive')

# Read the training table first and the test table second, in a single statement.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/Project_ModYSim_2/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/Project_ModYSim_2/test.csv',
    )
)

# Last expression of the cell: preview the first rows of the training data.
train.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


## Limpieza

In [12]:
# Rows without a target label cannot be used for supervised training.
train = train.dropna(subset=['RENDIMIENTO_GLOBAL'])

# Keep the test identifiers aside; they are needed later to build the submission.
test_ids = test['ID']

# 'ID' is excluded from the model inputs in both frames.
# errors='ignore' keeps this cell re-runnable: `train` is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
train = train.drop(columns=['ID'], errors='ignore')
test_clean = test.drop(columns=['ID'])

# Separate the feature matrix from the target vector.
X = train.drop(columns=['RENDIMIENTO_GLOBAL'])
y = train['RENDIMIENTO_GLOBAL']

## División train-val

In [13]:
# Hold out 20% of the labelled rows for validation. Stratifying on `y` keeps the
# class proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

## Columnas y preprocesamiento

In [14]:
# Split the feature columns by dtype: object columns are handled as categorical,
# every other column as numeric.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

# Column-wise preprocessing (no one-hot encoding, no scaling):
#   - numeric columns: missing values replaced by the column median;
#   - categorical columns: missing values replaced by the most frequent value,
#     then each category mapped to an integer code. Categories not seen during
#     fit are encoded as -1 instead of raising at predict time.
preprocess = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), num_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), cat_cols)
    ]
)

## Definición de dos modelos

In [15]:
# Single decision tree, with depth and minimum split size capped to limit overfitting.
model_tree = DecisionTreeClassifier(
    max_depth=12,
    min_samples_split=20,
    random_state=42
)

# Random forest with a bounded tree depth; the fixed seed makes training repeatable.
model_rf_small = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    random_state=42
)

# Each pipeline gets its OWN unfitted copy of the preprocessing step. A Pipeline
# fits its steps in place by default, so sharing one `preprocess` instance would
# make both pipelines point at the same fitted transformer: refitting one of
# them would silently change the other.
pipeline_tree = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', model_tree)
])

pipeline_rf = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', model_rf_small)
])

## Entrenamiento

In [16]:
# Fit both pipelines on the training split only; the validation split is not
# used here, so it can be used to evaluate the fitted models afterwards.
pipeline_tree.fit(X_train, y_train)
pipeline_rf.fit(X_train, y_train)

## Evaluación

In [18]:
# Fitted pipelines to compare, keyed by the label printed in the report.
fitted_models = {
    "Árbol de decisión": pipeline_tree,
    "Random Forest pequeño": pipeline_rf,
}

# For every model, report accuracy, weighted F1 and the per-class breakdown
# computed on the validation split.
for label, fitted_pipeline in fitted_models.items():
    print("\nEvaluando:", label)
    preds = fitted_pipeline.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    weighted_f1 = f1_score(y_val, preds, average='weighted')
    print("Accuracy:", accuracy)
    print("F1 weighted:", weighted_f1)
    print(classification_report(y_val, preds))


Evaluando: Árbol de decisión
Accuracy: 0.4004043321299639
F1 weighted: 0.39211699141490897
              precision    recall  f1-score   support

        alto       0.51      0.58      0.54     35124
        bajo       0.42      0.50      0.46     34597
  medio-alto       0.31      0.25      0.28     34324
  medio-bajo       0.31      0.27      0.29     34455

    accuracy                           0.40    138500
   macro avg       0.39      0.40      0.39    138500
weighted avg       0.39      0.40      0.39    138500


Evaluando: Random Forest pequeño
Accuracy: 0.41011552346570396
F1 weighted: 0.39502674331600973
              precision    recall  f1-score   support

        alto       0.50      0.61      0.55     35124
        bajo       0.43      0.54      0.48     34597
  medio-alto       0.31      0.22      0.26     34324
  medio-bajo       0.32      0.25      0.28     34455

    accuracy                           0.41    138500
   macro avg       0.39      0.41      0.39    138

## Selección del mejor modelo

In [19]:
# Pick the model from the measured validation accuracy instead of hard-coding it.
# The random forest is listed first so that an exact tie resolves to it (max()
# returns the first maximal key), which matches the previous hard-coded choice.
candidates = {
    "random_forest": pipeline_rf,
    "decision_tree": pipeline_tree,
}
val_scores = {
    label: accuracy_score(y_val, pipe.predict(X_val))
    for label, pipe in candidates.items()
}
best_model = candidates[max(val_scores, key=val_scores.get)]

## Predicción test

In [20]:
# Predict the label of every test row with the selected pipeline.
test_preds = best_model.predict(test_clean)

# Pair each saved test identifier with its predicted label.
submission_columns = {
    "ID": test_ids,
    "RENDIMIENTO_GLOBAL": test_preds,
}
submission = pd.DataFrame(submission_columns)

# Write the submission without the DataFrame index column.
submission.to_csv("submission_trees.csv", index=False)

# Last expression of the cell: preview the first rows of the submission.
submission.head()

Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,alto
1,98545,medio-bajo
2,499179,alto
3,782980,bajo
4,785185,bajo


## Descarga

In [21]:
# Trigger a browser download of the submission file written by the prediction
# cell. `files` comes from `google.colab`, imported in the top import cell.
files.download("submission_trees.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>