<a href="https://colab.research.google.com/github/EAFIT-BI/Supervised-Learning-2025-I/blob/main/Modelos_de_ensamble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelos de ensamble

Los modelos de ensamble combinan varios modelos para mejorar el desempeño que cada uno obtendría de forma individual. Dentro de los modelos generales de ensamble encontramos:

- Bagging (bootstrap)
- Bosques aleatorios (Random Forest)
- Boosting

In [None]:
import pandas as pd
# Read the raw file: whitespace-delimited, no header row.
# The separator is a raw string so that `\s` reaches pandas as a regex token
# instead of being parsed as an (invalid) Python string escape.
datos = pd.read_csv('auto-mpg.data-original',
                    sep = r'\s+', header = None)
# Discard every row that contains a missing value
datos = datos.dropna()
# Assign the column names
datos.columns = ['mpg', 'cylinders', 'displacement',
                'horsepower', 'weight', 'acceleration',
                'model year', 'origin', 'car name']
# Replace the default integer index with the car name.
# Plain assignment (no inplace=True) keeps the cell safe to reason about on re-runs.
datos = datos.set_index('car name')
# Show the first rows of the modified frame
datos.head()

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
car name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
buick skylark 320,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
plymouth satellite,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
amc rebel sst,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
ford torino,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


# Clasificador lineal

En este caso usaremos una regresión logística como clasificador para predecir el origen (`origin`) de cada vehículo.

In [None]:
# Split the predictors from the target variable
X = datos.drop(columns = 'origin')
# Single brackets select the target as a 1-D Series. A one-column DataFrame
# (double brackets) makes scikit-learn emit a DataConversionWarning on every fit.
y = datos['origin']

In [None]:
# Particionamos en train y test
from sklearn.model_selection import train_test_split

# 75 % of the rows go to train. The fixed random_state makes the partition
# repeatable, and stratify = y asks for the class proportions of y to be
# preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.75, random_state = 27, stratify = y)

In [None]:
# Preprocesamos los datos
from sklearn.preprocessing import StandardScaler
# Instantiate the scaler
escalador = StandardScaler()
# Learn the scaling statistics from the training partition only and keep the
# scaled matrix in its own variable so it can be fed to a model.
# NOTE(review): cells that fit on X_train still receive the unscaled
# features; pass X_train_esc there if scaling is intended.
X_train_esc = escalador.fit_transform(X_train)
# Display the scaled training matrix
X_train_esc

array([[-1.66176538,  1.51161332,  1.47955054, ...,  0.84461771,
        -1.64579885, -0.84069528],
       [ 1.03186594, -0.83049111, -0.97505908, ..., -1.13578084,
        -0.22716033,  0.51884494],
       [-0.21640224,  0.3405611 ,  0.05004609, ..., -0.13848336,
        -0.00890825, -1.65641942],
       ...,
       [-0.45291621,  1.51161332,  1.02812809, ...,  0.73341254,
        -0.99104261,  0.51884494],
       [-1.53036873,  1.51161332,  2.22251667, ...,  2.36836523,
        -1.46392212, -0.84069528],
       [-0.74198884,  0.3405611 ,  0.61432416, ...,  0.014128  ,
        -0.73641518, -1.38451137]])

In [None]:
# Scale the test partition with the statistics already learned from X_train.
# Only transform() is used: refitting on X_test would overwrite the training
# statistics stored in `escalador` and leak test information into preprocessing.
X_test_esc = escalador.transform(X_test)
# Display the scaled test matrix
X_test_esc

array([[-0.22216825, -0.97150069, -0.8068807 , -0.50307542, -0.0952579 ,
         1.40919494, -0.99466053],
       [-1.18661956,  1.40867601,  1.53433505,  1.37784815,  1.52908454,
        -0.93455748, -0.72161646],
       [ 1.15217487, -0.97150069, -0.93863743, -0.90045364, -0.90390815,
         0.5077517 ,  1.189692  ],
       [-1.18661956,  1.40867601,  2.03095658,  2.22558835,  1.59833035,
        -1.11484612, -0.99466053],
       [-0.46328108, -1.56654487, -1.31363736, -0.23815661, -0.85696183,
        -0.75426883, -0.99466053],
       [ 0.44089203, -0.97150069, -0.60417804, -0.90045364, -0.58115224,
        -0.7182111 ,  0.09751574],
       [ 0.98339589, -0.97150069, -1.22242116, -1.03291304, -1.24426892,
         0.14717441, -0.44857239],
       [-0.63206006,  0.21858766,  0.25730828, -0.55605918,  0.4751398 ,
         0.36352078,  1.46273607],
       [ 0.01894458, -0.97150069, -0.8068807 , -0.23815661, -0.65039806,
        -0.39369154, -0.99466053],
       [-1.00578494,  1.4086

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
# Two hyper-parameters of the logistic regression are tuned here:
# C and the penalty type.
# The liblinear solver supports both the l1 and l2 penalties searched below.
modelo = LogisticRegression(max_iter = 100,
                            solver = 'liblinear',
                            class_weight = 'balanced')

# Search space: C is sampled log-uniformly between 0.001 and 10,
# the penalty is picked from l1 / l2.
distribucion = {'C': loguniform(0.001, 10),
                'penalty': ['l1', 'l2']}
# Randomized search scored with weighted F1 and cv = 5.
# random_state pins the sampled candidates so the reported best score and
# parameters are reproducible; 27 is the seed already used for the split.
grid_search = RandomizedSearchCV(estimator = modelo,
                                 param_distributions = distribucion,
                                 scoring = 'f1_weighted',
                                 cv = 5,
                                 random_state = 27)

In [None]:
# Run the randomized search on the training partition
grid_search.fit(X_train, y_train)

# Report the best cross-validated score together with the hyper-parameters
# that achieved it. The message is split with implicit string concatenation so
# each literal closes on its own line; the printed text is a single line.
print('Score y parámetros: '
      f'{(grid_search.best_score_, grid_search.best_params_)}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Score y parámetros: (0.7781106474744663, {'C': 2.942103926073016, 'penalty': 'l2'})
