## ETAPA 4: MODELOS DE MACHINE LEARNING

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import r2_score

#!pip install xgboost
from xgboost import XGBRegressor

### *Preparar Datos*

In [5]:
# Load the merged immigration dataset exported by the preprocessing stage
df = pd.read_csv(filepath_or_buffer = "../13 - Exports (preprocesamiento)/inmigrantes_merge.csv")

# Print the column / dtype / non-null summary, then render the frame as the cell output
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9360 entries, 0 to 9359
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             9360 non-null   int64  
 1   Nationality code                 9360 non-null   object 
 2   Sex                              9360 non-null   object 
 3   Age group                        9360 non-null   object 
 4   Immigrant count                  9360 non-null   int64  
 5   Liberal democracy index          9360 non-null   float64
 6   Continent                        9360 non-null   object 
 7   Sub-region                       9360 non-null   object 
 8   Health equality                  9360 non-null   float64
 9   Judicial accountability          9360 non-null   float64
 10  Public sector corrupt exchanges  9360 non-null   float64
 11  One-sided violence_deaths        9360 non-null   int64  
 12  Non-state_deaths    

Unnamed: 0,Year,Nationality code,Sex,Age group,Immigrant count,Liberal democracy index,Continent,Sub-region,Health equality,Judicial accountability,...,Non-state_deaths,Intrastate_deaths,Interstate_deaths,Number of residents,Political regime,Homicide Rate,Number of Turist,Spanish language,Restricciones_pandemia,Año post_pandemia
0,2008,DZA,Both,0 - 14,759,0.164,Africa,Africa,0.61,0.39,...,0,345,0,51922,3,0.95,44400000,0,0,0
1,2008,PER,Males,35 - 44,2938,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
2,2008,PER,Males,45 - 54,1128,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
3,2008,PER,Males,55 - 64,265,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
4,2008,PER,Males,65+,156,0.649,America,South America,0.40,0.44,...,0,40,0,60185,7,5.27,44400000,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,2022,PAK,Males,55 - 64,330,0.234,Asia,Asia,0.27,0.22,...,0,670,0,68821,6,4.21,59310000,0,0,1
9356,2022,PAK,Females,55 - 64,146,0.234,Asia,Asia,0.27,0.22,...,0,670,0,31675,6,4.21,59310000,0,0,1
9357,2022,PAK,Both,65+,169,0.234,Asia,Asia,0.27,0.22,...,0,670,0,100496,6,4.21,59310000,0,0,1
9358,2022,PAK,Males,65+,99,0.234,Asia,Asia,0.27,0.22,...,0,670,0,68821,6,4.21,59310000,0,0,1


Antes de proceder, debemos convertir la variable "Year" en una variable ordinal (los regímenes políticos ya están en formato ordinal) y el resto de las variables categóricas en variables dummy.

En el caso de "Year", simplemente restaremos 2007 a toda la columna (de modo que 2008 pase a ser 1 y 2022 pase a ser 15); para el resto de las variables de tipo "object" usaremos la función *pd.get_dummies()*.

In [8]:
# Build the modelling frame from a fresh copy of df (df itself is left untouched):
#   1. shift Year so it becomes an ordinal running from 1 (2008) to 15 (2022)
#   2. one-hot encode the remaining categorical ("object") columns
df_copy = pd.get_dummies(df.assign(Year = df['Year'] - 2007))

# Cast every boolean dummy column to int
col_bool = df_copy.select_dtypes(include = 'bool').columns
df_copy = df_copy.astype({col: int for col in col_bool})

# Check the result: dtype summary first, then the frame as the cell output
df_copy.info()
df_copy

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9360 entries, 0 to 9359
Data columns (total 65 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Year                                      9360 non-null   int64  
 1   Immigrant count                           9360 non-null   int64  
 2   Liberal democracy index                   9360 non-null   float64
 3   Health equality                           9360 non-null   float64
 4   Judicial accountability                   9360 non-null   float64
 5   Public sector corrupt exchanges           9360 non-null   float64
 6   One-sided violence_deaths                 9360 non-null   int64  
 7   Non-state_deaths                          9360 non-null   int64  
 8   Intrastate_deaths                         9360 non-null   int64  
 9   Interstate_deaths                         9360 non-null   int64  
 10  Number of residents                 

Unnamed: 0,Year,Immigrant count,Liberal democracy index,Health equality,Judicial accountability,Public sector corrupt exchanges,One-sided violence_deaths,Non-state_deaths,Intrastate_deaths,Interstate_deaths,...,Continent_America,Continent_Asia,Continent_Europe,Sub-region_Africa,Sub-region_Asia,Sub-region_Central America and Caribbean,Sub-region_European Union,Sub-region_North America,Sub-region_Rest of Europe,Sub-region_South America
0,1,759,0.164,0.61,0.39,0.35,0,0,345,0,...,0,0,0,1,0,0,0,0,0,0
1,1,2938,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
2,1,1128,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
3,1,265,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
4,1,156,0.649,0.40,0.44,0.30,0,0,40,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9355,15,330,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9356,15,146,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9357,15,169,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0
9358,15,99,0.234,0.27,0.22,0.23,68,0,670,0,...,0,1,0,0,1,0,0,0,0,0


### *Separar conjunto de entrenamiento y prueba*

In [None]:
# Split the encoded frame into the target ("Immigrant count") and the predictors
y = df_copy["Immigrant count"]  # target
X = df_copy.drop(columns = "Immigrant count")  # predictor variables (every other column)

In [None]:
# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 58)

# Fit the scaler on the training split only, so its statistics never see test rows
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)

# Apply the training-fitted scaling to the test split (transform only, no refit):
# refitting on X_test would put train and test on different scales and leak
# test-set statistics into the preprocessing step
X_test = scaler.transform(X_test)

### *Modelos de Machine Learning*

#### Regresión Lineal