In [4]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import os
import seaborn as sns

## Lectura del Dataset

In [2]:
# Directory (relative to the current working directory) that is listed and read from below.
path = './World Happiness Report/'

In [5]:
# List the entries of the data directory. os.listdir returns them in arbitrary
# order, so they are sorted to keep the printed listing stable across runs.
data_csv = sorted(os.listdir(path))
print(data_csv)

['2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv', 'ds_final.csv', 'full_2015.csv', 'full_2016.csv', 'full_2017.csv', 'full_2018.csv', 'full_2019.csv', 'full_data.csv']


In [7]:
# Read 'ds_final.csv' from the data directory into the working DataFrame.
ds = pd.read_csv(os.path.join(path, 'ds_final.csv'))

In [9]:
# Print the column names, non-null counts, dtypes and memory usage of the DataFrame.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       782 non-null    float64
 1   Country                    782 non-null    object 
 2   Happiness Score            782 non-null    float64
 3   GDP per Capita             782 non-null    float64
 4   Life Expectancy            782 non-null    float64
 5   Freedom                    782 non-null    float64
 6   Perceptions of corruption  782 non-null    float64
 7   Generosity                 782 non-null    float64
 8   Region                     782 non-null    object 
dtypes: float64(7), object(2)
memory usage: 55.1+ KB


In [8]:
# Preview the first rows of the DataFrame.
ds.head()

Unnamed: 0,Year,Country,Happiness Score,GDP per Capita,Life Expectancy,Freedom,Perceptions of corruption,Generosity,Region
0,2015.0,Switzerland,7.587,1.39651,0.94143,0.66557,0.41978,0.29678,Western Europe
1,2015.0,Iceland,7.561,1.30232,0.94784,0.62877,0.14145,0.4363,Western Europe
2,2015.0,Denmark,7.527,1.32548,0.87464,0.64938,0.48357,0.34139,Western Europe
3,2015.0,Norway,7.522,1.459,0.88521,0.66973,0.36503,0.34699,Western Europe
4,2015.0,Canada,7.427,1.32629,0.90563,0.63297,0.32957,0.45811,North America


Para el entrenamiento dejaremos fuera la feature 'Country'. Por otro lado, crearemos 4 modelos:

- Modelo 1:
    - 'Region' con OHE
    - Variables numéricas sin modificar
- Modelo 2:
    - 'Region' con OHE
    - Variables numéricas normalizadas
- Modelo 3:
    - Se quita la feature 'Region'
    - Variables numéricas sin modificar
- Modelo 4:
    - Se quita la feature 'Region'
    - Variables numéricas normalizadas

In [11]:
# 'Country' is left out of training. Rebinding `ds` (instead of inplace=True)
# together with errors='ignore' keeps this cell idempotent: re-running it once
# the column is already gone no longer raises KeyError.
ds = ds.drop(columns=['Country'], errors='ignore')

In [12]:
# Column groups: categorical inputs, numeric inputs and the prediction target.
categoricas = [
    'Region',
]
numericas = [
    'GDP per Capita',
    'Life Expectancy',
    'Freedom',
    'Perceptions of corruption',
    'Generosity',
]
output = [
    'Happiness Score',
]

## Separación en train/test

In [14]:
from sklearn.model_selection import train_test_split

# Model inputs: the categorical column followed by the numeric columns. Deriving
# the list from `categoricas` and `numericas` (same columns, same order as the
# previous hard-coded list) keeps the column groups from drifting apart.
features = categoricas + numericas

# Hold out 30% of the rows for testing; the fixed random_state makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    ds[features],
    ds[output],
    test_size=0.3,
    random_state=42,
)

In [15]:
# Print the shape of each split, one per line, in the order:
# train features, train target, test features, test target.
print(
    *(part.shape for part in (x_train, y_train, x_test, y_test)),
    sep='\n',
)

(547, 6)
(547, 1)
(235, 6)
(235, 1)


## Entrenamiento de modelos

In [18]:
# Entrenamiento de modelos de prueba
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Evaluación de modelos de prueba
from sklearn.metrics import balanced_accuracy_score

def pipeline_ml_models(preprocessing_pipeline):
    """Build the candidate regression pipelines on top of one preprocessing step.

    Parameters
    ----------
    preprocessing_pipeline : transformer
        Preprocessing step placed in front of every regressor (for example a
        ``ColumnTransformer``). It is copied, never fitted or modified here.

    Returns
    -------
    list of (str, sklearn.pipeline.Pipeline)
        ``(name, pipeline)`` pairs in the order "RandomForest", "Adaboost",
        "LinearRegression". Every pipeline has a 'preprocessor' step followed
        by a 'model' step.
    """
    # Imported locally so the function does not depend on a later cell having
    # been executed.
    from sklearn.base import clone

    # Models to try.
    regressors = [
        ("RandomForest", RandomForestRegressor(n_estimators=200, random_state=42)),
        ("Adaboost", AdaBoostRegressor(n_estimators=200, random_state=42)),
        ("LinearRegression", LinearRegression()),
    ]

    # Each pipeline receives its own unfitted copy of the preprocessor. Sharing
    # a single instance would let fitting one pipeline overwrite the fitted
    # preprocessing state the other pipelines rely on when predicting.
    # safe=False makes clone fall back to a deep copy for objects that are not
    # estimators (e.g. the 'passthrough' string).
    models = [
        (
            name,
            Pipeline([
                ('preprocessor', clone(preprocessing_pipeline, safe=False)),
                ('model', regressor),
            ]),
        )
        for name, regressor in regressors
    ]

    return models

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Preprocessing chain: standardize the numeric columns and one-hot encode the
# categorical ones.
# NOTE(review): the modelling plan in this notebook describes "Modelo 1" as
# using unmodified numeric variables, whereas this preprocessor scales them;
# confirm which planned variant `preprocessing_pipeline1` is meant to be.
preprocessing_pipeline1 = ColumnTransformer(
    # Transformations applied to each group of columns.
    transformers=[
        ("numerical", StandardScaler(), numericas),
        # handle_unknown='ignore' avoids an error when a category that was not
        # seen during fit shows up at transform time.
        ('categoric', OneHotEncoder(handle_unknown='ignore'), categoricas),
    ],
    # Columns not named above are forwarded unchanged.
    remainder='passthrough',
)
models1 = pipeline_ml_models(preprocessing_pipeline1)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.base import clone
from sklearn.metrics import roc_auc_score

def kfold_pipeline(models_pipeline, samples, target, x_test, y_test, k=5, metric=None):
    """Cross-validate each regression pipeline and score it on the test set.

    For every ``(name, pipeline)`` pair the training data is split into ``k``
    folds; a fresh clone of the pipeline is fitted on each training part and
    scored on both the training and the validation part. The pipeline itself
    is then fitted on the whole training set and scored on the test set.

    Parameters
    ----------
    models_pipeline : iterable of (str, estimator)
        ``(name, pipeline)`` pairs of the models to evaluate.
    samples : pandas.DataFrame
        Training features (rows are selected with ``.iloc``).
    target : array-like
        Training outputs. A single-column 2-D input is flattened to 1-D.
    x_test : pandas.DataFrame
        Test features.
    y_test : array-like
        Test outputs. A single-column 2-D input is flattened to 1-D.
    k : int, default 5
        Number of folds used by K-Fold.
    metric : callable, optional
        ``metric(y_true, y_pred) -> float``. Defaults to
        ``sklearn.metrics.r2_score``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns "Train" and "Valid" (mean score
        over the folds) and "test" (score on the test set).
    """
    if metric is None:
        # The target is continuous, so a regression metric is required;
        # regressors expose predict(), not predict_proba(), and ROC AUC is not
        # defined for continuous targets.
        from sklearn.metrics import r2_score
        metric = r2_score

    def _as_1d(values):
        """Flatten a single-column 2-D target to 1-D; leave other shapes alone."""
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return arr

    y_all = _as_1d(target)
    y_holdout = _as_1d(y_test)
    folds = KFold(n_splits=k)

    results = {}
    for model in models_pipeline:
        model_name, estimator = model[0], model[1]

        train_scores = []
        valid_scores = []
        for train_index, valid_index in folds.split(samples):
            x_fold_train = samples.iloc[train_index]
            y_fold_train = y_all[train_index]
            x_fold_valid = samples.iloc[valid_index]
            y_fold_valid = y_all[valid_index]

            # Fit a clone so that folds never share fitted state with each
            # other or with the caller's estimator.
            fold_model = clone(estimator)
            fold_model.fit(x_fold_train, y_fold_train)

            train_scores.append(metric(y_fold_train, fold_model.predict(x_fold_train)))
            valid_scores.append(metric(y_fold_valid, fold_model.predict(x_fold_valid)))

        # Score on the test set with a model trained on all the training data
        # rather than with whichever fold happened to be fitted last. This also
        # leaves the caller's pipeline fitted, as before.
        estimator.fit(samples, y_all)

        results[model_name] = {
            "Train": np.mean(train_scores),
            "Valid": np.mean(valid_scores),
            "test": metric(y_holdout, estimator.predict(x_test)),
        }

    models_results = pd.DataFrame(results).T
    return models_results