> Fundação Getúlio Vargas - RJ <br>
> Escola de Matemática Aplicada (EMAp) <br>
> Graduação em Ciência de Dados e Inteligência Artificial <br>
> Alunos: Gianlucca Devigili e Maisa O. Fraiz <br>
# Projetos em Ciência de Dados - A1

## Instruções de execução

__(#1)__ De modo a tornar mais rápida a carga dos dados, o projeto utiliza uma cópia dos arquivos em formato pickle (`.pkl`). Para executar o projeto utilizando o o dataset em formato `.csv`, atribua o valor `True` para a variável `V_load_from_csv`

**(#2)** Redefina a variável `raw_data_path` para o caminho até o arquivo `train_updated.csv`. Substitua o nome do arquivo caso necessário.

**(#3)** Atribua o valor `True` para a variável `save_files` caso ainda não tenha os arquivos `.pkl` dos datasets auxiliares salvos. (Necessário apenas para a primeira execução da sessão de preparação de dados).

**(#4)** Atribua o valor `True` para a variável `prepare_data` caso deseje executar a sessão de preparação de dados. (Necessário apenas para a primeira execução da sessão de preparação de dados). Caso contrário as variáveis serão carregadas a partir dos arquivos `.pkl`.

In [1]:
# Variáveis de configuração
# (#1) Variável que define se o dataset será carregado de um .csv ou de um .pkl
load_from_csv = False

# Caminho para o dataset
raw_data_path = '../data/raw-data/'
dataset_path = raw_data_path + 'train_updated.csv'

# Caminho onde serão salvos os dados processados
processed_data_path = '../data/processed-data/'

# Prepare data
# (#4) Variável que define se o dataset será processado ou se será carregado de um .pkl
prepare_data = False

save_files = False

## 0. Setup Inicial

In [2]:
# Imports

# Data manipulation
import pandas as pd
import numpy as np

import pickle as pkl

from joblib import Parallel, delayed


# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# variáveis globais

PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'

TARGET_COLS = ['target1', 'target2', 'target3', 'target4']

RANDOM_SEED = 42

TEST_SPLIT_DATE = '2021-04-30'

## 1. Preparação dos Dados

### Funções Auxiliares

Algumas funções utilizadas para a preparação dos dados

In [4]:
# Funções auxiliares para carregar os dados
def unpack_json(json_str):
    return pd.DataFrame() if pd.isna(json_str) else pd.read_json(json_str)

def unpack_data(data, dfs=None, n_jobs=-1):
    if dfs is not None:
        data = data.loc[:, dfs]
    unnested_dfs = {}
    for name, column in data.iteritems():
        daily_dfs = Parallel(n_jobs=n_jobs)(
            delayed(unpack_json)(item) for date, item in column.iteritems())
        df = pd.concat(daily_dfs)
        unnested_dfs[name] = df
    return unnested_dfs

def create_id(df, id_cols, id_col_name, dt_col_name = 'Dt'):
    df['Id' + dt_col_name + id_col_name] = df[id_cols].apply(lambda x: '_'.join(x.astype(str)), axis=1)
    return df

### Carregando os dados

In [6]:
%%time
# (#1)
if load_from_csv:
    df_train = pd.read_csv(dataset_path)
else: 
    dataset_path = raw_data_path + 'train.pkl'
    df_train = pd.read_pickle(dataset_path)

Wall time: 1min 22s


#### Cria o Dataframe Targets

Dataframe contendo as 4 variáveis _target_ bem como suas respectivas chaves `IdPlayer` e `Dt`.

In [7]:
%%time 
if prepare_data:
    # criação do dataset de targets
    # unpack the data
    Y = unpack_data(df_train, dfs = ['nextDayPlayerEngagement'])['nextDayPlayerEngagement']

    # change datatypes
    Y = Y.astype({name: np.float32 for name in ["target1", "target2", "target3", "target4"]})

    # match target dates to feature dates and create date index
    Y = Y.rename(columns={'engagementMetricsDate': 'date'})

    # change datatypes
    Y['date'] = pd.to_datetime(Y['date'])

    # reset index
    Y = Y.set_index('date').to_period('D')
    Y.index = Y.index - 1
    Y = Y.reset_index()

    # rename and select columns
    cols_Y = {
        'date': 'Dt',
        'playerId': 'IdPlayer',
        'target1': 'target1',
        'target2': 'target2',
        'target3': 'target3',
        'target4': 'target4'
    }
    Y = Y[list(cols_Y)]
    Y.columns = list(cols_Y.values())
    Y['Dt'] = Y['Dt'].astype('datetime64[ns]')
    Y = create_id(Y, ['Dt', 'IdPlayer'], 'Player')

    if save_files:
        pd.to_pickle(Y, processed_data_path + 'targets.pkl')

    del Y

Wall time: 0 ns
Compiler : 126 ms
Parser   : 659 ms


#### Cria o Dataframe Player Box Scores

In [8]:
if prepare_data:
    # load the data
    df_playerBoxScores = unpack_data(df_train, dfs = ['playerBoxScores'])['playerBoxScores']

    # Cria o dataset de jogos
    cols = {
        # columns related to other dimensions
        'gamePk': 'IdGame',
        'gameDate': 'DtGame',
        'gameTimeUTC': 'DtGameUTC',
        'playerId': 'IdPlayer',
        'teamId': 'IdTeam',
        'jerseyNum': 'NuJersey',
        'positionCode': 'CdPosition',
        # suggested column
        'strikeOutsPitching': 'NuStrikeOutsPitching',
    }  
    # numeric columns
    for numeric_col in list(df_playerBoxScores.columns[12:]):
        # skip the columns that contains data about pitching due the amount of Nan values
        if 'Pitching' not in numeric_col:
            cols[numeric_col] = 'Nu' + numeric_col[0].upper() + numeric_col[1:]

    df_playerBoxScores['gameDate'] = df_playerBoxScores['gameDate'] + " 00:00:00"

    df_playerBoxScores = df_playerBoxScores[list(cols)]
    df_playerBoxScores.columns = list(cols.values())
    df_playerBoxScores = create_id(df_playerBoxScores, ['DtGame', 'IdPlayer'], 'Player')

    # Salva o dataset
    if save_files:
        pd.to_pickle(df_playerBoxScores, processed_data_path + 'playerBoxScores.pkl')
    del df_playerBoxScores

## Preparação dos dados target

In [9]:
df = pd.read_pickle(processed_data_path + 'targets.pkl')

### Funções Auxiliares

In [10]:
# Funções auxiliares para o pré-processamento dos dados
def sort_df(df: pd.DataFrame, columns: list = ['IdPlayer', 'Dt']) -> None:
    """Sort the dataframe by the columns passed as argument.
    
    Args:
        df (pd.DataFrame): Dataframe to be sorted.
        columns (list, optional): Columns to sort the dataframe. Defaults to ['IdPlayer', 'Dt'].
        
        Returns:
            None
    """
    df.sort_values(by=columns, inplace=True)
    # reset index
    df.reset_index(drop=True, inplace=True)


def shift_targets(df, shift_vals: list = [1, 2, 3, 4, 5, 6, 7, 14, 30]):
    """Shift the targets by the values passed as argument.

    Args:
        df (pd.DataFrame): Dataframe to be shifted.
        shift_vals (list, optional): Values to shift the targets. Defaults to [1, 2, 3, 4, 5, 6, 7, 14, 30].

    Returns:
        pd.DataFrame: Dataframe with the shifted targets.
    """
    df_aux = pd.DataFrame()
    # Iterate over players to make the shift only using the player data
    for player in df['IdPlayer'].unique():
        df_player = df[df['IdPlayer'] == player]
        # Iterate over the pre-defined shift values
        for shift_val in shift_vals:
            # Iterate over the targets
            for target in TARGET_COLS:
                # Make the shift
                df_player[f'{target}_shift_{shift_val}'] = df_player[target].shift(shift_val)
        # Concatenate the player data with the rest of the data
        df_aux = pd.concat([df_aux, df_player], axis=0)
        # Remove the player data from memory
        del df_player
    # df.dropna(inplace=True)
    return df_aux


def train_test_split(
    df: pd.DataFrame
    ,test_split_date: str = TEST_SPLIT_DATE
    ):
    """Split the dataframe into train and test sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
        test_split_date (str, optional): Date to split the dataframe. Defaults to TEST_SPLIT_DATE.
    """

    train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
    val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
    test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]
    # train.to_csv('train.csv', index=None)
    # val.to_csv('validation.csv', index=None) 
    # test.to_csv('test.csv', index=None) 

    return train, test, val


def x_y_split(df: pd.DataFrame, target_cols: list = TARGET_COLS):
    """Split the dataframe into x and y sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
    """
    y = df[target_cols]
    x = df.drop(target_cols, axis=1)
    return x, y

### Ordenação dos valores

Os dados serão ordenados por jogador e então por data, para facilitar a criação das variáveis de _shift_.

In [11]:
sort_df(df)
df.head()

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,IdDtPlayer
0,2018-01-01,112526,0.055277,5.496109,0.025839,16.17647,2018-01-01 00:00:00_112526
1,2018-01-02,112526,0.060625,3.252914,0.030486,8.541353,2018-01-02 00:00:00_112526
2,2018-01-03,112526,0.029341,1.648352,0.032613,10.490111,2018-01-03 00:00:00_112526
3,2018-01-04,112526,0.014799,2.665894,0.087422,19.091467,2018-01-04 00:00:00_112526
4,2018-01-05,112526,0.083916,1.161002,0.024759,6.643879,2018-01-05 00:00:00_112526


### Shifts

In [12]:
df = shift_targets(df, shift_vals=[1, 2, 3, 4, 5, 6, 7])
df.head()

Unnamed: 0,Dt,IdPlayer,target1,target2,target3,target4,IdDtPlayer,target1_shift_1,target2_shift_1,target3_shift_1,...,target3_shift_5,target4_shift_5,target1_shift_6,target2_shift_6,target3_shift_6,target4_shift_6,target1_shift_7,target2_shift_7,target3_shift_7,target4_shift_7
0,2018-01-01,112526,0.055277,5.496109,0.025839,16.17647,2018-01-01 00:00:00_112526,,,,...,,,,,,,,,,
1,2018-01-02,112526,0.060625,3.252914,0.030486,8.541353,2018-01-02 00:00:00_112526,0.055277,5.496109,0.025839,...,,,,,,,,,,
2,2018-01-03,112526,0.029341,1.648352,0.032613,10.490111,2018-01-03 00:00:00_112526,0.060625,3.252914,0.030486,...,,,,,,,,,,
3,2018-01-04,112526,0.014799,2.665894,0.087422,19.091467,2018-01-04 00:00:00_112526,0.029341,1.648352,0.032613,...,,,,,,,,,,
4,2018-01-05,112526,0.083916,1.161002,0.024759,6.643879,2018-01-05 00:00:00_112526,0.014799,2.665894,0.087422,...,,,,,,,,,,


### Cria dados com playerBoxScores

In [13]:
df_playerBoxScores = pd.read_pickle(processed_data_path + 'playerBoxScores.pkl')

In [19]:
df_join = pd.merge(df, df_playerBoxScores, on=['IdDtPlayer'], how='left')
df_join.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698457 entries, 0 to 2698456
Data columns (total 88 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   Dt                        2698457 non-null  datetime64[ns]
 1   IdPlayer_x                2698457 non-null  int64         
 2   target1                   2698457 non-null  float32       
 3   target2                   2698457 non-null  float32       
 4   target3                   2698457 non-null  float32       
 5   target4                   2698457 non-null  float32       
 6   IdDtPlayer                2698457 non-null  object        
 7   target1_shift_1           2696396 non-null  float32       
 8   target2_shift_1           2696396 non-null  float32       
 9   target3_shift_1           2696396 non-null  float32       
 10  target4_shift_1           2696396 non-null  float32       
 11  target1_shift_2           2694335 non-null  float3

In [20]:
f = [c for c in df_join.columns if c not in ['IdGame',
                                              'DtGame',
                                              'DtGameUTC',
                                              'IdPlayer_y',
                                              'IdTeam',
                                              'NuJersey',
                                              'CdPosition', 
                                              'target1_shift_1', 
                                              'target2_shift_1',
                                              'target3_shift_1',
                                              'target1_shift_2',
                                              'target3_shift_2',
                                              'target4_shift_2',
                                              'target1_shift_3',
                                              'target2_shift_3',
                                              'target3_shift_3',
                                              'target4_shift_3',
                                              'target1_shift_4',
                                              'target2_shift_4',
                                              'target3_shift_4',
                                              'target4_shift_4',
                                              'target1_shift_5',
                                              'target2_shift_5',
                                              'target3_shift_5',
                                              'target4_shift_5',
                                              'target1_shift_6',
                                              'target2_shift_6',
                                              'target3_shift_6',
                                              'target4_shift_6',
                                              'target1_shift_7',
                                              'target2_shift_7',
                                              'target3_shift_7',
                                              'target4_shift_7']]

df_join[f] = df_join[f].fillna(0)                      

In [21]:
df_join.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698457 entries, 0 to 2698456
Data columns (total 88 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   Dt                        2698457 non-null  datetime64[ns]
 1   IdPlayer_x                2698457 non-null  int64         
 2   target1                   2698457 non-null  float32       
 3   target2                   2698457 non-null  float32       
 4   target3                   2698457 non-null  float32       
 5   target4                   2698457 non-null  float32       
 6   IdDtPlayer                2698457 non-null  object        
 7   target1_shift_1           2696396 non-null  float32       
 8   target2_shift_1           2696396 non-null  float32       
 9   target3_shift_1           2696396 non-null  float32       
 10  target4_shift_1           2698457 non-null  float32       
 11  target1_shift_2           2694335 non-null  float3

In [26]:
df_join = df_join.dropna(subset=[             
    'target1_shift_1', 
    'target2_shift_1',
    'target3_shift_1',
    'target1_shift_2',
    'target3_shift_2',
    'target4_shift_2',
    'target1_shift_3',
    'target2_shift_3',
    'target3_shift_3',
    'target4_shift_3',
    'target1_shift_4',
    'target2_shift_4',
    'target3_shift_4',
    'target4_shift_4',
    'target1_shift_5',
    'target3_shift_5',
    'target4_shift_5', 
    'target1_shift_6',
    'target2_shift_6',
    'target3_shift_6',
    'target4_shift_6',
    'target1_shift_7',
    'target2_shift_7',
    'target3_shift_7',
    'target4_shift_7'])

In [29]:
df_join.drop(['IdGame',
             'DtGame',
             'DtGameUTC',
             'IdPlayer_y',
             'IdTeam',
             'NuJersey',
             'CdPosition'], axis = 1, inplace = True)

In [30]:
df_join.isna().any()[lambda x: x]

Series([], dtype: bool)

In [31]:
df_join.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

In [18]:
del df_train, df_playerBoxScores

In [32]:
df_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2684030 entries, 7 to 2698456
Data columns (total 81 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   Dt                        datetime64[ns]
 1   IdPlayer                  int64         
 2   target1                   float32       
 3   target2                   float32       
 4   target3                   float32       
 5   target4                   float32       
 6   IdDtPlayer                object        
 7   target1_shift_1           float32       
 8   target2_shift_1           float32       
 9   target3_shift_1           float32       
 10  target4_shift_1           float32       
 11  target1_shift_2           float32       
 12  target2_shift_2           float32       
 13  target3_shift_2           float32       
 14  target4_shift_2           float32       
 15  target1_shift_3           float32       
 16  target2_shift_3           float32       
 17  target3_

### Divisão treino, teste e validação

In [83]:
train, test, val = train_test_split(df)
print(f"Train shape: {train.shape}, Test shape: {test.shape}, Val shape: {val.shape}")

del df

Train shape: (2322747, 35), Test shape: (189612, 35), Val shape: (183429, 35)


In [84]:
train_join, test_join, val_join = train_test_split(df_join)
print(f"Train shape: {train.shape}, Test shape: {test.shape}, Val shape: {val.shape}")

del df_join

Train shape: (2322747, 35), Test shape: (189612, 35), Val shape: (183429, 35)


## Treinando Modelos

In [85]:
from sklearn.metrics import mean_absolute_error

In [86]:
df_results = pd.DataFrame(columns = ['model', 'target1', 'target2', 'target3', 'target4', 'average'])

### Funções Auxiliares

In [87]:
# functions to train, predict and evaluate models
def train_models(model, x_train, y_train):
    """Train a model for each target column
    
    Parameters
    ----------
    model : sklearn model
        Model to be trained
    x_train : pd.DataFrame
        Training features
    y_train : pd.DataFrame
        Training targets
    
    Returns
    
    -------
    list
        List of trained models
    """

    models = []
    for target in TARGET_COLS:
        model.fit(x_train, y_train[target])
        models.append(model)
    return models


def predict_targets(models, x_test):
    """Predict the targets for each model

    Parameters
    ----------
    models : list
        List of trained models
    x_test : pd.DataFrame
        Test features

    Returns
    -------
    pd.DataFrame
        Predictions for each target column
    """

    y_preds = pd.DataFrame(columns=TARGET_COLS)
    for target, model in zip(TARGET_COLS, models):
        y_preds[target] = model.predict(x_test)
    return y_preds


def evaluate_mae(y_true, y_pred):
    """Evaluate the mean absolute error for each target column and the average MAE

    Parameters
    ----------
    y_true : pd.DataFrame
        True labels
    y_pred : pd.DataFrame
        Predictions
    
    Returns
    -------
    dict
        Mean absolute error for each target column
    """
    maes = {}
    for target in TARGET_COLS:
        mae = mean_absolute_error(y_true[target], y_pred[target])
        maes[target] = mae
    maes['average'] = np.mean(list(maes.values()))
    return maes

### Baseline Models

#### Mean

In [88]:
train_val = pd.concat([train, val], axis=0)

In [89]:
media = train_val[TARGET_COLS].mean()
media_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].mean()

#### Median

In [90]:
mediana = train_val[TARGET_COLS].median()
mediana_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].median()

#### Naive

In [91]:
naive = train_val[train_val['Dt']=='2021-04-30'].set_index('IdPlayer')[TARGET_COLS]

In [94]:
df_results

Unnamed: 0,model,target1,target2,target3,target4,average


In [116]:
#mae_media = []
#mae_mediapj = []
#mae_mediana = []
#mae_medianapj = []
#mae_naive = []
summary = pd.DataFrame(columns=TARGET_COLS,index=['Média','Média por Jogador','Mediana','Mediana por Jogador','Naive'])

for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    

    summary.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    
summary['Average'] = summary.mean(axis=1)

In [132]:
summary.loc["Média"][0]

1.1254551

In [134]:
df_results = df_results.append({'model': 'Média', mean_absolute_err}, ignore_index=True)
    #df_results = df_results.append({'model': 'Média por Jogador', mean_absolute_error(y_true,mediapj_pred)}, ignore_index=True)
    #df_results = df_results.append({'model': 'Mediana', mean_absolute_error(y_true,mediana_pred)}, ignore_index=True)
    #df_results = df_results.append({'model': 'Mediana por Jogador', mean_absolute_error(y_true,medianapj_pred)}, ignore_index=True)
    #df_results = df_results.append({'model': 'Naive', mean_absolute_error(y_true,naive_pred)}, ignore_index=True)

TypeError: Can only append a dict if ignore_index=True

### Linear Models

In [17]:
train =train.drop(['Dt', 'IdDtPlayer'], axis=1)
train = train.dropna()
val = val.drop(['Dt', 'IdDtPlayer'], axis=1)  
test = test.drop(['Dt', 'IdDtPlayer'], axis=1)

#### LASSO

In [18]:
from sklearn.linear_model import Lasso

In [19]:
%%time
model = Lasso(alpha=0.1, random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model, 
            x_train = train.drop(TARGET_COLS, axis=1), 
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Lasso | alpha = 0.1', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

# show the results
df_results[df_results['model'] == 'Lasso | alpha = 0.1']

CPU times: user 2min 5s, sys: 45.7 s, total: 2min 51s
Wall time: 26.6 s


Unnamed: 0,model,target1,target2,target3,target4,average
0,Lasso | alpha = 0.1,1.353516,1.446735,1.303287,0.753244,1.214195


#### Lasso CV

In [20]:
from sklearn.linear_model import LassoCV

In [21]:
%%time
model = LassoCV(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model, 
            x_train = train.drop(TARGET_COLS, axis=1), 
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoCV', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

# show the results
df_results[df_results['model'] == 'LassoCV']

CPU times: user 1min 21s, sys: 46.1 s, total: 2min 7s
Wall time: 57.9 s


Unnamed: 0,model,target1,target2,target3,target4,average
1,LassoCV,1.580195,1.968568,1.446175,1.43636,1.607824


### Tree Models

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz 

In [23]:
%%time
model = DecisionTreeRegressor(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model, 
            x_train = train.drop(TARGET_COLS, axis=1), 
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Decision Tree Regressor', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'Decision Tree Regressor']

CPU times: user 12min 43s, sys: 1.09 s, total: 12min 44s
Wall time: 12min 44s


Unnamed: 0,model,target1,target2,target3,target4,average
2,Decision Tree Regressor,1.535279,1.712461,1.48524,1.119111,1.463023


In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
%%time
model = RandomForestRegressor(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model, 
            x_train = train.drop(TARGET_COLS, axis=1), 
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Random Forest Regressor', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'Random Forest Regressor']

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
%%time
model = GradientBoostingRegressor(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Gradient Boosting Regressor', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'Gradient Boosting Regressor']

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
%%time
model = AdaBoostRegressor(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'AdaBoost Regressor', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'AdaBoost Regressor']

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
%%time
model = BaggingRegressor(random_state=RANDOM_SEED)

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Bagging Regressor', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'Bagging Regressor']

In [None]:
df_results