## Libraries

In [2]:
import random
import pickle

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
import sklearn
import sktime
import keras
import statsmodels.api as sm
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from tensorflow import keras
from tensorflow.keras import layers

# model selection and metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE

# constants
PROCESSED_DATA_PATH = '../data/processed-data/'
MODEL_PATH = '../models/trained-models/'
TARGET_COLS = ['target1', 'target2', 'target3', 'target4']
TEST_SPLIT_DATE = '2021-04-30'

# disable warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

## Auxiliary Functions

In [17]:
def plot_time_series(df, groupby = 'median', cols = TARGET_COLS):
    df_melted = df.groupby('Dt').median()[cols]\
    .melt(var_name='target', value_name='value', ignore_index=False)
    df_melted.reset_index(inplace=True)

    sns.set(rc={'figure.figsize':(10,20)})
    sns.FacetGrid(df_melted, col='target', col_wrap=2, height=9,aspect=2, sharey=False)\
    .map(sns.lineplot, 'Dt', 'value');
    
def plot_train_test_pred(train, test, pred = pd.DataFrame(), groupby = 'median', cols = TARGET_COLS):
    train['type'] = 'train'
    test['type'] = 'test'
    pred['type'] = 'pred'
    
    cols.append('type')

    df = pd.concat([train, test, pred], axis=0)
    df_melted = df.groupby('Dt').median()[cols]\
        .melt(var_name='target', value_name='value', ignore_index=False)
    df_melted.reset_index(inplace=True)
    
    sns.set(rc={'figure.figsize':(10,20)})
    sns.FacetGrid(df_melted, col='target', col_wrap=2, height=9,aspect=2, sharey=False)\
        .map(sns.lineplot, 'Dt', 'value', hue='type');

    
def sort_df(df: pd.DataFrame, columns: list = ['IdPlayer', 'Dt']) -> None:
    """Sort the dataframe by the columns passed as argument.
    
    Args:
        df (pd.DataFrame): Dataframe to be sorted.
        columns (list, optional): Columns to sort the dataframe. Defaults to ['IdPlayer', 'Dt'].
        
        Returns:
            None
    """
    df.sort_values(by=columns, inplace=True)
    # reset index
    df.reset_index(drop=True, inplace=True)
    
def evaluate_mae(y_true, y_pred):
    """Evaluate the mean absolute error for each target column and the average MAE

    Parameters
    ----------
    y_true : pd.DataFrame
        True labels
    y_pred : pd.DataFrame
        Predictions
    
    Returns
    -------
    dict
        Mean absolute error for each target column
    """
    maes = {}
    for target in TARGET_COLS:
        mae = mean_absolute_error(y_true[target], y_pred[target])
        maes[target] = mae
    maes['average'] = np.mean(list(maes.values()))
    return maes

def train_models(model, x_train, y_train):
    """Train a model for each target column
    
    Parameters
    ----------
    model : sklearn model
        Model to be trained
    x_train : pd.DataFrame
        Training features
    y_train : pd.DataFrame
        Training targets
    
    Returns
    
    -------
    list
        List of trained models
    """

    models = []
    for target in TARGET_COLS:
        model.fit(x_train, y_train[target])
        models.append(model)
    return models


def predict_targets(models, x_test):
    """Predict the targets for each model

    Parameters
    ----------
    models : list
        List of trained models
    x_test : pd.DataFrame
        Test features

    Returns
    -------
    pd.DataFrame
        Predictions for each target column
    """

    y_preds = pd.DataFrame(columns=TARGET_COLS)
    for target, model in zip(TARGET_COLS, models):
        y_preds[target] = model.predict(x_test)
    return y_preds

def train_test_split(
    df: pd.DataFrame
    ,test_split_date: str = TEST_SPLIT_DATE
    ):
    """Split the dataframe into train and test sets.

    Args:
        df (pd.DataFrame): Dataframe to be split.
        test_split_date (str, optional): Date to split the dataframe. Defaults to TEST_SPLIT_DATE.
    """

    train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
    val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
    test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]
    # train.to_csv('train.csv', index=None)
    # val.to_csv('validation.csv', index=None) 
    # test.to_csv('test.csv', index=None) 

    return train, test, val


## Data Prep

In [40]:
df = pd.read_pickle(PROCESSED_DATA_PATH + 'shifted_targets.pkl')

In [None]:
#train = df[(df.Dt <= "2021-01-31") & (df.Dt >= "2018-01-01")] 
#val = df[(df.Dt <= "2021-04-30") & (df.Dt >= "2021-02-01")] 
#test = df[(df.Dt <= "2021-07-31") & (df.Dt >= "2021-05-01")]

#train.to_csv('train.csv', index=None)
#val.to_csv('validation.csv', index=None) 
#test.to_csv('test.csv', index=None)

train = pd.read_csv('train.csv')
val = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

## Baselines

In [None]:
summary = pd.DataFrame(columns=TARGET_COLS,index=['Média','Média por Jogador','Mediana','Mediana por Jogador','Naive'])
# train + validation dataset
train_val = pd.concat([train, val], axis=0)

In [None]:
train.shape[0] + val.shape[0] == train_val.shape[0]

In [None]:
media = train_val[TARGET_COLS].mean()
media_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].mean()
mediana = train_val[TARGET_COLS].median()
mediana_por_jogador = train_val.groupby('IdPlayer')[TARGET_COLS].median()
naive = train_val[train_val['Dt']=='2021-04-30'].set_index('IdPlayer')[TARGET_COLS]

In [None]:
for target in TARGET_COLS:
    
    y_true = test[target]
    
    mediapj_pred = test['IdPlayer'].map(media_por_jogador[target].to_dict())
    medianapj_pred = test['IdPlayer'].map(mediana_por_jogador[target].to_dict())
    naive_pred = test['IdPlayer'].map(naive[target].to_dict())
    
    mediana_pred = [mediana[target] for i in test.index]
    media_pred = [media[target] for i in test.index]
    
    summary.loc['Média',target]  = mean_absolute_error(y_true,media_pred)
    summary.loc['Média por Jogador',target]  = mean_absolute_error(y_true,mediapj_pred)
    summary.loc['Mediana',target]  = mean_absolute_error(y_true,mediana_pred)
    summary.loc['Mediana por Jogador',target]  = mean_absolute_error(y_true,medianapj_pred)
    summary.loc['Naive',target]  = mean_absolute_error(y_true,naive_pred)
    
summary['Average'] = summary.mean(axis=1)

In [None]:
summary

## LASSO and Multitask LASSO

In [41]:
# dropping the columns with a high shift
for i in range(4):
    for shift in [6, 7, 14, 30]:
        df.drop(f'target{i+1}_shift_{shift}', axis=1, inplace=True)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2695788 entries, 0 to 2695787
Data columns (total 27 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Dt               datetime64[ns]
 1   IdPlayer         int64         
 2   target1          float32       
 3   target2          float32       
 4   target3          float32       
 5   target4          float32       
 6   IdDtPlayer       object        
 7   target1_shift_1  float32       
 8   target2_shift_1  float32       
 9   target3_shift_1  float32       
 10  target4_shift_1  float32       
 11  target1_shift_2  float32       
 12  target2_shift_2  float32       
 13  target3_shift_2  float32       
 14  target4_shift_2  float32       
 15  target1_shift_3  float32       
 16  target2_shift_3  float32       
 17  target3_shift_3  float32       
 18  target4_shift_3  float32       
 19  target1_shift_4  float32       
 20  target2_shift_4  float32       
 21  target3_shift_4  float32       

In [43]:
df_playerBoxScores = pd.read_pickle(PROCESSED_DATA_PATH + 'playerBoxScores.pkl')
df_playerBoxScores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 219727 entries, 0 to 451
Data columns (total 54 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   IdGame                    219727 non-null  int64  
 1   DtGame                    219727 non-null  object 
 2   DtGameUTC                 219727 non-null  object 
 3   IdPlayer                  219727 non-null  int64  
 4   IdTeam                    219727 non-null  int64  
 5   NuJersey                  219690 non-null  object 
 6   CdPosition                219727 non-null  int64  
 7   NuStrikeOutsPitching      65466 non-null   float64
 8   NuBattingOrder            183390 non-null  float64
 9   NuGamesPlayedBatting      183395 non-null  float64
 10  NuFlyOuts                 183395 non-null  float64
 11  NuGroundOuts              183395 non-null  float64
 12  NuRunsScored              183395 non-null  float64
 13  NuDoubles                 183395 non-null  floa

In [44]:
df_join = pd.merge(df, df_playerBoxScores, on=['IdDtPlayer'], how='left')
df_join.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2698457 entries, 0 to 2698456
Data columns (total 80 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   Dt                        2698457 non-null  datetime64[ns]
 1   IdPlayer_x                2698457 non-null  int64         
 2   target1                   2698457 non-null  float32       
 3   target2                   2698457 non-null  float32       
 4   target3                   2698457 non-null  float32       
 5   target4                   2698457 non-null  float32       
 6   IdDtPlayer                2698457 non-null  object        
 7   target1_shift_1           2696396 non-null  float32       
 8   target2_shift_1           2696396 non-null  float32       
 9   target3_shift_1           2696396 non-null  float32       
 10  target4_shift_1           2696396 non-null  float32       
 11  target1_shift_2           2694335 non-null  float3

In [45]:
f = [c for c in df_join.columns if c not in ['IdGame',
                                              'DtGame',
                                              'DtGameUTC',
                                              'IdPlayer_y',
                                              'IdTeam',
                                              'NuJersey',
                                              'CdPosition', 
                                              'target1_shift_1', 
                                              'target2_shift_1',
                                              'target3_shift_1',
                                              'target1_shift_2',
                                              'target3_shift_2',
                                              'target4_shift_2',
                                              'target1_shift_3',
                                              'target2_shift_3',
                                              'target3_shift_3',
                                              'target4_shift_3',
                                              'target1_shift_4',
                                              'target2_shift_4',
                                              'target3_shift_4',
                                              'target4_shift_4',
                                              'target1_shift_5',
                                              'target2_shift_5',
                                              'target3_shift_5',
                                              'target4_shift_5']]

df_join[f] = df_join[f].fillna(0)                      

In [46]:
df_join.replace('', np.nan, inplace=True)

In [47]:
df_join = df_join.dropna(subset=[             
    'target1_shift_1', 
    'target2_shift_1',
    'target3_shift_1',
    'target1_shift_2',
    'target3_shift_2',
    'target4_shift_2',
    'target1_shift_3',
    'target2_shift_3',
    'target3_shift_3',
    'target4_shift_3',
    'target1_shift_4',
    'target2_shift_4',
    'target3_shift_4',
    'target4_shift_4',
    'target1_shift_5',
    'target3_shift_5',
    'target4_shift_5',
    'CdPosition',
    'NuJersey'])

In [48]:
df_join.isna().any()[lambda x: x]

Series([], dtype: bool)

In [49]:
del df
df = df_join.copy()
del df_join
del df_playerBoxScores

df.rename(columns={'IdPlayer_x': 'IdPlayer'}, inplace=True)

In [50]:
df.drop('IdDtPlayer', axis = 1, inplace = True)

In [51]:
sort_df(df)

In [52]:
train, test, val = train_test_split(df)

In [53]:
train['Dt'] = pd.to_numeric(pd.to_datetime(train['Dt']))
test['Dt'] = pd.to_numeric(pd.to_datetime(test['Dt']))
val['Dt'] = pd.to_numeric(pd.to_datetime(val['Dt']))
train['DtGame'] = pd.to_numeric(pd.to_datetime(train['DtGame']))
test['DtGame'] = pd.to_numeric(pd.to_datetime(test['DtGame']))
val['DtGame'] = pd.to_numeric(pd.to_datetime(val['DtGame']))
train['DtGameUTC'] = pd.to_numeric(pd.to_datetime(train['DtGameUTC']))
test['DtGameUTC'] = pd.to_numeric(pd.to_datetime(test['DtGameUTC']))
val['DtGameUTC'] = pd.to_numeric(pd.to_datetime(val['DtGameUTC']))

In [84]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LassoLars

In [64]:
df_results = pd.DataFrame()

In [65]:
%%time
model = Lasso()

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LASSO', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'LASSO']

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Wall time: 1min 27s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627


In [68]:
%%time
model = Ridge()

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'Ridge', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results[df_results['model'] == 'Ridge']

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Wall time: 7.58 s


Unnamed: 0,model,target1,target2,target3,target4,average
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038


In [69]:
%%time
model = ElasticNet()

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'ElasticNet', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Wall time: 1min 12s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557


In [72]:
%%time
model = LassoLars()

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoLars', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results

Wall time: 6.72 s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557
3,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862


In [87]:
%%time
model = LassoCV()

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoCV', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gra

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Wall time: 45 s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557
3,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
4,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
5,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
6,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862


In [88]:
model.alpha_

415527862156129.6

In [92]:
%%time
model = LassoLarsIC(criterion="aic")

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoLarsIC', **mae}, ignore_index=True)

# delete the variables to save RAM
df_results



Wall time: 8.49 s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557
3,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
4,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
5,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
6,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
7,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319
8,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319


In [95]:
results = pd.DataFrame(
    {
        "alphas": models[-1].alphas_,
        "AIC criterion": models[-1].criterion_,
    }
).set_index("alphas")
alpha_aic = models[-1].alpha_

In [100]:
models

[LassoLarsIC(), LassoLarsIC(), LassoLarsIC(), LassoLarsIC()]

In [103]:
models.set_params(lassolarsic__criterion="bic").fit(X, y)
results["BIC criterion"] = models[-1].criterion_
alpha_bic = models[-1].alpha_

AttributeError: 'list' object has no attribute 'set_params'

In [99]:
def highlight_min(x):
    x_min = x.min()
    return ["font-weight: bold" if v == x_min else "" for v in x]


results.style.apply(highlight_min)

Unnamed: 0_level_0,AIC criterion
alphas,Unnamed: 1_level_1
0.012679,1266385.414458
0.005728,1118910.212411
0.004063,1093739.548084
0.003901,1091602.69578
0.003869,1091148.436465
0.003557,1086830.220345
0.001766,1068697.566101
0.001275,1065811.272566
0.001094,1064687.752692
0.00096,1063767.892294


In [104]:
%%time
model = Lasso(alpha = 415527862156129.6)

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoAlphaCV', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results

Wall time: 9.93 s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557
3,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
4,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
5,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
6,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
7,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319
8,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319
9,LassoAlphaCV,4.456701,3.365556,4.131011,3.281212,3.80862


In [106]:
%%time
model = Lasso(alpha = 0.000127)

# train the models
models = train_models(
            model = model,
            x_train = train.drop(TARGET_COLS, axis=1),
            y_train = train[TARGET_COLS]
        )

# predict the targets for each trained model
y_pred = predict_targets(models, test.drop(TARGET_COLS, axis=1))

# evaluate the models
mae = evaluate_mae(y_true = test[TARGET_COLS], y_pred = y_pred)

# save the results
df_results = df_results.append({'model': 'LassoAlphaIC', **mae}, ignore_index=True)

# delete the variables to save RAM
del models, y_pred, mae

df_results

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Wall time: 1min 10s


Unnamed: 0,model,target1,target2,target3,target4,average
0,LASSO,3.495544,2.832391,2.991855,1.61472,2.733627
1,Ridge,3.438686,2.877414,2.980205,1.671849,2.742038
2,ElasticNer,3.496967,2.841108,2.993748,1.622403,2.738557
3,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
4,LassoLars,4.456701,3.365556,4.131011,3.281212,3.80862
5,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
6,LassoCV,4.456701,3.365556,4.131011,3.281212,3.80862
7,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319
8,LassoLarsIC,3.474871,2.747967,3.079329,1.631107,2.733319
9,LassoAlphaCV,4.456701,3.365556,4.131011,3.281212,3.80862
