In [1]:
pip install xgboost lightgbm catboost holidays optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting holidays
  Downloading holidays-0.21.13-py3-none-any.whl (378 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.2/378.2 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optu

In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import holidays

from functools import partial
import scipy as sp

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit, GroupKFold
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/train.csv'
file_key_2 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/test.csv'
file_key_3 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/sample_submission.csv'
# file_key_4 = 'Tabular-Playground-Series/PS-S3/Ep10/Pulsar.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

# bucket_object_4 = bucket.Object(file_key_4)
# file_object_4 = bucket_object_4.get()
# file_content_stream_4 = file_object_4.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
train['date'] = pd.to_datetime(train['date'])

# train.drop(columns = 'id', axis = 1, inplace = True)
# train['is_generated'] = 1

test = pd.read_csv(file_content_stream_2)
test['date'] = pd.to_datetime(test['date'])

# test['is_generated'] = 1

submission = pd.read_csv(file_content_stream_3)
# original = pd.read_csv(file_content_stream_4)
# original['is_generated'] = 0

# train = pd.concat([train, original], axis = 0)

Matplotlib is building the font cache; this may take a moment.


In [4]:
train.head()

Unnamed: 0,date,sleep_hours
0,2015-02-19,6.4
1,2015-02-20,7.583333
2,2015-02-21,6.35
3,2015-02-22,6.5
4,2015-02-23,8.916667


In [3]:
def get_holidays(df):
    years_list = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

    holiday_US = holidays.CountryHoliday('US', years = years_list)
    holiday_dict = holiday_US.copy()

    df['holiday_name'] = df['date'].map(holiday_dict)
    df['is_holiday'] = np.where(df['holiday_name'].notnull(), 1, 0)
    df['holiday_name'] = df['holiday_name'].fillna('Not Holiday')
    
    return df.drop(columns = ['holiday_name'])


def feature_engineer(df):
    
    new_df = df.copy()
    new_df["month"] = df["date"].dt.month
    new_df["month_sin"] = np.sin(new_df['month'] * (2 * np.pi / 12))
#     new_df["month_cos"] = np.cos(new_df['month'] * (2 * np.pi / 12))
    
    new_df["day"] = df["date"].dt.day
    new_df["day_sin"] = np.sin(new_df['day'] * (2 * np.pi / 12))
#     new_df["day_cos"] = np.cos(new_df['day'] * (2 * np.pi / 12))
    
    new_df["day_of_week"] = df["date"].dt.dayofweek
    new_df["day_of_week"] = new_df["day_of_week"].apply(lambda x: 0 if x <= 3 else(1 if x == 4 else (2 if x == 5 else (3))))
    
    new_df["day_of_year"] = df["date"].dt.dayofyear
    new_df["year"] = df["date"].dt.year
    
    return new_df

train = feature_engineer(train)
test = feature_engineer(test)

train = get_holidays(train)
test = get_holidays(test)

train.loc[((train['date'] >= '2017-09-27') & (train['date'] <= '2018-06-12')), 'sleep_hours'] = train.loc[((train['date'] >= '2017-09-27') & (train['date'] <= '2018-06-12')), 'sleep_hours'] / 1.94 

X = train.drop(columns = ['date', 'sleep_hours', 'year'], axis = 1)
Y = train['sleep_hours']

test = test.drop(columns = ['date', 'sleep_hours', 'year'], axis = 1)

In [19]:
X.head()

Unnamed: 0,month,month_sin,day,day_sin,day_of_week,day_of_year,year,is_holiday
0,2,0.866025,19,-0.5,0,50,2015,0
1,2,0.866025,20,-0.866025,1,51,2015,0
2,2,0.866025,21,-1.0,2,52,2015,0
3,2,0.866025,22,-0.866025,3,53,2015,0
4,2,0.866025,23,-0.5,0,54,2015,0


In [4]:
linear_preds = np.zeros(test.shape[0])
linear_rmse = 0

ridge_preds = np.zeros(test.shape[0])
ridge_rmse = 0

lasso_preds = np.zeros(test.shape[0])
lasso_rmse = 0

elastic_preds = np.zeros(test.shape[0])
elastic_rmse = 0

kf = KFold(n_splits = 30, shuffle = True, random_state = 42)
# kf = GroupKFold(n_splits = 5)

for trn_idx, test_idx in kf.split(X, Y):
# for trn_idx, test_idx in kf.split(X, groups = X.year):
    
    x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
    y_train, y_valid = Y.iloc[trn_idx], Y.iloc[test_idx]
    
    ##################
    ## Linear model ##
    ##################
    
    linear_model = LinearRegression().fit(x_train, y_train)

    y_pred = linear_model.predict(x_valid)
    linear_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    linear_pred = linear_model.predict(test)
    linear_pred = pd.Series(linear_pred) 
    linear_preds += linear_pred / kf.n_splits
    
    ###########
    ## Ridge ##
    ###########
    
    ridge_model = Ridge(alpha = 0.0999, tol = 1e-2, max_iter = 1000000, random_state = 0).fit(x_train, y_train)

    y_pred = ridge_model.predict(x_valid)
    ridge_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    ridge_pred = ridge_model.predict(test)
    ridge_pred = pd.Series(ridge_pred) 
    ridge_preds += ridge_pred / kf.n_splits
    
    ###########
    ## Lasso ##
    ###########
    
    lasso_model = Lasso(alpha = 0.0999, tol = 1e-2, max_iter = 1000000, random_state = 0).fit(x_train, y_train)

    y_pred = lasso_model.predict(x_valid)
    lasso_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    lasso_pred = lasso_model.predict(test)
    lasso_pred = pd.Series(lasso_pred) 
    lasso_preds += lasso_pred / kf.n_splits
    
    #############
    ## Elastic ##
    #############
    
    elastic_model = ElasticNet(alpha = 0.0002, l1_ratio = 0.001, random_state = 42).fit(x_train, y_train)

    y_pred = elastic_model.predict(x_valid)
    elastic_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    elastic_pred = elastic_model.predict(test)
    elastic_pred = pd.Series(elastic_pred) 
    elastic_preds += elastic_pred / kf.n_splits 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [6]:
md_perf = pd.DataFrame({'Model': ['Linear', 'Ridge', 'Lasso', 'Elastic'], 
                        'CV-Score': [linear_rmse, ridge_rmse, lasso_rmse, elastic_rmse]})
md_perf

Unnamed: 0,Model,CV-Score
0,Linear,0.90787
1,Ridge,0.907847
2,Lasso,0.91131
3,Elastic,0.907846


In [12]:
md_perf = pd.DataFrame({'Model': ['Linear', 'Ridge', 'Lasso', 'Elastic'], 
                        'CV-Score': [linear_rmse, ridge_rmse, lasso_rmse, elastic_rmse]})
md_perf

Unnamed: 0,Model,CV-Score
0,Linear,0.902437
1,Ridge,0.902408
2,Lasso,0.905345
3,Elastic,0.90232


In [5]:
md_perf = pd.DataFrame({'Model': ['Linear', 'Ridge', 'Lasso', 'Elastic'], 
                        'CV-Score': [linear_rmse, ridge_rmse, lasso_rmse, elastic_rmse]})
md_perf

Unnamed: 0,Model,CV-Score
0,Linear,0.902998
1,Ridge,0.902969
2,Lasso,0.905833
3,Elastic,0.902971


In [7]:
w1 = 1/ linear_rmse  
w2 = 1/ ridge_rmse
w3 = 1/ lasso_rmse
w4 = 1/ elastic_rmse
wtot = w1 + w2 + w3 + w4

w1 = w1 / wtot
w2 = w2 / wtot
w3 = w3 / wtot
w4 = w4 / wtot

submission['sleep_hours'] = linear_preds * w1 + ridge_preds * w2 + lasso_preds * w3  + elastic_preds * w4
submission.head()

Unnamed: 0,date,sleep_hours
0,2022-01-01,6.896605
1,2022-01-02,6.905294
2,2022-01-03,6.603509
3,2022-01-04,6.604246
4,2022-01-05,6.606458


In [8]:
submission.to_csv('baseline_submission_4.csv', index = False)

In [9]:
submission['sleep_hours'] =  elastic_preds 
submission.head()

Unnamed: 0,date,sleep_hours
0,2022-01-01,6.955219
1,2022-01-02,6.958634
2,2022-01-03,6.596936
3,2022-01-04,6.598015
4,2022-01-05,6.601075


In [10]:
submission.to_csv('baseline_submission_5.csv', index = False)

In [13]:
submission['sleep_hours'] =  elastic_preds 
submission.head()

Unnamed: 0,date,sleep_hours
0,2022-01-01,6.955125
1,2022-01-02,6.958714
2,2022-01-03,6.596975
3,2022-01-04,6.598063
4,2022-01-05,6.601143


In [14]:
submission.to_csv('baseline_submission_6.csv', index = False)

In [6]:
xgb_cv_scores, xgb_preds = list(), list()

for i in tqdm(range(1)):

    skf = KFold(n_splits = 30, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        #############
        ## XGBoost ##
        #############
        
        xgb_md = XGBRegressor(tree_method = 'hist',
                              colsample_bytree = 0.4, 
                              gamma = 1.25, 
                              learning_rate = 0.01, 
                              max_depth = 2, 
                              min_child_weight = 80, 
                              n_estimators = 2000, 
                              subsample = 1).fit(X_train, Y_train)
        
        ## Predicting on X_test and test        
        xgb_pred_1 = xgb_md.predict(X_test)
        xgb_pred_2 = xgb_md.predict(test)
        
        ## Computing rmse and storing test predictions
        xgb_cv_scores.append(mean_squared_error(Y_test, xgb_pred_1, squared = False))
        xgb_preds.append(xgb_pred_2)
        
xgb_cv_score = np.mean(xgb_cv_scores)
print('The oof mean-squared-log-error of the XGBoost model is ', xgb_cv_score)

100%|██████████| 1/1 [00:14<00:00, 14.67s/it]

The oof mean-squared-log-error of the XGBoost model is  0.8999885829047847





In [None]:
0.8994529678578282

In [7]:
lgb_cv_scores, lgb_preds = list(), list()

for i in tqdm(range(1)):

    skf = KFold(n_splits = 30, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        #############
        ## XGBoost ##
        #############
        
        lgb_md = LGBMRegressor(n_estimators = 2000,
                               max_depth = 2,
                               learning_rate = 0.01,
                               num_leaves = 50,
                               lambda_l1 = 20,
                               lambda_l2 = 25,
                               bagging_fraction = 0.8,
                               feature_fraction = 0.8).fit(X_train, Y_train)
        
        ## Predicting on X_test and test        
        lgb_pred_1 = lgb_md.predict(X_test)
        lgb_pred_2 = lgb_md.predict(test)
        
        ## Computing rmse and storing test predictions
        lgb_cv_scores.append(mean_squared_error(Y_test, lgb_pred_1, squared = False))
        lgb_preds.append(lgb_pred_2)
        
lgb_cv_score = np.mean(lgb_cv_scores)
print('The oof mean-squared-log-error of the XGBoost model is ', lgb_cv_score)

  0%|          | 0/1 [00:00<?, ?it/s]



100%|██████████| 1/1 [00:05<00:00,  5.95s/it]

The oof mean-squared-log-error of the XGBoost model is  0.9004400901291761





In [None]:
0.8991208416290151

In [34]:
lgb_preds = pd.DataFrame(lgb_preds).mean(axis = 0)
submission['sleep_hours'] =  lgb_preds
submission.head()

Unnamed: 0,date,sleep_hours
0,2022-01-01,6.841592
1,2022-01-02,6.7726
2,2022-01-03,6.576557
3,2022-01-04,6.576552
4,2022-01-05,6.543252


In [35]:
submission.to_csv('lgb_baseline_submission_2.csv', index = False)

# Changes based on public notebook

In [8]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import holidays

from functools import partial
import scipy as sp

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit, GroupKFold
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/train.csv'
file_key_2 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/test.csv'
file_key_3 = 'Tabular-Playground-Series/Pog-Series/Rob-Sleep-Prediction/sample_submission.csv'
# file_key_4 = 'Tabular-Playground-Series/PS-S3/Ep10/Pulsar.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
train['date'] = pd.to_datetime(train['date'])

test = pd.read_csv(file_content_stream_2)
test['date'] = pd.to_datetime(test['date'])

submission = pd.read_csv(file_content_stream_3)

def get_holidays(df):
    years_list = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

    holiday_US = holidays.CountryHoliday('US', years = years_list)
    holiday_dict = holiday_US.copy()

    df['holiday_name'] = df['date'].map(holiday_dict)
    df['is_holiday'] = np.where(df['holiday_name'].notnull(), 1, 0)
    df['holiday_name'] = df['holiday_name'].fillna('Not Holiday')
    
    return df.drop(columns = ['holiday_name'])


def feature_engineer(df):
    
    new_df = df.copy()
    new_df["month"] = df["date"].dt.month
    new_df["month_sin"] = np.sin(new_df['month'] * (2 * np.pi / 12))
#     new_df["month_cos"] = np.cos(new_df['month'] * (2 * np.pi / 12))
    
    new_df["day"] = df["date"].dt.day
    new_df["day_sin"] = np.sin(new_df['day'] * (2 * np.pi / 12))
#     new_df["day_cos"] = np.cos(new_df['day'] * (2 * np.pi / 12))
    
    new_df["day_of_week"] = df["date"].dt.dayofweek
    new_df["day_of_week"] = new_df["day_of_week"].apply(lambda x: 0 if x <= 3 else(1 if x == 4 else (2 if x == 5 else (3))))
    
    new_df["day_of_year"] = df["date"].dt.dayofyear
    new_df["year"] = df["date"].dt.year
    
    return new_df

train = feature_engineer(train)
test = feature_engineer(test)

train = get_holidays(train)
test = get_holidays(test)

train.loc[((train['date'] >= '2017-09-27') & (train['date'] <= '2018-06-12')), 'sleep_hours'] = train.loc[((train['date'] >= '2017-09-27') & (train['date'] <= '2018-06-12')), 'sleep_hours'] / 1.94 

X = train.drop(columns = ['date', 'sleep_hours', 'year'], axis = 1)
Y = train['sleep_hours']

test = test.drop(columns = ['date', 'sleep_hours', 'year'], axis = 1)

train = train[train['date'] > '2015-07-20'].reset_index(drop = True)

In [9]:
train.head()

Unnamed: 0,date,sleep_hours,month,month_sin,day,day_sin,day_of_week,day_of_year,year,is_holiday
0,2015-07-21,7.85,7,-0.5,21,-1.0,0,202,2015,0
1,2015-07-22,5.916667,7,-0.5,22,-0.8660254,0,203,2015,0
2,2015-07-23,6.916667,7,-0.5,23,-0.5,0,204,2015,0
3,2015-07-24,6.45,7,-0.5,24,-4.898587e-16,1,205,2015,0
4,2015-07-25,7.783333,7,-0.5,25,0.5,2,206,2015,0


In [10]:
linear_preds = np.zeros(test.shape[0])
linear_rmse = 0

ridge_preds = np.zeros(test.shape[0])
ridge_rmse = 0

lasso_preds = np.zeros(test.shape[0])
lasso_rmse = 0

elastic_preds = np.zeros(test.shape[0])
elastic_rmse = 0

kf = KFold(n_splits = 30, shuffle = True, random_state = 42)
# kf = GroupKFold(n_splits = 5)

for trn_idx, test_idx in kf.split(X, Y):
# for trn_idx, test_idx in kf.split(X, groups = X.year):
    
    x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
    y_train, y_valid = Y.iloc[trn_idx], Y.iloc[test_idx]
    
    ##################
    ## Linear model ##
    ##################
    
    linear_model = LinearRegression().fit(x_train, y_train)

    y_pred = linear_model.predict(x_valid)
    linear_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    linear_pred = linear_model.predict(test)
    linear_pred = pd.Series(linear_pred) 
    linear_preds += linear_pred / kf.n_splits
    
    ###########
    ## Ridge ##
    ###########
    
    ridge_model = Ridge(alpha = 0.0999, tol = 1e-2, max_iter = 1000000, random_state = 0).fit(x_train, y_train)

    y_pred = ridge_model.predict(x_valid)
    ridge_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    ridge_pred = ridge_model.predict(test)
    ridge_pred = pd.Series(ridge_pred) 
    ridge_preds += ridge_pred / kf.n_splits
    
    ###########
    ## Lasso ##
    ###########
    
    lasso_model = Lasso(alpha = 0.0999, tol = 1e-2, max_iter = 1000000, random_state = 0).fit(x_train, y_train)

    y_pred = lasso_model.predict(x_valid)
    lasso_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    lasso_pred = lasso_model.predict(test)
    lasso_pred = pd.Series(lasso_pred) 
    lasso_preds += lasso_pred / kf.n_splits
    
    #############
    ## Elastic ##
    #############
    
    elastic_model = ElasticNet(alpha = 0.0002, l1_ratio = 0.001, random_state = 42).fit(x_train, y_train)

    y_pred = elastic_model.predict(x_valid)
    elastic_rmse += mean_squared_error(y_pred, y_valid, squared = False) / kf.n_splits
    
    elastic_pred = elastic_model.predict(test)
    elastic_pred = pd.Series(elastic_pred) 
    elastic_preds += elastic_pred / kf.n_splits 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [11]:
md_perf = pd.DataFrame({'Model': ['Linear', 'Ridge', 'Lasso', 'Elastic'], 
                        'CV-Score': [linear_rmse, ridge_rmse, lasso_rmse, elastic_rmse]})
md_perf

Unnamed: 0,Model,CV-Score
0,Linear,0.902998
1,Ridge,0.902969
2,Lasso,0.905833
3,Elastic,0.902971


In [12]:
xgb_cv_scores, xgb_preds = list(), list()

for i in tqdm(range(1)):

    skf = KFold(n_splits = 30, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        #############
        ## XGBoost ##
        #############
        
        xgb_md = XGBRegressor(tree_method = 'hist',
                              colsample_bytree = 0.4, 
                              gamma = 1.25, 
                              learning_rate = 0.01, 
                              max_depth = 2, 
                              min_child_weight = 80, 
                              n_estimators = 2000, 
                              subsample = 1).fit(X_train, Y_train)
        
        ## Predicting on X_test and test        
        xgb_pred_1 = xgb_md.predict(X_test)
        xgb_pred_2 = xgb_md.predict(test)
        
        ## Computing rmse and storing test predictions
        xgb_cv_scores.append(mean_squared_error(Y_test, xgb_pred_1, squared = False))
        xgb_preds.append(xgb_pred_2)
        
xgb_cv_score = np.mean(xgb_cv_scores)
print('The oof mean-squared-log-error of the XGBoost model is ', xgb_cv_score)

100%|██████████| 1/1 [00:12<00:00, 12.17s/it]

The oof mean-squared-log-error of the XGBoost model is  0.8999885829047847



