In [None]:
!pip install kaggle



In [None]:
from google.colab import files

# Upload the Kaggle API key file (kaggle.json) through Colab's upload helper.
# The uploaded file is saved into the current working directory; a later cell
# moves it into ~/.kaggle/.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/

In [None]:
# Download the competition archive (dataquest-challenge-1.zip) with the Kaggle CLI.
# Requires the API token to be present in ~/.kaggle/.
!kaggle competitions download -c dataquest-challenge-1

Downloading dataquest-challenge-1.zip to /content
  0% 0.00/413k [00:00<?, ?B/s]
100% 413k/413k [00:00<00:00, 106MB/s]


In [None]:
!unzip dataquest-challenge-1.zip

Archive:  dataquest-challenge-1.zip
  inflating: SampleSubmission.csv    
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2007.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2008.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2009.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2010.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2011.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNUELLE PAR USAGE ET PAR R�GION EN 1000 M�TRE CUBE/c-e-p-u-2012.xlsx  
  inflating: extra_data/extra_data/CONSOMMATION D_EAU POTABLE GLOBALE ANNU

In [None]:
import numpy as np
import pandas as pd

from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor


from tqdm import tqdm

In [None]:
# Random seed constant referenced by the model parameter dictionaries.
RANDOM_STATE = 42

# Load the competition files, parsing the `date` column as timestamps.
data = pd.read_csv('train.csv', parse_dates=['date'])
test_data = pd.read_csv('test.csv', parse_dates=['date'])
sample_submission = pd.read_csv('SampleSubmission.csv', parse_dates=['date'])

# Copy of the raw training frame, taken before `data` is aggregated below.
data_copy = data.copy()
# Every column except `date` is treated as a prediction target.
targets = data.columns.drop('date')

# Split the covered calendar span 65 % / 35 % into training and validation days.
first_date, last_date = data['date'].min(), data['date'].max()
total_days = (last_date - first_date).days
n_train_days = int(total_days * 0.65)
n_val_days = total_days - n_train_days
first_val_date = first_date + pd.DateOffset(days=n_train_days)

# One row per date (rows sharing a date are averaged); `date` becomes the index.
data = data.groupby('date').mean()

In [None]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Build the model's feature frame from a date-indexed frame of targets.

    Steps:
      1. Re-index to a daily frequency and fill the gaps by time-weighted
         interpolation.
      2. Add, for every column ``c``, a one-day lag column named ``'c -1'``.
      3. Replace every remaining missing value (including the first row's
         lag values) with ``-1``.
      4. Add row-wise ``std``, ``mean``, ``median``, ``min`` and ``max``
         computed over the lag columns only.

    Because step 3 runs before step 4, the statistics of the first row are
    computed from the ``-1`` placeholders.

    Parameters
    ----------
    data : pd.DataFrame
        Frame indexed by a DatetimeIndex; every column is a numeric series.

    Returns
    -------
    pd.DataFrame
        The resampled columns, followed by the lag columns, followed by the
        five statistic columns.
    """
    # 'D' is the canonical daily offset alias; the lowercase spelling is
    # deprecated in recent pandas releases.
    resampled_data = data.asfreq('D').interpolate(method='time')
    shifted_data = resampled_data.shift()
    shift_columns = [column + ' -1' for column in resampled_data.columns]
    shifted_data.columns = shift_columns

    result_data = pd.concat([resampled_data, shifted_data], axis=1).fillna(-1)

    # Snapshot of the lag columns so the statistics never see each other.
    lagged = result_data[shift_columns]
    result_data['std'] = lagged.std(axis=1)
    result_data['mean'] = lagged.mean(axis=1)
    result_data['median'] = lagged.median(axis=1)
    result_data['min'] = lagged.min(axis=1)
    result_data['max'] = lagged.max(axis=1)

    return result_data




In [None]:
# List the GPU devices visible to TensorFlow. In the visible notebook TensorFlow
# is used only for this availability check (the XGBoost parameters request GPU
# training).
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:

# Hand-set LightGBM hyperparameters, passed as keyword arguments to LGBMRegressor.
lgbm_params = dict(
    n_estimators=133,
    learning_rate=0.02160090575577129,
    num_leaves=17,
    reg_lambda=9.565258041853633,
    reg_alpha=0.8510455855003681,
    max_depth=8,
    subsample=0.9853030507684611,
    colsample_bytree=0.3386640122320139,
    min_child_weight=68,
    min_child_samples=32,
    random_state=RANDOM_STATE,
    boosting_type='gbdt',
)

# Hand-set XGBoost hyperparameters, passed as keyword arguments to XGBRegressor.
# GPU training is requested with tree_method='hist' + device='cuda'; the older
# 'gpu_hist' value is deprecated since XGBoost 2.0 and triggers the
# `E.g. tree_method = "hist", device = "cuda"` warning.
xgb_params = {
    'lambda': 3.2834644371560267e-07,
    'alpha': 0.0033674483434975857,
    'colsample_bytree': 0.9931681116513553,
    'subsample': 0.9627374697387774,
    'n_estimators': 564,
    'max_depth': 12,
    'min_child_weight': 99,
    'tree_method': 'hist',
    'device': 'cuda',
    'random_state': RANDOM_STATE,
    'learning_rate': 0.01
}

In [None]:
# Last date available for training, and the date range requested by the
# sample submission.
max_train_date = data.index.max()
submission_dates = sample_submission['date']
min_date, max_date = submission_dates.min(), submission_dates.max()

print(f'The training data stops at {max_train_date}')
print(f'The testing data is from {min_date} to {max_date}')

# Number of days between the first and the last submission date.
n_days = (max_date - min_date).days

# Features are everything preprocess_data adds; labels are the target columns.
preprocessed_data = preprocess_data(data)
Y = preprocessed_data[targets]
X = preprocessed_data.drop(columns=targets)

The training data stops at 2019-02-05 00:00:00
The testing data is from 2019-02-06 00:00:00 to 2020-12-01 00:00:00


In [None]:
from sklearn.model_selection import GridSearchCV, KFold

# Search space for LGBMRegressor.
# An exhaustive grid over these lists has 4*3*3*3*3*3*3*4*3*3 = 104,976
# candidates, each fitted once per cross-validation fold.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here — confirm the `subsample` axis is worth searching.
lgbm_param_grid = {
    'n_estimators': [120, 125, 130, 135],
    'learning_rate': [0.01, 0.02, 0.03],
    'num_leaves': [15, 20, 25],
    'reg_lambda': [5, 7, 10],
    'reg_alpha': [0, 0.1, 1],
    'max_depth': [6, 8, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.2, 0.4, 0.6, 0.8],
    'min_child_weight': [50, 75, 100],
    'min_child_samples': [20, 30, 40],
    'random_state': [42],
    'boosting_type': ['gbdt']
}


# Search space for XGBRegressor: 3*3*3*3*6*4*3*3 = 17,496 candidates.
# GPU training is requested with tree_method='hist' + device='cuda'; the older
# 'gpu_hist' value is deprecated since XGBoost 2.0.
xgb_param_grid = {
    'lambda': [1e-7, 2e-7, 3e-7],
    'alpha': [0, 0.001, 0.01],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'subsample': [0.8, 0.9, 1.0],
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'max_depth': [6, 8, 10, 12],
    'min_child_weight': [50, 100, 150],
    'tree_method': ['hist'],
    'device': ['cuda'],
    'random_state': [42],
    'learning_rate': [0.01, 0.015, 0.02]
}
def find_best_params(model, param_grid, X, Y):
    """Grid-search ``param_grid`` for ``model`` and return the best parameters.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor (e.g. LGBMRegressor).
    param_grid : dict or list of dict
        Parameter names of ``model`` mapped to the values to try.
    X, Y : array-like
        Features and target(s). Rows are assumed to be in chronological
        order — TODO confirm for callers other than this notebook.

    Returns
    -------
    dict
        Best parameter combination, keyed by the parameter names of
        ``model`` itself, so it can be passed as ``type(model)(**result)``.

    Notes
    -----
    * Folds come from ``TimeSeriesSplit``: every validation fold lies after
      its training fold. Shuffled K-fold would validate on days whose
      neighbours (and therefore lag features) were seen during training.
    * When ``Y`` has several columns the estimator is wrapped in
      ``MultiOutputRegressor`` (one model per target, as in the final
      ensemble), because LGBMRegressor only accepts a 1-D target. The
      ``estimator__`` prefix this requires is stripped from the result.
    * No ``scoring`` is passed, so the estimator's own ``score`` is used.
    * ``refit=False``: only ``best_params_`` is needed, so the final fit on
      the full data is skipped.
    """
    # Function-scope import: this name is not imported at the top of the cell.
    from sklearn.model_selection import TimeSeriesSplit

    multi_target = getattr(Y, 'ndim', 1) > 1 and Y.shape[1] > 1
    prefix = 'estimator__' if multi_target else ''
    estimator = MultiOutputRegressor(model) if multi_target else model

    # GridSearchCV accepts one grid or a list of grids; prefix the keys of each.
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    search_space = [
        {prefix + name: values for name, values in grid_spec.items()}
        for grid_spec in grids
    ]

    grid = GridSearchCV(estimator,
                        param_grid=search_space,
                        refit=False,
                        cv=TimeSeriesSplit(n_splits=5),
                        n_jobs=-1)
    grid.fit(X, Y)

    best_params = {name[len(prefix):]: value
                   for name, value in grid.best_params_.items()}
    return best_params
# Default-configured base estimators; the grid search supplies the hyperparameters.
lgbm_model, xgb_model = LGBMRegressor(), XGBRegressor()

# Replace the hand-set parameter dictionaries with the best grid-search result.
lgbm_params = find_best_params(model=lgbm_model, param_grid=lgbm_param_grid, X=X, Y=Y)
xgb_params = find_best_params(model=xgb_model, param_grid=xgb_param_grid, X=X, Y=Y)

In [None]:
# Base learners configured from the parameter dictionaries.
lgbm_model = LGBMRegressor(**lgbm_params)
xgb_model = XGBRegressor(**xgb_params)

# Weighted average of both learners: 0.6 for LightGBM, 0.4 for XGBoost.
ensemble_members = [('LightGBM', lgbm_model), ('XGBoost', xgb_model)]
voting_reg = VotingRegressor(ensemble_members, weights=(0.6, 0.4))

# One independent copy of the voting ensemble is fitted per target column.
model = MultiOutputRegressor(voting_reg)

model.fit(X, Y)

In [None]:

# Recursive one-step-ahead forecast: predict the day after the last known row,
# then treat that prediction as the last known row for the following day.
predicted_rows = []  # collected in a list and concatenated once after the loop
last_known = Y.iloc[[-1]]
for _ in tqdm(range(n_days + 1)):
    # Placeholder row for the day to forecast. Its target values are dropped
    # before predicting; only its date matters.
    next_day = last_known.copy()
    next_day.index = next_day.index + pd.DateOffset(days=1)

    # preprocess_data derives its features from a one-day lag only, so the
    # last known row plus the placeholder yield the same feature row as
    # re-processing the whole history would.
    window = pd.concat([last_known, next_day], axis=0)
    features = preprocess_data(window).drop(targets, axis=1).iloc[[-1]]

    pred = model.predict(features)
    prediction_row = pd.DataFrame(pred, columns=targets, index=next_day.index)
    predicted_rows.append(prediction_row)
    last_known = prediction_row

prediction = pd.concat(predicted_rows, axis=0)
# Known history followed by every forecast row.
past = pd.concat([Y, prediction], axis=0)
# Name the index explicitly so the merge key below is always called 'date'.
prediction = prediction.rename_axis('date').reset_index()

# Keep only the dates requested by the sample submission.
sample_submission = sample_submission[['date']].merge(prediction, on='date')
sample_submission.to_csv('submission6.csv', index=False)


    E.g. tree_method = "hist", device = "cuda"

100%|██████████| 665/665 [04:11<00:00,  2.65it/s]


In [None]:
# Read the written submission file back and preview its first rows as a sanity check.
df = pd.read_csv("submission6.csv")
df.head()

Unnamed: 0,date,MELLEGUE,BEN METIR,KASSEB,BARBARA,SIDI SALEM,BOU-HEURTMA,JOUMINE,GHEZALA,SEJNANE,...,SIDI AÏCH,EL BREK,BEZIRK,CHIBA,MASRI,LEBNA,HMA,ABID,Zarga,Ziatine
0,2019-02-06,33.536259,54.678594,62.438513,55.011414,549.156418,93.969989,97.826204,8.069175,104.430777,...,3.967861,2.209193,4.475513,3.647641,4.581357,20.368411,2.382456,8.746026,22.039796,31.704147
1,2019-02-07,33.728288,54.261641,62.287051,57.608681,547.493178,92.49241,95.908588,8.751875,106.131746,...,4.340692,2.169476,4.644476,3.837923,4.271186,20.469938,2.549419,8.849239,21.595246,31.607432
2,2019-02-08,33.89575,54.676726,62.33496,56.988836,544.269472,90.535467,95.924947,8.441438,107.787892,...,4.353645,2.145157,4.657836,3.866932,4.28707,20.894503,2.415886,8.810196,20.564983,31.804851
3,2019-02-09,33.930885,55.049664,62.426699,56.697497,543.623335,92.402377,96.16599,8.516385,109.77563,...,4.337908,2.151603,4.655738,3.880381,4.257113,20.964903,2.464252,8.920912,20.039792,31.804432
4,2019-02-11,33.957867,55.626112,62.650774,51.619232,547.952741,93.885909,97.263039,8.10411,110.236212,...,4.342069,2.158823,4.650552,3.857987,4.251179,20.89584,2.243553,8.917654,19.992248,31.798517


In [None]:
!pip install joblib



In [None]:
from joblib import dump, load

# Save the fitted model: a MultiOutputRegressor wrapping the LightGBM/XGBoost
# voting ensemble (not a stacking model).
dump(model, 'final_model.joblib')

# Reload the saved model, as would be done in production.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
loaded_model = load('final_model.joblib')





    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

