## Modèle LightGBM

### Entraînement

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
# LightGBM settings; the whole mapping is unpacked into lgb.LGBMRegressor(**params).
params = dict(
    objective="regression",
    boosting="gbdt",
    num_leaves=40,
    learning_rate=0.05,
    feature_fraction=0.85,
    reg_lambda=2,
    metric="rmse",
    num_boost_round=500,
    early_stopping_rounds=200,
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.stats
import gc
from sklearn.preprocessing import LabelEncoder
# Module-level LabelEncoder instance; none of the cells visible in this section reference it.
le = LabelEncoder()

In [None]:
# Feature matrix passed to the model's fit() call.
X_train1 = pd.read_csv('/content/drive/MyDrive/Kaggle/X_train1.csv')  # Olivier's path

In [None]:
# Feature matrix of the held-out split used as the evaluation set during fitting.
X_test1 = pd.read_csv('/content/drive/MyDrive/Kaggle/X_test1.csv')  # Olivier's path

In [None]:
# Target values passed to the model's fit() call alongside X_train1.
Y_train = pd.read_csv('/content/drive/MyDrive/Kaggle/Y_train.csv')  # Olivier's path

In [None]:
# Target values of the held-out split used as the evaluation set during fitting.
Y_test = pd.read_csv('/content/drive/MyDrive/Kaggle/Y_test.csv')  # Olivier's path

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered; every other column (objects, categories, unsigned ints, ...)
    is left untouched. Integer columns are narrowed to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns are narrowed to float16 or float32 when the min and max fit,
    otherwise they are stored as float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the memory footprint after optimisation and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    The float check looks at the value range only, not at precision: a column
    narrowed to float16 keeps roughly three significant decimal digits.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # If nothing fits (e.g. an empty column, where min/max are NaN) the
            # column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits neither narrower float
            # type (this also covers all-NaN columns and infinities).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Skip the percentage when the starting footprint is zero to avoid a
        # division by zero.
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast both feature matrices (training first, then held-out) and rebind the names
# to the returned frames.
X_train1, X_test1 = (reduce_mem_usage(frame) for frame in (X_train1, X_test1))

Memory usage after optimization is: 0.52 MB
Decreased by 85.9%
Memory usage after optimization is: 0.29 MB
Decreased by 85.9%


In [None]:
# Build the (unfitted) LightGBM regressor from the `params` hyper-parameter dict.
LGB = lgb.LGBMRegressor(**params)

In [None]:
# Fit on the training split while reporting RMSE on (X_test1, Y_test) after each boosting round.
# LightGBM's fit() returns the estimator itself, so LGB_Reg and LGB name the same fitted model.
# NOTE(review): early stopping monitors the same held-out split that is used to report the score,
# so the reported RMSE may be optimistic — confirm whether a separate validation split is intended.
LGB_Reg=LGB.fit(X_train1, Y_train,
        eval_set=[(X_test1, Y_test)],
        eval_metric='rmse')



[1]	valid_0's rmse: 2.22042
Training until validation scores don't improve for 200 rounds.
[2]	valid_0's rmse: 2.19639
[3]	valid_0's rmse: 2.16433
[4]	valid_0's rmse: 2.13465
[5]	valid_0's rmse: 2.11033
[6]	valid_0's rmse: 2.09943
[7]	valid_0's rmse: 2.07672
[8]	valid_0's rmse: 2.05404
[9]	valid_0's rmse: 2.04116
[10]	valid_0's rmse: 2.02213
[11]	valid_0's rmse: 2.00851
[12]	valid_0's rmse: 2.00897
[13]	valid_0's rmse: 2.00271
[14]	valid_0's rmse: 1.99259
[15]	valid_0's rmse: 1.99139
[16]	valid_0's rmse: 1.98233
[17]	valid_0's rmse: 1.97337
[18]	valid_0's rmse: 1.96649
[19]	valid_0's rmse: 1.96082
[20]	valid_0's rmse: 1.96036
[21]	valid_0's rmse: 1.95349
[22]	valid_0's rmse: 1.94814
[23]	valid_0's rmse: 1.94935
[24]	valid_0's rmse: 1.94018
[25]	valid_0's rmse: 1.93048
[26]	valid_0's rmse: 1.93155
[27]	valid_0's rmse: 1.92788
[28]	valid_0's rmse: 1.91987
[29]	valid_0's rmse: 1.9206
[30]	valid_0's rmse: 1.9162
[31]	valid_0's rmse: 1.90885
[32]	valid_0's rmse: 1.90329
[33]	valid_0's rmse:

In [None]:
# Print the column dtypes and memory footprint of the training features.
X_train1.info()

In [None]:
# Persist the fitted model to the working directory. The context manager guarantees
# the file is flushed and closed even if pickling fails.
with open("lgb.pickle.dat", "wb") as model_file:
    pickle.dump(LGB_Reg, model_file)

### Prédiction et soumission

In [None]:
# Features for the rows to be predicted for the submission.
X_final = pd.read_csv('/content/drive/MyDrive/Kaggle/X_final.csv')

In [None]:
# Downcast X_final's numeric columns. reduce_mem_usage reassigns the columns on the frame it
# receives, so the return value is not assigned; as the last expression it is only displayed.
reduce_mem_usage(X_final)

In [None]:
# First prediction chunk: rows 0 to 19,999,999 and the first 38 columns of X_final.
# NOTE(review): presumably chunked to limit memory, with 38 being the number of model
# features — confirm against X_train1.
X_final1=X_final.iloc[0:20000000,0:38]
# NOTE(review): X_final1 is a slice of X_final, so reassigning its columns here may emit a
# pandas SettingWithCopyWarning; X_final is already downcast in an earlier cell, so this
# call looks redundant — confirm.
reduce_mem_usage(X_final1)

In [None]:
# Reload the pickled model; the context manager closes the file handle once loading is done.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
# NOTE(review): the prediction cells in this section call LGB_Reg, not loaded_model — confirm
# which one is intended.
with open("lgb.pickle.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Print the column dtypes and memory footprint of the first prediction chunk.
X_final1.info()

In [None]:
# Predict the first chunk with the in-memory fitted model, limited to its best iteration.
Y_pred_final1 = LGB_Reg.predict(X_final1,num_iteration=LGB_Reg.best_iteration_)

In [None]:
# Wrap the first chunk's predictions in a DataFrame and write them to local disk.
Y_pred_final1 = pd.DataFrame(data=Y_pred_final1)
# to_csv returns None when given a path, so its result is not assigned: rebinding the
# name to None would make a re-run of this cell overwrite the CSV with an empty frame.
Y_pred_final1.to_csv('/content/Y_pred_final1.csv', index=False)

In [None]:
# Drop the first chunk's names from the namespace before the second chunk is built.
del X_final1
del Y_pred_final1

In [None]:
# Print the column dtypes and memory footprint of the full prediction feature frame.
X_final.info()

In [None]:
# Second prediction chunk: rows 20,000,000 up to (not including) 41,697,601, first 38 columns.
# NOTE(review): the upper bound is hard-coded; iloc clips slice ends that run past the frame,
# so rows beyond this bound would be silently dropped if X_final were longer — confirm the row count.
X_final2=X_final.iloc[20000000:41697601,0:38]
# NOTE(review): X_final2 is a slice of X_final, so reassigning its columns here may emit a
# pandas SettingWithCopyWarning; X_final is already downcast in an earlier cell, so this
# call looks redundant — confirm.
reduce_mem_usage(X_final2)

In [None]:
# Predict the second chunk with the in-memory fitted model, limited to its best iteration.
Y_pred_final2 = LGB_Reg.predict(X_final2,num_iteration=LGB_Reg.best_iteration_)

In [None]:
# Wrap the second chunk's predictions in a DataFrame and write them to local disk.
Y_pred_final2 = pd.DataFrame(data=Y_pred_final2)
# to_csv returns None when given a path, so its result is not assigned: rebinding the
# name to None would make a re-run of this cell overwrite the CSV with an empty frame.
Y_pred_final2.to_csv('/content/Y_pred_final2.csv', index=False)

In [None]:
# Drop the second chunk's names from the namespace; the predictions are reloaded from disk next.
del X_final2
del Y_pred_final2

In [None]:
# Reload the first chunk's predictions from the local CSV.
Y_pred_final1 = pd.read_csv('/content/Y_pred_final1.csv')

In [None]:
# Reload the second chunk's predictions from the local CSV.
Y_pred_final2 = pd.read_csv('/content/Y_pred_final2.csv')

In [None]:
# Stack the two prediction chunks in order. ignore_index=True gives the result a fresh
# 0..n-1 index; without it both chunks keep their own labels starting at 0, and the
# duplicated labels break any index-aligned operation on this frame.
Y_pred_finalLGB_df = pd.concat([Y_pred_final1, Y_pred_final2], ignore_index=True)

In [None]:
# Downcast the stacked predictions in place (the returned frame is only displayed).
# NOTE(review): reduce_mem_usage narrows float columns to float16 whenever their min/max fit,
# which keeps only about three significant digits; the frame is written to CSV afterwards, so
# that rounding would end up in the saved predictions — confirm this is acceptable.
reduce_mem_usage(Y_pred_finalLGB_df)

In [None]:
# Print the column dtypes and memory footprint of the stacked predictions.
Y_pred_finalLGB_df.info()

In [None]:
# Write the stacked predictions to the Drive folder, with a header row and without the index.
Y_pred_finalLGB_df.to_csv(
    '/content/drive/MyDrive/Kaggle/Y_pred_finalLGB_df.csv',
    index=False,
    header=True,
)

In [None]:
# Remove the stacked frame from the namespace; later cells must not reference this name again.
del Y_pred_finalLGB_df

In [None]:
# Reload the stacked predictions from the CSV saved on Drive.
Y_pred_finalLGB=pd.read_csv('/content/drive/MyDrive/Kaggle/Y_pred_finalLGB_df.csv')

In [None]:
# Downcast the reloaded predictions in place (the returned frame is only displayed).
# NOTE(review): if the values fit the float16 range this leaves the frame in float16, and any
# arithmetic applied to it afterwards is then carried out at that precision — confirm.
reduce_mem_usage(Y_pred_finalLGB)

In [None]:
# Exponentiate the predictions.
# NOTE(review): presumably this inverts a log transform applied to the training target — confirm.
# reduce_mem_usage may have left the frame in float16, where exp() overflows to inf for inputs
# above roughly 11.09 (the float16 maximum is 65504), so the values are widened to float32 first.
Y_pred_finalLGB=np.exp(Y_pred_finalLGB.astype(np.float32))


In [None]:
# Divide by 0.2931 the predictions of every row where X_final has site_id == 0 and electricity == 1.
# The boolean mask is built from X_final, so this relies on X_final and Y_pred_finalLGB sharing
# the same row index.
# NOTE(review): 0.2931 looks like a kWh <-> kBTU unit-conversion factor for site 0 electricity
# meters — confirm against the preprocessing applied to the training target.
Y_pred_finalLGB.loc[ (X_final.site_id==0)&(X_final.electricity==1) ] /=0.2931

In [None]:
# Start from the sample submission file so the output keeps its columns and row order.
submission  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission.csv')
# Assigning a DataFrame to a single column: presumably Y_pred_finalLGB has exactly one
# column and the same row index as the sample submission — TODO confirm.
submission['meter_reading'] = Y_pred_finalLGB
# Replace any negative reading with 0.
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
# Write the final submission file without the index column.
submission.to_csv('/content/drive/MyDrive/Kaggle/submission.csv', index=False)

In [None]:
# The stacked frame Y_pred_finalLGB_df is deleted earlier in the notebook, so referencing it
# here raised NameError on a fresh Restart & Run All. Report on the live prediction frame
# instead; info() is read-only, so the values already written to the submission stay untouched.
Y_pred_finalLGB.info()

In [None]:
# Upload the saved submission file to the ashrae-energy-prediction competition via the kaggle CLI.
# NOTE(review): presumably requires the kaggle CLI to be installed and configured with API
# credentials in this runtime — confirm.
! kaggle competitions submit -c ashrae-energy-prediction -f '/content/drive/MyDrive/Kaggle/submission.csv' -m "Third submission using Light GBM algorithm"

100% 579M/579M [00:11<00:00, 53.7MB/s]
Successfully submitted to ASHRAE - Great Energy Predictor III