In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    examined; every other column (int8, unsigned, bool, object, ...) is left alone.
    Integer columns are stored in the first of int8/int16/int32/int64 whose range
    contains the column's min and max (limits included). Float columns are stored
    as float16 or float32 when min and max lie within that type's finite range,
    otherwise as float64.

    NOTE: the float check looks at the value range only. float16 keeps roughly
    three significant decimal digits, so values that fit its range may still be
    rounded by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column yields NaN for min/max; every comparison is then
            # False and the column keeps its dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Range exceeds float32 (or is NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte frame cannot trigger a division by zero.
        saved = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved))

    return df

In [None]:
test = pd.read_csv('/content/drive/MyDrive/Kaggle/test.csv')

In [None]:
test = reduce_mem_usage(test)

Memory usage after optimization is: 1709.94 MB
Decreased by 85.9%


In [None]:
x_train = pd.read_csv('/content/drive/MyDrive/Kaggle/x_train.csv')

In [None]:
x_train = reduce_mem_usage(x_train)

Memory usage after optimization is: 485.81 MB
Decreased by 85.9%


In [None]:
x_test = pd.read_csv('/content/drive/MyDrive/Kaggle/x_test.csv')

In [None]:
x_test = reduce_mem_usage(x_test)

Memory usage after optimization is: 266.36 MB
Decreased by 85.9%


In [None]:
y_train = pd.read_csv('/content/drive/MyDrive/Kaggle/y_train.csv')

In [None]:
y_test = pd.read_csv('/content/drive/MyDrive/Kaggle/y_test.csv')

In [None]:
# Function used to downcast
def downcast_dtypes(df):
    """Downcast float64 columns to float16 and int64 columns to int16 where safe.

    A column is converted only when its min and max lie inside the finite range
    of the target dtype. Columns holding larger magnitudes (or +/-inf) keep their
    64-bit dtype, because an unconditional cast would silently wrap integers and
    turn large floats into inf. Empty and all-NaN columns have no range to
    violate and are converted.

    NOTE: float16 keeps roughly three significant decimal digits, so converted
    float columns can still be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    def _fits(series, limits):
        # True when every value of `series` is representable within `limits`.
        lowest, highest = series.min(), series.max()
        if pd.isna(lowest) or pd.isna(highest):
            return True
        return limits.min <= lowest and highest <= limits.max

    conversions = (
        ("float64", np.float16, np.finfo(np.float16)),
        ("int64", np.int16, np.iinfo(np.int16)),
    )
    for source_dtype, target, limits in conversions:
        # Select the candidate columns first, then convert them one by one.
        candidates = [c for c in df if df[c].dtype == source_dtype]
        for col in candidates:
            if _fits(df[col], limits):
                df[col] = df[col].astype(target)

    return df

In [None]:
# Second downcasting pass over the three feature frames. downcast_dtypes assigns the
# converted columns back into the frame it receives, so the return values are not
# captured here. y_train and y_test are not passed through it.
downcast_dtypes(test)
downcast_dtypes(x_train)
downcast_dtypes(x_test)

# Première soumission

In [None]:
# Baseline model: CatBoostRegressor built with no arguments, i.e. the library defaults.
CB = CatBoostRegressor()
# The value returned by fit() is kept as CB_fit and used for the predictions in the
# following cells.
CB_fit = CB.fit(x_train,y_train)

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
# Root mean squared error of the predictions against y_test (square root of the MSE).
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

In [None]:
submission3  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
# Exponentiate the model output before storing it in the submission column.
# NOTE(review): presumably the target in y_train is log-transformed; if it was built
# with log1p the matching inverse would be expm1 — confirm against the preprocessing.
submission3['meter_reading'] = np.exp(Y_pred)

In [None]:
# NOTE(review): 3.4118 is an unexplained scale factor applied to every prediction —
# presumably a unit conversion; confirm its origin and document it.
submission3['meter_reading'] = 3.4118 * submission3['meter_reading']

In [None]:
# Defensive clamp of negative readings to 0. The column was just produced by np.exp
# and multiplied by a positive constant, so as written this mask should never match.
submission3.loc[submission3['meter_reading']<0, 'meter_reading'] = 0

In [None]:
submission3.to_csv('/content/drive/MyDrive/Kaggle/submission3.csv', index=False)

# Deuxième soumission

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# mean_squared_error is a loss (lower is better). make_scorer defaults to
# greater_is_better=True, which would make GridSearchCV select the candidate with the
# HIGHEST error. With greater_is_better=False the scorer returns the negated MSE, so
# maximising the score picks the lowest-error parameters.
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [None]:
# Search grid: 3 values of l2_leaf_reg x 3 values of depth, with learning_rate and
# iterations each fixed to a single value (9 candidates in total).
# NOTE(review): iterations is pinned to 5 — presumably to keep the search fast; models
# this small may not rank hyperparameters the way a full-length fit would. Confirm.
param_grid = {'learning_rate':[0.1],'l2_leaf_reg':[0,5,10],'depth':[5,10,15],'iterations':[5]}

# Fresh estimator with default settings; GridSearchCV applies each grid candidate to it.
CB = CatBoostRegressor()

# Candidates are ranked with the `mse` scorer object created in the preceding cell.
# No cv argument is given, so GridSearchCV falls back to its default splitting.
CB_GridSearch = GridSearchCV(estimator = CB, param_grid = param_grid, scoring=mse)

# Run the search on the training split; the fitted search object is kept as CB_search.
CB_search = CB_GridSearch.fit(x_train, y_train)

0:	learn: 1.6715401	total: 786ms	remaining: 3.15s
1:	learn: 1.6108039	total: 1.38s	remaining: 2.08s
2:	learn: 1.5591830	total: 1.99s	remaining: 1.33s
3:	learn: 1.5135180	total: 2.57s	remaining: 642ms
4:	learn: 1.4753901	total: 3.15s	remaining: 0us
0:	learn: 1.6733733	total: 596ms	remaining: 2.38s
1:	learn: 1.6121575	total: 1.21s	remaining: 1.81s
2:	learn: 1.5590107	total: 1.78s	remaining: 1.19s
3:	learn: 1.5141288	total: 2.36s	remaining: 590ms
4:	learn: 1.4739602	total: 2.94s	remaining: 0us
0:	learn: 1.6713677	total: 590ms	remaining: 2.36s
1:	learn: 1.6082521	total: 1.17s	remaining: 1.76s
2:	learn: 1.5545715	total: 1.75s	remaining: 1.17s
3:	learn: 1.5085780	total: 2.35s	remaining: 588ms
4:	learn: 1.4695750	total: 2.93s	remaining: 0us
0:	learn: 1.6752623	total: 584ms	remaining: 2.34s
1:	learn: 1.6131565	total: 1.18s	remaining: 1.77s
2:	learn: 1.5604442	total: 1.75s	remaining: 1.17s
3:	learn: 1.5134142	total: 2.36s	remaining: 589ms
4:	learn: 1.4743311	total: 2.94s	remaining: 0us
0:	learn

In [None]:
CB_search.best_params_

{'depth': 5, 'iterations': 5, 'l2_leaf_reg': 10, 'learning_rate': 0.1}

In [None]:
# Model for the second submission, with explicitly chosen hyperparameters.
# NOTE(review): learning_rate=0.05 and l2_leaf_reg=2 are not among the values listed in
# param_grid, so this configuration does not come from CB_search.best_params_ —
# confirm that is intended.
CB = CatBoostRegressor(learning_rate=0.05,l2_leaf_reg=2,depth=5)

In [None]:
CB_fit = CB.fit(x_train,y_train)

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

1.3321807829619867


In [None]:
submission4  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
submission4['meter_reading'] = np.exp(Y_pred)

In [None]:
submission4['meter_reading'] = 3.4118 * submission4['meter_reading']

In [None]:
submission4.loc[submission4['meter_reading']<0, 'meter_reading'] = 0

In [None]:
submission4.to_csv('/content/drive/MyDrive/Kaggle/submission4.csv', index=False)

# Troisième soumission

In [None]:
CB = CatBoostRegressor(learning_rate=0.1,l2_leaf_reg=2,depth=5)

In [None]:
CB_fit = CB.fit(x_train,y_train)

0:	learn: 1.6733129	total: 1.02s	remaining: 17m 1s
1:	learn: 1.6117948	total: 1.93s	remaining: 16m 3s
2:	learn: 1.5587413	total: 2.81s	remaining: 15m 33s
3:	learn: 1.5141157	total: 3.69s	remaining: 15m 18s
4:	learn: 1.4756513	total: 4.57s	remaining: 15m 9s
5:	learn: 1.4404946	total: 5.47s	remaining: 15m 6s
6:	learn: 1.4056341	total: 6.33s	remaining: 14m 58s
7:	learn: 1.3791935	total: 7.24s	remaining: 14m 57s
8:	learn: 1.3571586	total: 8.1s	remaining: 14m 51s
9:	learn: 1.3326952	total: 8.97s	remaining: 14m 48s
10:	learn: 1.3098521	total: 9.88s	remaining: 14m 47s
11:	learn: 1.2944634	total: 10.7s	remaining: 14m 42s
12:	learn: 1.2784760	total: 11.7s	remaining: 14m 46s
13:	learn: 1.2663261	total: 12.5s	remaining: 14m 40s
14:	learn: 1.2486759	total: 13.4s	remaining: 14m 37s
15:	learn: 1.2316587	total: 14.3s	remaining: 14m 37s
16:	learn: 1.2200885	total: 15.1s	remaining: 14m 35s
17:	learn: 1.2109100	total: 16s	remaining: 14m 35s
18:	learn: 1.2034144	total: 16.9s	remaining: 14m 30s
19:	learn:

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

1.355940455646705


In [None]:
submission4  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
submission4['meter_reading'] = np.exp(Y_pred)

In [None]:
submission4['meter_reading'] = 3.4118 * submission4['meter_reading']

In [None]:
submission4.loc[submission4['meter_reading']<0, 'meter_reading'] = 0

In [None]:
# NOTE(review): this is the third submission, but it writes to submission4.csv — the
# same path the second-submission section already wrote — so that earlier file is
# overwritten. Confirm whether a distinct file name was intended.
submission4.to_csv('/content/drive/MyDrive/Kaggle/submission4.csv', index=False)

# Quatrième soumission

In [None]:
CB = CatBoostRegressor(learning_rate=0.1,l2_leaf_reg=5,depth=5)

In [None]:
CB_fit = CB.fit(x_train,y_train)

0:	learn: 1.6733135	total: 1.01s	remaining: 16m 53s
1:	learn: 1.6117959	total: 1.91s	remaining: 15m 52s
2:	learn: 1.5587426	total: 2.77s	remaining: 15m 20s
3:	learn: 1.5141171	total: 3.65s	remaining: 15m 8s
4:	learn: 1.4756529	total: 4.52s	remaining: 15m
5:	learn: 1.4404963	total: 5.4s	remaining: 14m 55s
6:	learn: 1.4056359	total: 6.25s	remaining: 14m 46s
7:	learn: 1.3791953	total: 7.14s	remaining: 14m 45s
8:	learn: 1.3571605	total: 7.98s	remaining: 14m 38s
9:	learn: 1.3326972	total: 8.84s	remaining: 14m 34s
10:	learn: 1.3098540	total: 9.72s	remaining: 14m 33s
11:	learn: 1.2944655	total: 10.5s	remaining: 14m 28s
12:	learn: 1.2784782	total: 11.5s	remaining: 14m 32s
13:	learn: 1.2663283	total: 12.3s	remaining: 14m 25s
14:	learn: 1.2486782	total: 13.1s	remaining: 14m 22s
15:	learn: 1.2316610	total: 14s	remaining: 14m 22s
16:	learn: 1.2200911	total: 14.9s	remaining: 14m 20s
17:	learn: 1.2109126	total: 15.8s	remaining: 14m 20s
18:	learn: 1.2034171	total: 16.6s	remaining: 14m 14s
19:	learn: 

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

1.3477382535704174


In [None]:
submission4  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
submission4['meter_reading'] = np.exp(Y_pred)

In [None]:
submission4['meter_reading'] = 3.4118 * submission4['meter_reading']

In [None]:
submission4.loc[submission4['meter_reading']<0, 'meter_reading'] = 0

In [None]:
# NOTE(review): this is the fourth submission, but it writes to submission4.csv — the
# same path used by the second and third submission sections — so the earlier file is
# overwritten again. Confirm whether a distinct file name was intended.
submission4.to_csv('/content/drive/MyDrive/Kaggle/submission4.csv', index=False)

# Cinquième soumission

In [None]:
CB = CatBoostRegressor(learning_rate=0.2,l2_leaf_reg=8,depth=5)

In [None]:
CB_fit = CB.fit(x_train,y_train)

0:	learn: 1.6064895	total: 992ms	remaining: 16m 31s
1:	learn: 1.5032071	total: 1.85s	remaining: 15m 23s
2:	learn: 1.4298863	total: 2.7s	remaining: 14m 56s
3:	learn: 1.3745897	total: 3.56s	remaining: 14m 46s
4:	learn: 1.3161829	total: 4.39s	remaining: 14m 32s
5:	learn: 1.2744893	total: 5.21s	remaining: 14m 23s
6:	learn: 1.2401600	total: 6.11s	remaining: 14m 26s
7:	learn: 1.2210682	total: 6.9s	remaining: 14m 15s
8:	learn: 1.2059347	total: 7.71s	remaining: 14m 8s
9:	learn: 1.1830491	total: 8.6s	remaining: 14m 11s
10:	learn: 1.1680637	total: 9.44s	remaining: 14m 8s
11:	learn: 1.1489944	total: 10.3s	remaining: 14m 4s
12:	learn: 1.1373366	total: 11.1s	remaining: 14m
13:	learn: 1.1287072	total: 11.9s	remaining: 13m 55s
14:	learn: 1.1152062	total: 12.7s	remaining: 13m 50s
15:	learn: 1.1048242	total: 13.5s	remaining: 13m 49s
16:	learn: 1.0944185	total: 14.3s	remaining: 13m 49s
17:	learn: 1.0881441	total: 15.2s	remaining: 13m 48s
18:	learn: 1.0834771	total: 16s	remaining: 13m 46s
19:	learn: 1.07

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

In [None]:
submission5  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
submission5['meter_reading'] = np.exp(Y_pred)

In [None]:
submission5['meter_reading'] = 3.4118 * submission5['meter_reading']

In [None]:
submission5.loc[submission5['meter_reading']<0, 'meter_reading'] = 0

In [None]:
submission5.to_csv('/content/drive/MyDrive/Kaggle/submission5.csv', index=False)

# Sixième soumission

In [None]:
CB = CatBoostRegressor(learning_rate=0.1,l2_leaf_reg=20,depth=5)

In [None]:
CB_fit = CB.fit(x_train,y_train)

0:	learn: 1.6733162	total: 946ms	remaining: 15m 44s
1:	learn: 1.6118011	total: 1.77s	remaining: 14m 42s
2:	learn: 1.5587493	total: 2.55s	remaining: 14m 7s
3:	learn: 1.5141241	total: 3.36s	remaining: 13m 57s
4:	learn: 1.4756605	total: 4.15s	remaining: 13m 46s
5:	learn: 1.4405049	total: 4.96s	remaining: 13m 42s
6:	learn: 1.4056450	total: 5.73s	remaining: 13m 32s
7:	learn: 1.3792048	total: 6.54s	remaining: 13m 31s
8:	learn: 1.3571703	total: 7.32s	remaining: 13m 26s
9:	learn: 1.3327069	total: 8.1s	remaining: 13m 22s
10:	learn: 1.3098639	total: 8.91s	remaining: 13m 21s
11:	learn: 1.2944763	total: 9.68s	remaining: 13m 16s
12:	learn: 1.2784892	total: 10.5s	remaining: 13m 20s
13:	learn: 1.2663396	total: 11.3s	remaining: 13m 14s
14:	learn: 1.2486894	total: 12.1s	remaining: 13m 12s
15:	learn: 1.2316728	total: 12.9s	remaining: 13m 11s
16:	learn: 1.2201040	total: 13.7s	remaining: 13m 9s
17:	learn: 1.2109260	total: 14.5s	remaining: 13m 9s
18:	learn: 1.2034309	total: 15.2s	remaining: 13m 3s
19:	lear

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)

1.3452162066720512


In [None]:
submission6  = pd.read_csv('/content/drive/MyDrive/Kaggle/sample_submission/sample_submission.csv')

In [None]:
Y_pred = CB_fit.predict(test)

In [None]:
submission6['meter_reading'] = np.exp(Y_pred)

In [None]:
submission6['meter_reading'] = 3.4118 * submission6['meter_reading']

In [None]:
submission6.loc[submission6['meter_reading']<0, 'meter_reading'] = 0

In [None]:
submission6.to_csv('/content/drive/MyDrive/Kaggle/submission6.csv', index=False)

# Septième soumission

In [None]:
# Model for the seventh submission: the lowest learning rate tried in this notebook
# (0.01), plus the first use of one_hot_max_size.
# NOTE(review): one_hot_max_size concerns categorical features, but the fit() call in
# the next cell passes no cat_features — confirm the setting has any effect here.
CB = CatBoostRegressor(learning_rate=0.01,l2_leaf_reg=15,depth=5,one_hot_max_size=10)

In [None]:
CB_fit = CB.fit(x_train,y_train)

In [None]:
y_pred_CB = CB_fit.predict(x_test)

In [None]:
RMSE_CB = mean_squared_error(y_test, y_pred_CB) ** 0.5

In [None]:
print(RMSE_CB)