# Import

In [63]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
import xgboost as xgb

# sklearn utilities
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# DL utilities
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor


# Load data

In [18]:
data_train = pd.read_csv('./data/train_dataset.csv')
data_test = pd.read_csv('./data/test_dataset.csv')

# Explore

In [13]:
data_train.shape

(9972, 9)

In [14]:
data_train.head()

Unnamed: 0,date,season,holiday,weather,temp,feeling_temp,humidity,windspeed,count
0,2011-01-01 00:00:00,winter,False,Few Clouds,9.84,14.395,81.0,0.0,16
1,2011-01-01 01:00:00,winter,False,Few Clouds,9.02,13.635,80.0,0.0,40
2,2011-01-01 02:00:00,winter,False,Few Clouds,9.02,13.635,80.0,0.0,32
3,2011-01-01 03:00:00,winter,False,Clear,9.84,14.395,75.0,0.0,13
4,2011-01-01 04:00:00,winter,False,Few Clouds,9.84,14.395,75.0,0.0,1


In [15]:
data_train.describe()

Unnamed: 0,temp,feeling_temp,humidity,windspeed,count
count,9939.0,9936.0,9942.0,9948.0,9972.0
mean,20.248943,23.673803,61.883826,12.815347,191.727838
std,7.788614,8.48024,19.188561,8.161022,181.081932
min,0.82,0.76,0.0,0.0,1.0
25%,13.94,16.665,47.0,7.0015,43.0
50%,20.5,24.24,62.0,12.998,145.0
75%,26.24,31.06,77.0,16.9979,284.0
max,41.0,45.455,100.0,56.9969,977.0


In [19]:
data_train.isnull().sum()

date             0
season           9
holiday         27
weather         26
temp            33
feeling_temp    36
humidity        30
windspeed       24
count            0
dtype: int64

# Columns

In [20]:
for col in data_train.columns:
    print(col)
    print(data_train[col].value_counts())

date
2011-01-01 00:00:00    1
2012-05-02 01:00:00    1
2012-05-01 18:00:00    1
2012-05-01 19:00:00    1
2012-05-01 20:00:00    1
                      ..
2011-09-02 07:00:00    1
2011-09-02 09:00:00    1
2011-09-02 10:00:00    1
2011-09-02 11:00:00    1
2012-12-19 23:00:00    1
Name: date, Length: 9972, dtype: int64
season
summer     2520
fall       2499
springs    2493
winter     2451
Name: season, dtype: int64
holiday
False    9651
True      294
Name: holiday, dtype: int64
weather
Few Clouds    3326
Clear         3245
Mist          2595
Light snow     394
Light rain     385
Heavy rain       1
Name: weather, dtype: int64
temp
14.76    430
26.24    416
28.70    391
13.94    385
18.86    372
22.14    369
16.40    368
25.42    365
27.06    364
24.60    355
22.96    350
12.30    344
21.32    332
13.12    327
29.52    315
17.22    311
10.66    308
18.04    307
20.50    301
30.34    280
9.84     267
15.58    228
31.16    227
9.02     224
27.88    204
8.20     201
32.80    187
23.78    184


In [21]:
data_train.dropna(inplace=True)

In [22]:
data_train.isnull().sum()

date            0
season          0
holiday         0
weather         0
temp            0
feeling_temp    0
humidity        0
windspeed       0
count           0
dtype: int64

# X & y

In [24]:
data_train.columns

Index(['date', 'season', 'holiday', 'weather', 'temp', 'feeling_temp',
       'humidity', 'windspeed', 'count'],
      dtype='object')

In [94]:
X = data_train.drop(columns=['date', 'count'])
y = data_train['count']

# X traitement

In [95]:
X.replace(
    {'holiday': {True: 1, False: 0}},
inplace=True)

In [96]:
X.replace(
    {'weather': {'Light rain': 'Rain', 'Heavy rain': 'Rain'}},
inplace=True)

In [97]:
seasons = pd.get_dummies(X['season'], prefix='season')
X = pd.concat([X, seasons], axis=1)
X.drop('season', axis=1, inplace=True)

In [98]:
weathers = pd.get_dummies(X['weather'], prefix='weather')
X = pd.concat([X, weathers], axis=1)
X.drop('weather', axis=1, inplace=True)

# Statistical fixing

In [99]:
standardize_cols = ['temp', 'feeling_temp', 'humidity', 'windspeed']
scaler = StandardScaler()
X[standardize_cols] = scaler.fit_transform(X[standardize_cols])

# Func

In [85]:
def treat(X, standardize_cols=standardize_cols, scaler=scaler):

    X.replace(
        {'holiday': {True: 1, False: 0}},
    inplace=True)

    X.replace(
        {'weather': {'Light rain': 'Rain', 'Heavy rain': 'Rain'}},
    inplace=True)

    seasons = pd.get_dummies(X['season'], prefix='season')
    X = pd.concat([X, seasons], axis=1)
    X.drop('season', axis=1, inplace=True)
 
    weathers = pd.get_dummies(X['weather'], prefix='weather')
    X = pd.concat([X, weathers], axis=1)
    X.drop('weather', axis=1, inplace=True)

    X[standardize_cols] = scaler.transform(X[standardize_cols])

    return X

# Feature selection

In [53]:
mi_scores = mutual_info_regression(X, y)
mi_scores_series = pd.Series(mi_scores, index=X.columns)
mi_scores_sorted = mi_scores_series.sort_values(ascending=False)
print(mi_scores_sorted)

temp                  0.140000
feeling_temp          0.131060
humidity              0.100307
season_winter         0.039111
windspeed             0.018721
weather_Mist          0.007494
weather_Rain          0.005754
season_springs        0.002324
season_summer         0.000100
holiday               0.000000
season_fall           0.000000
weather_Clear         0.000000
weather_Few Clouds    0.000000
weather_Light snow    0.000000
dtype: float64


In [58]:
tree = DecisionTreeRegressor()
tree.fit(X, y)
for column, impo in zip(X.columns,tree.feature_importances_):
    print(f"{column}: {impo:.2f}")

holiday: 0.01
temp: 0.14
feeling_temp: 0.22
humidity: 0.25
windspeed: 0.20
season_fall: 0.03
season_springs: 0.02
season_summer: 0.02
season_winter: 0.01
weather_Clear: 0.03
weather_Few Clouds: 0.03
weather_Light snow: 0.01
weather_Mist: 0.03
weather_Rain: 0.01


In [100]:
X_weather = X[['temp', 'feeling_temp', 'humidity', 'windspeed']]

# model

In [102]:
xgb_reg = xgb.XGBRegressor()

param_grid = {
    'n_estimators': [500, 200],
    'max_depth': [5, 7],
    'learning_rate': [0.01, 0.05]
}

grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_weather, y, verbose=2)

print("Best hyperparameters: ", grid_search.best_params_)

y_pred = grid_search.predict(X_weather)

rmse = mean_squared_error(y, y_pred, squared=False)

print("RMSE: ", rmse)

Best hyperparameters:  {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
RMSE:  153.29103826364792


In [89]:
model_final = grid_search.best_estimator_

# Submission

In [82]:
data_test.columns

Index(['date', 'season', 'holiday', 'weather', 'temp', 'feeling_temp',
       'humidity', 'windspeed'],
      dtype='object')

In [84]:
dates = data_test['date']
X_test = data_test.drop(columns=['date'])

In [86]:
X_test = treat(X_test)

In [90]:
predictions = model_final.predict(X_test)

In [92]:
columns = {
    'date': dates,
    'count': [int(x) for x in predictions]
}

In [93]:
pd.DataFrame(columns).to_csv('submission.csv', index=False)