In [26]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
import matplotlib.pyplot as plt  

In [27]:
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.preprocessing import QuantileTransformer, FunctionTransformer, LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, RobustScaler , PolynomialFeatures

from sklearn.compose import ColumnTransformer, make_column_transformer

In [28]:
# Importation des données et visualisation 
df = pd.read_csv('datasets/velille.csv', parse_dates=True, index_col='datetime') 
df

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,month,day,hour,year,date,is_night
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-01 00:00:00,Winter,0,0,Clear,9.84,14.395,81,0.0000,3,13,16,1,Saturday,0,2011,718,1
2011-01-01 01:00:00,Winter,0,0,Clear,9.02,13.635,80,0.0000,8,32,40,1,Saturday,1,2011,718,1
2011-01-01 02:00:00,Winter,0,0,Clear,9.02,13.635,80,0.0000,5,27,32,1,Saturday,2,2011,718,1
2011-01-01 03:00:00,Winter,0,0,Clear,9.84,14.395,75,0.0000,3,10,13,1,Saturday,3,2011,718,1
2011-01-01 04:00:00,Winter,0,0,Clear,9.84,14.395,75,0.0000,0,1,1,1,Saturday,4,2011,718,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,Fall,0,1,Clear,15.58,19.695,50,26.0027,7,329,336,12,Wednesday,19,2012,0,0
2012-12-19 20:00:00,Fall,0,1,Clear,14.76,17.425,57,15.0013,10,231,241,12,Wednesday,20,2012,0,1
2012-12-19 21:00:00,Fall,0,1,Clear,13.94,15.910,61,15.0013,4,164,168,12,Wednesday,21,2012,0,1
2012-12-19 22:00:00,Fall,0,1,Clear,13.94,17.425,61,6.0032,12,117,129,12,Wednesday,22,2012,0,1


In [29]:
df.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'casual', 'registered', 'count', 'month',
       'day', 'hour', 'year', 'date', 'is_night'],
      dtype='object')

In [30]:
df.dtypes

season         object
holiday         int64
workingday      int64
weather        object
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
month           int64
day            object
hour            int64
year            int64
date            int64
is_night        int64
dtype: object

In [31]:
weather_dict = {'Clear':1, 'Misty+Cloudy':2, 'Light Snow/Rain':3, 'Heavy Snow/Rain':4}
season_dict = {'Winter':1, 'Fall':2, 'Spring':3, 'Summer':4}
df['weather'] = df['weather'].map(weather_dict)
df['season'] = df['season'].map(season_dict)

In [32]:
# Séparation du jeu de données
# TRAIN = 01/11 au 08/12
# TEST = 09/12 au 12/12

train_df = df[df.index <= '2012-08-31'].drop(['casual','registered'], axis=1)
test_df = df[df.index >= '2012-09-01'].drop(['casual','registered'], axis=1)


In [33]:
train_df.sort_values(by='datetime', ascending=False)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day,hour,year,date,is_night
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-08-19 23:00:00,4,0,0,2,25.42,29.545,73,0.0000,46,8,Sunday,23,2012,122,1
2012-08-19 22:00:00,4,0,0,2,25.42,29.545,78,11.0014,56,8,Sunday,22,2012,122,1
2012-08-19 21:00:00,4,0,0,3,25.42,29.545,78,6.0032,190,8,Sunday,21,2012,122,1
2012-08-19 20:00:00,4,0,0,2,26.24,30.305,73,12.9980,274,8,Sunday,20,2012,122,1
2012-08-19 19:00:00,4,0,0,2,26.24,30.305,73,8.9981,341,8,Sunday,19,2012,122,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,1,Saturday,4,2011,718,1
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,13,1,Saturday,3,2011,718,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,32,1,Saturday,2,2011,718,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,40,1,Saturday,1,2011,718,1


In [34]:
test_df.index.min()

Timestamp('2012-09-01 00:00:00')

In [35]:
print(train_df.shape)
print(test_df.shape)

(9063, 15)
(1823, 15)


In [36]:
# Séparation des features et des targets

y_train = train_df['count']
X_train = train_df.drop(['count'], axis=1)

y_test = test_df['count']
X_test = test_df.drop(['count'], axis=1)

## Régresseur naïf

In [37]:
# Création d'un régresseur naïf pour comparer la performance
dr = DummyRegressor(strategy='median')
dr.fit(X_train, y_train)
y_pred = dr.predict(X_test)
dr.score(X_test, y_test)

-0.3181073023684451

## Sans GridSearchCV

In [38]:
X_train

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,month,day,hour,year,date,is_night
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,1,Saturday,0,2011,718,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,1,Saturday,1,2011,718,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,1,Saturday,2,2011,718,1
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,Saturday,3,2011,718,1
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,Saturday,4,2011,718,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-08-19 19:00:00,4,0,0,2,26.24,30.305,73,8.9981,8,Sunday,19,2012,122,0
2012-08-19 20:00:00,4,0,0,2,26.24,30.305,73,12.9980,8,Sunday,20,2012,122,1
2012-08-19 21:00:00,4,0,0,3,25.42,29.545,78,6.0032,8,Sunday,21,2012,122,1
2012-08-19 22:00:00,4,0,0,2,25.42,29.545,78,11.0014,8,Sunday,22,2012,122,1


In [39]:
#Choix des features :

numerical_features = ['temp', 'humidity']
log_features = ['atemp', 'temp', 'windspeed', 'humidity', 'date']
categorical_features = ['season', 'weather', 'holiday', 'workingday', 'year', 'month', 'day', 'hour', 'is_night']

loga = FunctionTransformer(np.log1p)

numerical_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline(loga)
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown = "ignore"))


preprocessor = make_column_transformer((categorical_pipeline, categorical_features), (log_pipeline, log_features))


In [68]:
# Modèle XGboost

from xgboost import XGBRegressor

xgb = make_pipeline(preprocessor,XGBRegressor(learning_rate=0.05, n_estimators=300, objective='reg:squarederror'))
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

print('r2', r2_score(y_test, y_pred))
print('mae', mean_absolute_error(y_test, y_pred))

r2 0.9058640431829525
mae 44.99373738427641


In [61]:
# pickle
import pickle
pickle.dump(xgb, open('XGBRegressor.pkl', 'wb'))

In [77]:
# Modèle LGBM
from lightgbm import LGBMRegressor

lgbm = make_pipeline(preprocessor,LGBMRegressor(learning_rate=0.05, n_estimators=400))
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

print('r2', r2_score(y_test, y_pred))
print('mae', mean_absolute_error(y_test, y_pred))

r2 0.9210319440325717
mae 41.544028223059186


In [78]:
# pickle
import pickle
pickle.dump(lgbm, open('LGBMRegressor.pkl', 'wb'))

In [44]:
X_train.columns

Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
       'humidity', 'windspeed', 'month', 'day', 'hour', 'year', 'date',
       'is_night'],
      dtype='object')

In [45]:
# Preprocessing RandomForestRegressor

numerical_features = ['temp', 'humidity']
log_features = ['atemp', 'temp', 'windspeed', 'humidity', 'date']
categorical_features = ['season', 'weather', 'holiday', 'workingday','year', 'month', 'day', 'hour', 'is_night']

loga = FunctionTransformer(np.log1p)

numerical_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline(loga)
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown = "ignore"))


preprocessor_rf = make_column_transformer((categorical_pipeline, categorical_features), (log_pipeline, log_features))

In [46]:
# Modèle RandomForestRegressor avec pipeline
rf = make_pipeline(preprocessor_rf,RandomForestRegressor(n_estimators = 100))
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('r2', r2_score(y_test, y_pred))
print('mae', mean_absolute_error(y_test, y_pred))

r2 0.8813124774734877
mae 48.416664838178825


In [47]:
# pickle
import pickle
pickle.dump(rf, open('RandomForestRegressor.pkl', 'wb'))

In [84]:
# Preprocessing ExtraTreesRegressor

numerical_features = ['temp', 'humidity']
log_features = ['temp', 'windspeed', 'humidity', 'date']
categorical_features = ['season', 'weather', 'holiday', 'workingday', 'year', 'month', 'day', 'hour', 'is_night']

loga = FunctionTransformer(np.log1p)

numerical_pipeline = make_pipeline(MinMaxScaler())
log_pipeline = make_pipeline(loga)
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown = "ignore"))


preprocessor_extra = make_column_transformer((categorical_pipeline, categorical_features), (log_pipeline, log_features))

In [85]:
# Modèle ExtraTreesRegressor avec pipeline
extra = make_pipeline(preprocessor_extra,ExtraTreesRegressor(n_estimators=100))
extra.fit(X_train, y_train)
y_pred = extra.predict(X_test)

print('r2', r2_score(y_test, y_pred))
print('mae', mean_absolute_error(y_test, y_pred))

r2 0.9009600585154496
mae 44.06632473944048


In [86]:
# pickle
import pickle
pickle.dump(extra, open('ExtraTreesRegressor2.pkl', 'wb'))

In [59]:
# Evaluation du training set

y_train_predict = lgbm.predict(X_train)
r2 = r2_score(y_train, y_train_predict)

print("La performance du modèle sur la base d'apprentissage")
print('le score R2 est {}'.format(r2))
print('\n')

# Evualation du test set

y_test_predict = lgbm.predict(X_test)
r2 = r2_score(y_test, y_test_predict)

print('La performance du modèle sur la base du test')
print('le score R2 est {}'.format(r2))

La performance du modèle sur la base d'apprentissage
le score R2 est 0.9627719806521129


La performance du modèle sur la base du test
le score R2 est 0.9122157173200258


## Avec GridSearchCV

In [52]:
from sklearn.model_selection import GridSearchCV

# Liste des transformations
num_transformer = Pipeline([('imputer',SimpleImputer()), ('minmax', MinMaxScaler())])
cat_transformer = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),('onehot',OneHotEncoder()), ('standard', StandardScaler(with_mean=False))])


# Liste des catégories à transformer
num_features = ['season','weather', 'temp', 'humidity', 'windspeed']
cat_features = ['holiday', 'workingday', 'month', 'day', 'hour']


preprocessor2 = ColumnTransformer([('num', num_transformer, num_features), ('cat', cat_transformer, cat_features)])

pipe_rf = Pipeline([('preprocess',preprocessor2),('rf', RandomForestRegressor())])

In [53]:
#Connaître les paramètres calculables dans GridSearchCV
pipe_rf.get_params().keys()


dict_keys(['memory', 'steps', 'verbose', 'preprocess', 'rf', 'preprocess__n_jobs', 'preprocess__remainder', 'preprocess__sparse_threshold', 'preprocess__transformer_weights', 'preprocess__transformers', 'preprocess__verbose', 'preprocess__num', 'preprocess__cat', 'preprocess__num__memory', 'preprocess__num__steps', 'preprocess__num__verbose', 'preprocess__num__imputer', 'preprocess__num__minmax', 'preprocess__num__imputer__add_indicator', 'preprocess__num__imputer__copy', 'preprocess__num__imputer__fill_value', 'preprocess__num__imputer__missing_values', 'preprocess__num__imputer__strategy', 'preprocess__num__imputer__verbose', 'preprocess__num__minmax__clip', 'preprocess__num__minmax__copy', 'preprocess__num__minmax__feature_range', 'preprocess__cat__memory', 'preprocess__cat__steps', 'preprocess__cat__verbose', 'preprocess__cat__imputer', 'preprocess__cat__onehot', 'preprocess__cat__standard', 'preprocess__cat__imputer__add_indicator', 'preprocess__cat__imputer__copy', 'preprocess__c

In [54]:
# Paramètres

param_rf = {'rf__n_estimators': [1, 5, 8],
            'rf__max_depth' : [1, 2, 3],
            'rf__min_samples_split' : [2, 5, 8],
            'rf__max_leaf_nodes' : [2, 3]
            }



#Initialisation

grid_rf = GridSearchCV(pipe_rf, param_rf, cv=2, scoring= 'r2')

#Entraînement
grid_result_rf = grid_rf.fit(X_train, y_train)


print('Meilleur score pour RF: ', grid_result_rf.best_score_)
print('avec paramètres suivants: ', grid_result_rf.best_params_)
print('--------------------------------------')



Traceback (most recent call last):
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 236, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 53, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\utils\metaestimators.py", line 120, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklear

Meilleur score pour RF:  nan
avec paramètres suivants:  {'rf__max_depth': 1, 'rf__max_leaf_nodes': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 1}
--------------------------------------


Traceback (most recent call last):
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 236, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\metrics\_scorer.py", line 53, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklearn\utils\metaestimators.py", line 120, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "C:\Users\Apprenant\anaconda3\envs\devia\lib\site-packages\sklear

In [55]:
# from sklearn.model_selection import learning_curve, StratifiedKFold

# skf = StratifiedKFold(n_splits=5)

# N, train_score, val_score = learning_curve(rf, X_train, y_train, train_sizes = np.linspace(0.1,1,50), cv=skf, scoring='r2')
# plt.plot(N, val_score.mean(axis=1), label='validation')
# plt.plot(N, train_score.mean(axis=1), label='train')

# plt.xlabel('train_sizes')
# plt.title('RandomForestRegressor learning curve')
# plt.legend()
# plt.show() 

In [56]:
# from sklearn.model_selection import learning_curve

# train_sizes, train_scores, test_scores = learning_curve(et, X_train, y_train, cv=None, scoring='r2', n_jobs=-1, train_sizes=range(10,7220,20))

In [57]:
# train_mean = np.mean(train_scores, axis=1)
# train_std = np.std(train_scores, axis=1)

# test_mean = np.mean(test_scores, axis=1)
# test_std = np.std(test_scores, axis=1)

In [58]:
# import matplotlib.pyplot as plt

# plt.subplots(1, figsize=(10,10))
# plt.plot(train_sizes, train_mean, '--', color="#EF2D04",  label="Training score")
# plt.plot(train_sizes, test_mean, color="#34AA06", label="Test score")


# plt.title("Courbe d'apprentissage")
# plt.xlabel("Nombre de samples"), plt.ylabel("R2"), plt.legend(loc="best")
# plt.tight_layout()
# plt.show()