In [1]:
import pandas as pd
import numpy as np
from math import *
from sklearn import model_selection, metrics, dummy

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
import pickle

with open("./data/pickle", 'rb') as f:
    data_X, data_Y = pickle.load(f)

In [4]:
# 0 pour les données sans star, 1 pour energystar
data_X = data_X[0]
data_Y = data_Y[0]

In [5]:
col_num = list(data_X.select_dtypes(include=['float', 'int']).columns)
col_num

['DataYear',
 'BuildingType',
 'ZipCode',
 'CouncilDistrictCode',
 'Neighborhood',
 'Latitude',
 'Longitude',
 'YearBuilt',
 'NumberofBuildings',
 'NumberofFloors',
 'PropertyGFATotal',
 'PropertyGFAParking',
 'PropertyGFABuilding(s)',
 'LargestPropertyUseType',
 'LargestPropertyUseTypeGFA',
 'SecondLargestPropertyUseType',
 'SecondLargestPropertyUseTypeGFA',
 'ThirdLargestPropertyUseType',
 'ThirdLargestPropertyUseTypeGFA',
 'Steam',
 'Electricity',
 'Gas',
 'SimpleAddress']

In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data_X[col_num], data_Y['Energy'], test_size=0.3)

In [7]:
from sklearn.model_selection import KFold

In [8]:
kf = KFold(n_splits=8)

In [9]:
dum = dummy.DummyRegressor(strategy='median')

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X):
    # Entraînement
    dum.fit(data_X.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = dum.predict(data_X.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), y_pred, squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  -0.035290600150939766
R2 :  -0.03534169654567698
R2 :  -0.017713026432810697
R2 :  -0.09845145078352657
R2 :  -0.07919785375218602
R2 :  -0.03732071772895473
R2 :  -0.03031603107486036
R2 :  -0.069849477485459
Score R2 final : 0.050435106744301766
Score RMSE final : 240868683677696.0
Score RMSLE final : 2.2796877348829403


In [10]:
data_X_tmp = data_X[col_num]

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X_tmp):
    regr = RandomForestRegressor(max_depth=15)
    
    # Entraînement
    regr.fit(data_X_tmp.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = regr.predict(data_X_tmp.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), y_pred, squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  0.7331001370076151
R2 :  0.6084989500135252
R2 :  0.2575347151181409
R2 :  0.423074901877571
R2 :  0.37648627901783077
R2 :  0.5239531283448888
R2 :  0.350539051463704
R2 :  0.588040609575441
Score R2 final : 0.4826534715523396
Score RMSE final : 141224626545919.9
Score RMSLE final : 1.1978748684065283


In [11]:
from sklearn.linear_model import LassoCV, MultiTaskLassoCV
from sklearn.datasets import make_regression


reg = MultiTaskLassoCV(cv=5).fit(data_X[col_num], data_Y)
reg.score(data_X[col_num], data_Y)


0.5163307489347575

In [12]:
data_X_tmp = data_X[col_num]

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X_tmp):
    regr = MultiTaskLassoCV(cv=5)
    
    # Entraînement
    regr.fit(data_X_tmp.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = regr.predict(data_X_tmp.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), [[max(0, f) for f in e] for e in y_pred], squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  0.4810822919039779
R2 :  0.37020891757759644
R2 :  0.6438792597013104
R2 :  0.30518423472344614
R2 :  0.14861602379995464
R2 :  0.4504488278814423
R2 :  0.221334330374294
R2 :  0.5370045738817968
Score R2 final : 0.3947198074804773
Score RMSE final : 76831734424031.62
Score RMSLE final : 6.843540241248162


In [13]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv_models(X, y, models, params):
    # Initialiser la variable pour enregistrer le meilleur score
    best_score = 0
    # Initialiser la variable pour enregistrer le meilleur modèle
    best_model = None
    # Boucler à travers les modèles et les paramètres correspondants
    for model, model_params in zip(models, params):
        # Créer un objet GridSearchCV pour ce modèle et ces paramètres
        gs = GridSearchCV(model, model_params, cv=5, error_score='raise', scoring='r2')
        # Entraîner le modèle sur les données X et y
        gs.fit(X, y)
        print(gs.best_score_)
        print(gs.best_estimator_)
        # Si le meilleur score de ce modèle est supérieur au meilleur score actuel, le mettre à jour
        if gs.best_score_ > best_score:
            best_score = gs.best_score_
            best_model = gs.best_estimator_
    # Renvoyer le meilleur modèle
    return best_model

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Une liste de modèles
models = [LassoCV(), RandomForestRegressor()]
# Une liste de paramètres pour les modèles correspondants
params = [{'cv':[2, 4, 5, 8, 10, 15]},
          {"max_depth": [3, 5, 8, 10, 15, 20, 30, 40, 50]}]

best_model = grid_search_cv_models(data_X[col_num], data_Y['CO2'], models, params)
print("Le meilleur modèle est:", best_model)

0.3098119774009346
LassoCV(cv=4)
0.3122471606949108
RandomForestRegressor(max_depth=10)
Le meilleur modèle est: RandomForestRegressor(max_depth=10)
