In [1]:
import pandas as pd
import numpy as np
from math import *
from sklearn import model_selection, metrics, dummy

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
import pickle

with open("./data/pickle", 'rb') as f:
    data_X, data_Y = pickle.load(f)

In [4]:
# 0 pour les données sans star, 1 pour energystar
data_X = data_X[0]
data_Y = data_Y[0]

In [5]:
col_num = list(data_X.select_dtypes(include=['float', 'int']).columns)
col_num

['BuildingType',
 'PrimaryPropertyType',
 'ZipCode',
 'CouncilDistrictCode',
 'Neighborhood',
 'Latitude',
 'Longitude',
 'YearBuilt',
 'NumberofBuildings',
 'NumberofFloors',
 'PropertyGFATotal',
 'PropertyGFAParking',
 'PropertyGFABuilding(s)',
 'LargestPropertyUseType',
 'LargestPropertyUseTypeGFA',
 'SecondLargestPropertyUseType',
 'SecondLargestPropertyUseTypeGFA',
 'ThirdLargestPropertyUseType',
 'ThirdLargestPropertyUseTypeGFA',
 'Steam',
 'Electricity',
 'Gas',
 'SimpleAddress',
 'NumberOfPropertyUseTypes',
 'SurfaceInside',
 'Age',
 'EquatorProximity',
 'SeaProximity',
 'SeaProximityLog',
 'SeaProximityLimit',
 'SeaProximityLogLimit']

In [6]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data_X[col_num], data_Y['Energy'], test_size=0.3)

In [7]:
from sklearn.model_selection import KFold

In [8]:
kf = KFold(n_splits=8)

In [9]:
dum = dummy.DummyRegressor(strategy='median')

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X):
    # Entraînement
    dum.fit(data_X.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = dum.predict(data_X.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), y_pred, squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  -0.03509961405163475
R2 :  -0.11067168990771836
R2 :  -0.024344143904872717
R2 :  -0.030476916985302527
R2 :  -0.06065029581975723
R2 :  -0.05453289570819242
R2 :  -0.015704315555755133
R2 :  -0.0767006165803309
Score R2 final : 0.051022561064195504
Score RMSE final : 240823770284032.0
Score RMSLE final : 2.278740158819589


In [10]:
data_X_tmp = data_X[col_num]

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X_tmp):
    regr = RandomForestRegressor(max_depth=15)
    
    # Entraînement
    regr.fit(data_X_tmp.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = regr.predict(data_X_tmp.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), y_pred, squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  0.708803523029698
R2 :  0.5438610556371254
R2 :  0.25529256598593325
R2 :  0.713867694661211
R2 :  0.4975411864945473
R2 :  0.33946202380404117
R2 :  0.34450518451060363
R2 :  0.6059530470394804
Score R2 final : 0.50116078514533
Score RMSE final : 145363573832897.84
Score RMSLE final : 1.193926823244094


In [11]:
from sklearn.linear_model import LassoCV, MultiTaskLassoCV
from sklearn.datasets import make_regression


reg = MultiTaskLassoCV(cv=5).fit(data_X[col_num], data_Y)
reg.score(data_X[col_num], data_Y)


0.5240912145340775

In [12]:
data_X_tmp = data_X[col_num]

score_final = {'R2':0, 'RMSE':0, 'RMSLE':0}

for train, test in kf.split(data_X_tmp):
    regr = MultiTaskLassoCV(cv=4)
    
    # Entraînement
    regr.fit(data_X_tmp.iloc[train], data_Y.iloc[train])

    # Prédiction sur le jeu de test
    y_pred = regr.predict(data_X_tmp.iloc[test])

    
    y_test = data_Y.iloc[test]
    
    # Evaluate
    score = metrics.r2_score(y_test, y_pred)
    print('R2 : ', score)
    score_final['R2'] += abs(score)
    
    score = metrics.mean_squared_error(y_test, y_pred, squared=True)
    #print('RMSE : ', score)
    score_final['RMSE'] += score
    
    score = metrics.mean_squared_log_error(y_test.applymap(lambda x:max(0, x)), [[max(0, f) for f in e] for e in y_pred], squared=True)
    #print('RMSLE : ', score)
    score_final['RMSLE'] += score

for k, score in score_final.items():
    print('Score {} final : {}'.format(k, score/kf.get_n_splits()))

R2 :  0.44818534226758694
R2 :  0.4216657902196019
R2 :  0.31717709836572927
R2 :  0.3448347759744504
R2 :  0.5048594186582999
R2 :  0.38576708535762794
R2 :  0.20711034633524955
R2 :  0.5873648358842445
Score R2 final : 0.4021205866328488
Score RMSE final : 70541299470321.66
Score RMSLE final : 5.914829956579185


In [13]:
from sklearn.model_selection import GridSearchCV

def grid_search_cv_models(X, y, models, params):
    # Initialiser la variable pour enregistrer le meilleur score
    best_score = 0
    # Initialiser la variable pour enregistrer le meilleur modèle
    best_model = None
    # Boucler à travers les modèles et les paramètres correspondants
    for model, model_params in zip(models, params):
        # Créer un objet GridSearchCV pour ce modèle et ces paramètres
        gs = GridSearchCV(model, model_params, cv=5, error_score='raise', scoring='r2')
        # Entraîner le modèle sur les données X et y
        gs.fit(X, y)
        print(gs.best_score_)
        print(gs.best_estimator_)
        # Si le meilleur score de ce modèle est supérieur au meilleur score actuel, le mettre à jour
        if gs.best_score_ > best_score:
            best_score = gs.best_score_
            best_model = gs.best_estimator_
    # Renvoyer le meilleur modèle
    return best_model

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Une liste de modèles
models = [LassoCV(), RandomForestRegressor()]
# Une liste de paramètres pour les modèles correspondants
params = [{'cv':[2, 4, 5, 8, 10, 15]},
          {"max_depth": [3, 5, 8, 10, 15, 20, 30, 40, 50]}]

best_model = grid_search_cv_models(data_X[col_num], data_Y['CO2'], models, params)
print("Le meilleur modèle est:", best_model)

0.17789592681783759
LassoCV(cv=2)
0.36956244150180423
RandomForestRegressor(max_depth=15)
Le meilleur modèle est: RandomForestRegressor(max_depth=15)
