In [7]:
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/ashrae.zip ashrae.zip
!unzip -q ashrae.zip

Mounted at /content/drive


In [49]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

from IPython.core.pylabtools import figsize
from plotly.offline import init_notebook_mode,iplot,plot
%matplotlib inline
init_notebook_mode(connected=True)
plt.style.use("ggplot")

from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split


In [50]:
#Function to reduction memory usage (Source code from Kaggle)
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [51]:
building = pd.read_csv('/content/building_metadata.csv')
weather = pd.read_csv('/content/weather_train.csv')
train = pd.read_csv('/content/train.csv')

train.loc[(train.meter == 0),'meter_reading'] = train['meter_reading'] * 0.2931 # convertir du kwh vers kbtu

building = reduce_mem_usage(building)
weather = reduce_mem_usage(weather)
train = reduce_mem_usage(train)

Memory usage after optimization is: 0.03 MB
Decreased by 60.3%
Memory usage after optimization is: 3.07 MB
Decreased by 68.1%
Memory usage after optimization is: 289.19 MB
Decreased by 53.1%


##apperçues de nos données

In [52]:
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [53]:
building.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [9]:
weather.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


on joint les données ci-dessus a partir des clés

In [10]:
train = (train.merge(building, on='building_id', how='left'))
train['log_meter_reading'] = np.log1p(train['meter_reading'])
train = train.merge(weather, how='left', on = ['site_id', 'timestamp'], validate='many_to_one')

#train  = time_features(train)

del weather, building
gc.collect()

136

On convertie la variable timestamp en un objet datetime

In [11]:
train.timestamp = pd.to_datetime(train.timestamp)

train['month'] = train['timestamp'].dt.month.astype(np.int8)
train['dayofweek'] = train['timestamp'].dt.dayofweek.astype(np.int8)
train['hour'] = train['timestamp'].dt.hour.astype(np.int8)

On regarde le nombre de valeurs manquantes dans chaque variable.

In [12]:
print(train.columns)
print('--------------------------')
for col in train.columns:
  if train[col].isnull().values.any():
    print('Dans la variable', '(',col,')' , '-- il y a', train[col].isnull().values.sum() , ' Nan ')

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'log_meter_reading', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'month', 'dayofweek', 'hour'],
      dtype='object')
--------------------------
Dans la variable ( year_built ) -- il y a 12127645  Nan 
Dans la variable ( floor_count ) -- il y a 16709167  Nan 
Dans la variable ( air_temperature ) -- il y a 96658  Nan 
Dans la variable ( cloud_coverage ) -- il y a 8825365  Nan 
Dans la variable ( dew_temperature ) -- il y a 100140  Nan 
Dans la variable ( precip_depth_1_hr ) -- il y a 3749023  Nan 
Dans la variable ( sea_level_pressure ) -- il y a 1231669  Nan 
Dans la variable ( wind_direction ) -- il y a 1449048  Nan 
Dans la variable ( wind_speed ) -- il y a 143676  Nan 


On supprime les variables qui ont plus de 150 000 valeurs manquantes

In [13]:
train = train.drop([ 'timestamp' , 'meter_reading' ] , axis= 1)
train = train.drop(['year_built' , 'floor_count' , 'cloud_coverage' , 'precip_depth_1_hr' , 'wind_direction' , 'sea_level_pressure'] , axis= 1)

In [14]:
train.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,log_meter_reading,air_temperature,dew_temperature,wind_speed,month,dayofweek,hour
0,0,0,0,Education,7432,0.0,25.0,20.0,0.0,1,4,0
1,1,0,0,Education,2720,0.0,25.0,20.0,0.0,1,4,0
2,2,0,0,Education,5376,0.0,25.0,20.0,0.0,1,4,0
3,3,0,0,Education,23685,0.0,25.0,20.0,0.0,1,4,0
4,4,0,0,Education,116607,0.0,25.0,20.0,0.0,1,4,0


On supprime les lignes qui contiennent les valeurs manquantes

In [15]:
train = train.dropna(axis=0)

for col in train.columns:
  if train[col].isnull().values.any():
    print(col , 'il y a', train[col].isnull().values.sum() , ' Nan ')

In [16]:
print(train.shape)

(20067705, 12)


transformation de la variable primary_use grace au LabelEncoder()

In [17]:
le = LabelEncoder()
le.fit(train['primary_use'])

train['primary_use'] = le.transform(train['primary_use'])

In [18]:
train.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,log_meter_reading,air_temperature,dew_temperature,wind_speed,month,dayofweek,hour
0,0,0,0,0,7432,0.0,25.0,20.0,0.0,1,4,0
1,1,0,0,0,2720,0.0,25.0,20.0,0.0,1,4,0
2,2,0,0,0,5376,0.0,25.0,20.0,0.0,1,4,0
3,3,0,0,0,23685,0.0,25.0,20.0,0.0,1,4,0
4,4,0,0,0,116607,0.0,25.0,20.0,0.0,1,4,0


### First model

### Data

In [22]:
#data

X= train.drop(['log_meter_reading'],axis=1)
y =train['log_meter_reading']

### Spliting data into training set and test set

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print("la taille de X_train est: ",X_train.shape)
print("la taille de y_train est: ",y_train.shape)

la taille de X_train est:  (16054164, 11)
la taille de y_train est:  (16054164,)


### 1 DecisionTreeRegressor

In [26]:
from sklearn.tree import DecisionTreeRegressor

decisionT_model = DecisionTreeRegressor(min_samples_split = 200, min_samples_leaf = 150)
decisionT_model = decisionT_model.fit(X_train, y_train)



###1.1 accuracy du model

In [27]:
y_pred_test = decisionT_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("l'erreur RMSE est: ",mean_squared_error(y_test, y_pred_test,squared=False))


l'erreur RMSE est:  0.7109444855615193


### 2 RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

RF_model= RandomForestRegressor(min_samples_split=100,min_samples_leaf=50)
RF_model= RF_model.fit(X_train, y_train)



### 2.1 Accuracy du modèle

In [47]:
y_pred_test_rf = RF_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("l'erreur RMSE est: ",mean_squared_error(y_test, y_pred_test_rf,squared=False))


l'erreur RMSE est:  0.6872086300431037


### 3 BaggingRegressor

In [43]:
from sklearn.ensemble import BaggingRegressor

Bg_model= BaggingRegressor(n_estimators=10,max_samples=300,max_features=11 )
Bg_model= Bg_model.fit(X_train, y_train)

### 3.1 Accuracy du modèle

In [44]:
y_pred_test_bg = Bg_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("l'erreur RMSE est: ",mean_squared_error(y_test, y_pred_test_bg,squared=False))


l'erreur RMSE est:  1.770618800664036


### 4 GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

Gb_model = GradientBoostingRegressor()
Gb_model = Gb_model.fit(X_train, y_train)


### 4.1 Accuracy du modèle

In [None]:
y_pred_test_Gb = Gb_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("l'erreur RMSE est: ",mean_squared_error(y_test, y_pred_test_Gb,squared=False))


### 5 SGDRegressor

### 5.1 Accuracy modèle

In [64]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

sgd_model = make_pipeline(StandardScaler(),SGDRegressor(max_iter=3000, tol=1e-3))
sgd_model.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('sgdregressor',
                 SGDRegressor(alpha=0.0001, average=False, early_stopping=False,
                              epsilon=0.1, eta0=0.01, fit_intercept=True,
                              l1_ratio=0.15, learning_rate='invscaling',
                              loss='squared_loss', max_iter=3000,
                              n_iter_no_change=5, penalty='l2', power_t=0.25,
                              random_state=None, shuffle=True, tol=0.001,
                              validation_fraction=0.1, verbose=0,
                              warm_start=False))],
         verbose=False)

In [65]:
y_pred_test_sgd = sgd_model.predict(X_test)

from sklearn.metrics import mean_squared_error

print("l'erreur RMSE est: ",mean_squared_error(y_test, y_pred_test_sgd,squared=False))

l'erreur RMSE est:  1.9069443027620574
