In [1]:
#from google.colab import drive
#drive.mount('/content/drive')
#!cp /content/drive/MyDrive/ashrae.zip ashrae.zip
#!unzip -q ashrae.zip

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

from IPython.core.pylabtools import figsize
from plotly.offline import init_notebook_mode,iplot,plot
%matplotlib inline
init_notebook_mode(connected=True)
plt.style.use("ggplot")

from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Concatenate, Embedding,Flatten,Dropout
from keras.optimizers import Adam, SGD, RMSprop
import keras.backend as K

# importer les fichiers .py sur le notebook
%run utils.py

In [3]:
# Read the three raw CSV tables.
building = pd.read_csv('/content/building_metadata.csv')
weather = pd.read_csv('/content/weather_train.csv')
train = pd.read_csv('/content/train.csv')

# Scale every meter-0 reading by 0.2931.
# NOTE(review): 0.2931 is the kBTU -> kWh factor (1 kBTU ~= 0.2931 kWh); the
# original note described the opposite direction. The scaling is applied to
# all meter-0 rows — confirm both the units and the row selection are intended.
is_meter_0 = train.meter == 0
train.loc[is_meter_0, 'meter_reading'] *= 0.2931

# Shrink each frame's memory footprint (helper presumably defined in utils.py).
building = reduce_mem_usage(building)
weather = reduce_mem_usage(weather)
train = reduce_mem_usage(train)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 4.51 MB
Decreased by 53.0%
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%


In [4]:
# Assemble the modelling frame in one pass:
#   1. attach the building metadata to each reading,
#   2. add the log1p-transformed target,
#   3. attach the weather observation of the same site and timestamp
#      (validate= makes pandas raise if weather keys are not unique),
#   4. add the time-derived columns via time_features (defined outside this notebook).
train = (
    train
    .merge(building, how='left', on='building_id')
    .assign(log_meter_reading=lambda frame: np.log1p(frame['meter_reading']))
    .merge(weather, how='left', on=['site_id', 'timestamp'], validate='many_to_one')
    .pipe(time_features)
)

# The lookup tables are no longer needed once merged; release their memory.
del weather, building
gc.collect()

47

In [5]:
# Show the available columns, then report the NaN count of every column that has any.
print(train.columns)
print('--------------------------')
for col in train.columns:
    n_missing = train[col].isnull().values.sum()
    if n_missing:
        print('Dans la variable', '(',col,')' , '-- il y a', n_missing , ' Nan ')

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'log_meter_reading', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'hour', 'dayofweek', 'month'],
      dtype='object')
--------------------------
Dans la variable ( year_built ) -- il y a 12127645  Nan 
Dans la variable ( floor_count ) -- il y a 16709167  Nan 
Dans la variable ( air_temperature ) -- il y a 96658  Nan 
Dans la variable ( cloud_coverage ) -- il y a 8825365  Nan 
Dans la variable ( dew_temperature ) -- il y a 100140  Nan 
Dans la variable ( precip_depth_1_hr ) -- il y a 3749023  Nan 
Dans la variable ( sea_level_pressure ) -- il y a 1231669  Nan 
Dans la variable ( wind_direction ) -- il y a 1449048  Nan 
Dans la variable ( wind_speed ) -- il y a 143676  Nan 


## On supprime les variables qui contiennent plus de 150 000 valeurs manquantes.

In [6]:
# Columns that are not used as model inputs: the raw timestamp and the
# untransformed meter reading.
unused_cols = ['timestamp', 'meter_reading']
# Feature columns removed because of their missing values.
sparse_cols = [
    'year_built',
    'floor_count',
    'cloud_coverage',
    'precip_depth_1_hr',
    'wind_direction',
    'sea_level_pressure',
]
train = train.drop(columns=unused_cols + sparse_cols)

## On supprime les lignes qui contiennent des valeurs manquantes.

In [7]:
# Keep only the rows in which every column is populated.
train = train.dropna(how='any')

# Sanity check: list any column that still holds NaNs (prints nothing when clean).
for col in train.columns:
    n_missing = train[col].isnull().values.sum()
    if n_missing:
        print(col , 'il y a', n_missing , ' Nan ')

In [8]:
# Display the (rows, columns) shape of the training frame at this point.
train.shape

(20067705, 12)

In [9]:
# Replace the `primary_use` values with LabelEncoder integer codes.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
train['primary_use'] = le.fit_transform(train['primary_use'])

In [10]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,log_meter_reading,air_temperature,dew_temperature,wind_speed,hour,dayofweek,month
0,0,0,0,0,7432,0.0,25.0,20.0,0.0,0,4,0
1,1,0,0,0,2720,0.0,25.0,20.0,0.0,0,4,0
2,2,0,0,0,5376,0.0,25.0,20.0,0.0,0,4,0
3,3,0,0,0,23685,0.0,25.0,20.0,0.0,0,4,0
4,4,0,0,0,116607,0.0,25.0,20.0,0.0,0,4,0


In [11]:
def create_model(X , categorical_vars , numerical_vars):
    """Build the (uncompiled) embedding + dense regression network.

    Every categorical column gets a scalar input and a 10-dimensional
    embedding; every numerical column gets a scalar input. The flattened
    embeddings go through two dense layers, are then joined with all raw
    inputs, and go through two more dense layers ending in one linear unit.

    Args:
        X: Table indexable by column name (``X[col]``) holding the
            categorical columns as non-negative integer codes.
        categorical_vars: Names of the columns to embed (at least one).
        numerical_vars: Names of the columns fed in as plain scalars.

    Returns:
        A Keras ``Model`` whose inputs are ordered as
        ``categorical_vars + numerical_vars``.

    Raises:
        ValueError: If ``categorical_vars`` is empty.
    """
    input_layers = []
    embed_layers = []
    for col in categorical_vars:
        # Embedding's input_dim must be (largest index + 1). Counting the
        # distinct values undersizes the table whenever the integer codes
        # have gaps, which sends the largest codes out of range.
        num_classes = int(np.max(X[col])) + 1

        input_x = Input(shape = [1] , name = col)
        input_layers.append(input_x)

        embed_x = Embedding(num_classes, 10)(input_x)
        embed_layers.append(embed_x)

    if not embed_layers:
        raise ValueError("create_model needs at least one categorical variable")

    for col in numerical_vars:
        input_layers.append(Input(shape=[1] , name = col))

    # Merge with Concatenate layers (not the backend op) so every tensor in
    # the graph is the output of a Keras layer.
    flat_embeds = [Flatten()(emb_layer) for emb_layer in embed_layers]
    if len(flat_embeds) == 1:
        # Concatenate requires at least two inputs.
        concat_emb = flat_embeds[0]
    else:
        concat_emb = Concatenate()(flat_embeds)

    d1  = Dense(128,activation='relu')(concat_emb)
    dr1 = Dropout(0.1)(d1)
    bn1 = BatchNormalization()(dr1)

    d2  = Dense(128,activation='relu')(bn1)
    dr2 = Dropout(0.1)(d2)

    # All raw inputs (categorical codes included) are joined with the
    # embedding branch, as in the original architecture.
    main = Concatenate()(input_layers + [dr2])

    d3  = Dense(128,activation='relu')(main)
    dr3 = Dropout(0.1)(d3)
    bn3 = BatchNormalization()(dr3)

    d4  = Dense(128,activation='relu')(bn3)
    dr4 = Dropout(0.1)(d4)

    output = Dense(1)(dr4)
    model = Model(input_layers, output)
    return model

def rmse(y_true, y_pred):
    """Root-mean-squared error between predictions and targets."""
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [12]:
# Column roles: the embedded categoricals come first, then the plain numeric inputs.
categorical_vars = ['building_id' , 'meter' , 'site_id' , 'primary_use' , 'hour' , 'dayofweek' , 'month']
numerical_vars = ['square_feet' , 'air_temperature' , 'dew_temperature' , 'wind_speed']
co_vars = categorical_vars + numerical_vars
model = create_model(train , categorical_vars , numerical_vars)

# Target as an (n, 1) column vector; features as a single 2-D array in co_vars order.
y = train.log_meter_reading.values.reshape(-1, 1)
X = train[co_vars].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# The model takes one 1-D array per input, in the same order as co_vars.
X_train = [X_train[:, i].copy() for i in range(X_train.shape[1])]
X_val = [X_val[:, i].copy() for i in range(X_val.shape[1])]

# The DataFrame is no longer needed once the arrays exist; release its memory.
del train
gc.collect()

440

In [None]:
# Train with an MSE loss and track RMSE as a metric.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(optimizer = Adam(learning_rate=1e-4), loss= 'mse',  metrics=[rmse])
history_1 = model.fit(X_train, y_train , epochs= 30 , batch_size = 1000 , validation_data = (X_val,y_val)  )

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
 1874/16055 [==>...........................] - ETA: 1:33 - loss: 0.7495 - rmse: 0.8639