In [1]:
# Fetch the dataset archive from Google Drive and unpack it into the working directory.
# NOTE(review): the copy below reads from /content/drive, so Drive has to be mounted
# beforehand; the two mount lines are commented out and must be re-enabled on a fresh runtime.
#from google.colab import drive
#drive.mount('/content/drive')
!cp /content/drive/MyDrive/ashrae.zip ashrae.zip
!unzip -q ashrae.zip

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

from IPython.core.pylabtools import figsize
from plotly.offline import init_notebook_mode,iplot,plot
%matplotlib inline
init_notebook_mode(connected=True)
plt.style.use("ggplot")

from sklearn.utils import shuffle
from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization, Lambda
from keras.layers import Embedding,Flatten,Dropout, Add,Activation
from keras.optimizers import Adam, SGD, RMSprop
import keras.backend as K

# importer les fichiers .py sur le notebook
%run utils.py

In [3]:
# Load the building metadata, the training weather records and the training meter readings.
building = pd.read_csv('/content/building_metadata.csv')
weather = pd.read_csv('/content/weather_train.csv')
train = pd.read_csv('/content/train.csv')

# Rescale the readings of meter 0 by 0.2931. That constant is the kBTU -> kWh factor
# (1 kBTU ~= 0.2931 kWh), so this converts kBTU to kWh, not kWh to kBTU.
# NOTE(review): the rescaling hits every row with meter == 0 -- confirm that this is the
# intended subset of readings.
train.loc[(train.meter == 0),'meter_reading'] = train['meter_reading'] * 0.2931

In [4]:
def join_data(data , weather_data , building_data):
    """Attach building metadata and weather records to the meter readings.

    Each of the three frames is first passed through ``reduce_mem_usage``
    (defined in utils.py; presumably a dtype downcast -- it reports memory
    usage when run). The readings are then left-joined to the buildings on
    ``building_id`` and to the weather on ``(site_id, timestamp)``.

    Parameters
    ----------
    data : DataFrame of meter readings (needs ``building_id`` and ``timestamp``).
    weather_data : DataFrame of weather records keyed by ``site_id`` and ``timestamp``.
    building_data : DataFrame of building metadata keyed by ``building_id``.

    Returns
    -------
    DataFrame with one row per input reading; unmatched rows keep NaN in the
    joined columns.

    Raises
    ------
    pandas.errors.MergeError
        If the weather frame holds duplicate ``(site_id, timestamp)`` keys.
    """
    building_small = reduce_mem_usage(building_data)
    weather_small = reduce_mem_usage(weather_data)
    readings = reduce_mem_usage(data)

    with_buildings = readings.merge(building_small, on='building_id', how='left')

    return with_buildings.merge(
        weather_small,
        how='left',
        on=['site_id', 'timestamp'],
        validate='many_to_one',
    )


def preprocess_data(data , cols_to_scale , cols_to_drop):
    """Clean, encode and standardise the joined frame.

    Steps, in order: drop ``cols_to_drop``, drop every row that still has a
    missing value, integer-encode ``primary_use`` with a LabelEncoder,
    standardise each column of ``cols_to_scale`` to zero mean / unit standard
    deviation (statistics computed on the remaining rows), then hand the frame
    to ``time_features`` (defined in utils.py).

    Parameters
    ----------
    data : DataFrame containing ``primary_use`` and all listed columns.
    cols_to_scale : list of numeric column names to standardise.
    cols_to_drop : list of column names to remove before the NaN filter.

    Returns
    -------
    Whatever ``time_features`` returns for the processed frame.
    """
    cleaned = data.drop(cols_to_drop, axis=1).dropna(axis=0)

    cleaned['primary_use'] = LabelEncoder().fit_transform(cleaned['primary_use'])

    for col in cols_to_scale:
        column = cleaned[col]
        cleaned[col] = (column - column.mean()) / column.std()

    return time_features(cleaned)

In [5]:
# Attach building metadata and weather records to every meter reading.
train = join_data(train , weather , building)

# Train on a log-scaled target. log1p(x) equals log(1 + x) but keeps its precision
# for readings close to zero, where forming `x + 1` first loses low-order digits.
train['log_meter_reading'] = np.log1p(train['meter_reading'])


# The merged frame now carries everything needed; release the source frames.
del weather, building
gc.collect()

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 4.51 MB
Decreased by 53.0%
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%


13

In [6]:
print(train.columns)
print('--------------------------')
# List every column that contains missing values together with its NaN count.
for col in train.columns:
  missing = train[col].isnull().values
  if missing.any():
    print('Dans la variable', '(',col,')' , '-- il y a', missing.sum() , ' Nan ')

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'log_meter_reading'],
      dtype='object')
--------------------------
Dans la variable ( year_built ) -- il y a 12127645  Nan 
Dans la variable ( floor_count ) -- il y a 16709167  Nan 
Dans la variable ( air_temperature ) -- il y a 96658  Nan 
Dans la variable ( cloud_coverage ) -- il y a 8825365  Nan 
Dans la variable ( dew_temperature ) -- il y a 100140  Nan 
Dans la variable ( precip_depth_1_hr ) -- il y a 3749023  Nan 
Dans la variable ( sea_level_pressure ) -- il y a 1231669  Nan 
Dans la variable ( wind_direction ) -- il y a 1449048  Nan 
Dans la variable ( wind_speed ) -- il y a 143676  Nan 


## On supprime les variables qui contiennent plus de 150 000 valeurs manquantes.

In [7]:
# Preview the first rows of the joined frame (readings + building metadata + weather).
train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,log_meter_reading
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,0.0


In [8]:
# Compact missing-value report: one line per column that contains NaN.
for col in train.columns:
  missing = train[col].isnull().values
  if missing.any():
    print(col , 'il y a', missing.sum() , ' Nan ')

year_built il y a 12127645  Nan 
floor_count il y a 16709167  Nan 
air_temperature il y a 96658  Nan 
cloud_coverage il y a 8825365  Nan 
dew_temperature il y a 100140  Nan 
precip_depth_1_hr il y a 3749023  Nan 
sea_level_pressure il y a 1231669  Nan 
wind_direction il y a 1449048  Nan 
wind_speed il y a 143676  Nan 


In [9]:
# Only the log-scaled target is kept; the raw reading column is removed.
train = train.drop(columns=['meter_reading'])

# Columns discarded entirely (each has more than 150 000 missing values).
cols_to_drop = [
    'year_built',
    'floor_count',
    'cloud_coverage',
    'precip_depth_1_hr',
    'wind_direction',
    'sea_level_pressure',
]
# Continuous features that get standardised inside preprocess_data.
cols_to_scale = [
    'square_feet',
    'air_temperature',
    'dew_temperature',
    'wind_speed',
]

train = preprocess_data(train, cols_to_scale, cols_to_drop)

In [10]:
# Sanity check after preprocessing: preprocess_data drops rows with NaN,
# so this loop is expected to print nothing.
for col in train.columns:
  missing = train[col].isnull().values
  if missing.any():
    print(col , 'il y a', missing.sum() , ' Nan ')

In [11]:
# Number of rows and columns left after preprocessing.
train.shape

(20067705, 13)

In [12]:
# Preview the preprocessed frame: encoded primary_use, standardised numeric columns
# and the hour / dayofweek / month columns.
train.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,air_temperature,dew_temperature,wind_speed,log_meter_reading,hour,dayofweek,month
0,0,0,2016-01-01 00:00:00,0,0,-0.856176,0.859954,1.225522,-1.551346,0.0,0,4,0
1,1,0,2016-01-01 00:00:00,0,0,-0.8964,0.859954,1.225522,-1.551346,0.0,0,4,0
2,2,0,2016-01-01 00:00:00,0,0,-0.873727,0.859954,1.225522,-1.551346,0.0,0,4,0
3,3,0,2016-01-01 00:00:00,0,0,-0.717434,0.859954,1.225522,-1.551346,0.0,0,4,0
4,4,0,2016-01-01 00:00:00,0,0,0.075786,0.859954,1.225522,-1.551346,0.0,0,4,0


In [13]:
# The raw timestamp is not a model input; the hour / dayofweek / month columns
# (presumably derived from it by time_features) are kept instead.
train = train.drop(columns=['timestamp'])

In [14]:
# Shuffle the rows; the fixed random_state makes the resulting order reproducible.
train = shuffle(train, random_state=0)

In [15]:
# Preview the shuffled frame (the index shows the original row positions).
train.head()

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,air_temperature,dew_temperature,wind_speed,log_meter_reading,hour,dayofweek,month
19012003,166,3,2,4,3.802808,0.192285,-0.489685,-0.180781,4.974006,18,5,11
2943606,1272,1,14,9,1.192895,-0.688666,0.008921,0.040278,0.0,18,2,1
13187863,908,2,9,0,-0.140637,1.425618,1.444909,-0.622899,3.106826,23,6,7
6351780,699,0,5,0,-0.119211,-0.901949,-0.768905,1.631902,2.076409,23,3,3
1149633,1375,2,15,6,0.064527,-1.977637,-1.656425,-0.180781,6.137587,20,3,0


In [16]:
def residual_block(input_x, nodes):
    """Residual-style block: two stacked Dense layers plus a projected shortcut.

    The main path applies two ReLU Dense layers of width ``nodes`` to
    ``input_x``. The shortcut applies a single ReLU Dense layer of the same
    width directly to ``input_x`` so that both branches share a shape. The
    two branches are summed element-wise.

    Parameters
    ----------
    input_x : Keras tensor fed to both branches.
    nodes : int, number of units of every Dense layer in the block.

    Returns
    -------
    Keras tensor holding the sum of the shortcut and the main path.
    """
    # Layers are created in the order main path -> shortcut.
    hidden = Dense(nodes, activation='relu')(input_x)
    hidden = Dense(nodes, activation='relu')(hidden)

    shortcut = Dense(nodes, activation='relu')(input_x)

    return Add()([shortcut, hidden])

def create_model(data , categorical_vars , numerical_vars , embedding_dim = 10):
    """Build the embedding + residual-MLP regression network (uncompiled).

    Every categorical variable gets its own scalar input and embedding table.
    The flattened embeddings are concatenated and passed through two residual
    blocks, joined with the scalar numerical inputs, and passed through four
    more residual blocks down to a single linear output unit.

    Parameters
    ----------
    data : mapping from column name to array-like (e.g. a DataFrame)
        Only read to size the embedding tables. Categorical columns are
        assumed to hold non-negative integer codes -- TODO confirm for callers
        other than this notebook.
    categorical_vars : list of str
        Names of the embedded inputs.
    numerical_vars : list of str
        Names of the inputs fed in directly as scalars.
    embedding_dim : int, default 10
        Output width of every embedding table.

    Returns
    -------
    Model
        Its inputs are ordered as ``categorical_vars`` followed by
        ``numerical_vars``.
    """
    categ_inputs = []
    embed_layers = []
    for col in categorical_vars:
        # Embedding needs input_dim = largest index + 1. Counting distinct values
        # under-sizes the table as soon as the codes have a gap (for instance an
        # id whose rows were all dropped), which makes lookups go out of range.
        num_classes = int(np.max(data[col])) + 1

        input_x = Input(shape = [1] , name = col)
        categ_inputs.append(input_x)

        embed_x = Embedding(num_classes, embedding_dim)(input_x)
        embed_layers.append(embed_x)

    num_inputs = []
    for col in numerical_vars:
        num_inputs.append(Input(shape=[1] , name = col))

    # NOTE(review): backend concatenate is applied directly to layer outputs here;
    # confirm the installed Keras version accepts backend ops inside a functional model.
    concat_emb = K.concatenate([Flatten()(emb_layer) for emb_layer in embed_layers])

    # Categorical-only trunk.
    d1 = residual_block(concat_emb , 512)
    d2 = residual_block(d1 , 512)
    # Numerical inputs join the network after the categorical trunk.
    main = K.concatenate(num_inputs+[d2])

    d3 = residual_block(main , 512)
    d4 = residual_block(d3 , 512)
    d5 = residual_block(d4 , 64)

    d6 = residual_block(d5 , 16)
    # Single linear unit: the regression output.
    output = Dense(1 )(d6)
    model  = Model(categ_inputs + num_inputs, output)

    return model


def flatten_cols(X):
    """Split a 2-D array into a list holding one 1-D array per column.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_rows, n_cols)

    Returns
    -------
    list of numpy.ndarray
        ``n_cols`` arrays of shape ``(n_rows,)``; each one is a copy of the
        corresponding column.
    """
    # The column count comes from the shape rather than from the first row,
    # so a matrix with zero rows yields empty columns instead of raising IndexError.
    n_cols = X.shape[1]
    return [X[:, i].flatten() for i in range(n_cols)]

def split_data(data, categorical_vars, numerical_vars, validation_split = 0.2, target_var = 'log_meter_reading'):
    """Extract the feature matrix and target column, optionally holding out a validation set.

    Parameters
    ----------
    data : DataFrame holding the feature columns and ``target_var``.
    categorical_vars, numerical_vars : lists of column names; the feature
        matrix columns follow the order ``categorical_vars + numerical_vars``.
    validation_split : fraction of rows held out for validation; ``0``
        disables the split.
    target_var : name of the target column.

    Returns
    -------
    ``(X, y)`` when ``validation_split`` is 0, otherwise
    ``(X_train, X_val, y_train, y_val)``. The target is always a column
    vector of shape ``(n_rows, 1)``.
    """
    target = data[target_var].values
    y = target.reshape((len(target), 1))

    feature_cols = categorical_vars + numerical_vars
    X = data[feature_cols].values

    if validation_split != 0:
        # Fixed seed so the held-out rows are the same on every run.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=validation_split, random_state=42
        )
        return X_train, X_val, y_train, y_val

    return X, y

def rmse(y_true, y_pred):
    """Root mean squared error between predictions and targets (Keras backend ops)."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [18]:
# Features routed through embedding tables vs. features fed in as plain scalars.
categorical_vars = ['building_id' , 'meter' , 'site_id' , 'primary_use' , 'hour' , 'dayofweek' , 'month']
numerical_vars = ['square_feet' , 'air_temperature' , 'dew_temperature' , 'wind_speed']
# The frame is passed so create_model can size each embedding table from the data.
model = create_model( train, categorical_vars ,  numerical_vars)

# validation_split = 0 makes split_data return the full (X, y) with no held-out validation set.
X , y = split_data(train , categorical_vars , numerical_vars , validation_split = 0)

# The model has one Input per feature, so the matrix is split into a list of 1-D columns.
# Column order is categorical_vars + numerical_vars, the same order as the model inputs.
X = flatten_cols(X)
#X_val = flatten_cols(X_val)


# The arrays above are all that training needs; release the DataFrame.
del train
gc.collect()

3473

In [20]:
# `learning_rate` is the current name of the Adam step-size argument; the former `lr`
# alias is deprecated and rejected by recent Keras releases.
# The loss is MSE on the log-scaled target; rmse is tracked as a readable metric.
model.compile(optimizer = Adam(learning_rate=1e-4), loss= 'mse',  metrics=[rmse])
# NOTE(review): no validation data is passed, so the history only records training metrics.
history_1 = model.fit(X, y , epochs= 100 , batch_size = 1024 )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78