In [None]:
#from google.colab import drive
#drive.mount('/content/drive')
#!cp /content/drive/MyDrive/ashrae.zip ashrae.zip
#!unzip -q ashrae.zip

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

from IPython.core.pylabtools import figsize
from plotly.offline import init_notebook_mode,iplot,plot
%matplotlib inline
init_notebook_mode(connected=True)
plt.style.use("ggplot")
from sklearn.utils import shuffle

from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from keras.models import load_model
from keras.models import Model 
from keras.layers import Input, Dense, Lambda
from keras.layers import Embedding, Flatten, Dropout, Add, Activation
from keras.optimizers import Adam 
from keras.callbacks import ModelCheckpoint,EarlyStopping
import keras.backend as K

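# Execute utils.py inside the notebook namespace (presumably provides reduce_mem_usage and time_features, which are used below but not defined in this notebook)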
%run utils.py

In [None]:
# Load the raw CSVs: building metadata, training weather and training meter readings.
building = pd.read_csv('/content/building_metadata.csv')
weather = pd.read_csv('/content/weather_train.csv')
train = pd.read_csv('/content/train.csv')

# Rescale every meter == 0 reading by 0.2931.
# NOTE(review): the original comment described this as "kWh -> kBTU", but 0.2931 is the
# kBTU -> kWh factor (1 kBTU ~= 0.2931 kWh). Confirm the intended direction, and whether
# the rescaling should apply to all meter-0 rows or only to a subset of sites.
train.loc[(train.meter == 0),'meter_reading'] = train['meter_reading'] * 0.2931

In [None]:
def join_data(data , weather_data , building_data):
    """Merge meter readings with building metadata and weather observations.

    Each of the three frames is first passed through ``reduce_mem_usage``
    (building, then weather, then readings). The readings are left-joined to
    the buildings on ``building_id`` and then left-joined to the weather on
    ``(site_id, timestamp)``; the second join is validated as many-to-one.

    Returns the merged frame.
    """
    compact_building = reduce_mem_usage(building_data)
    compact_weather = reduce_mem_usage(weather_data)
    readings = reduce_mem_usage(data)

    with_building = readings.merge(compact_building, on='building_id', how='left')
    return with_building.merge(
        compact_weather,
        how='left',
        on=['site_id', 'timestamp'],
        validate='many_to_one',
    )


def preprocess_data(data , cols_to_scale , cols_to_drop):
    """Clean, encode and standardise the training frame.

    Steps, in order:
      1. drop ``cols_to_drop`` and every row that still contains a NaN;
      2. label-encode ``primary_use``;
      3. standardise each column in ``cols_to_scale`` to zero mean / unit std;
      4. call ``time_features`` on the result (helper defined outside this
         notebook; presumably adds the calendar columns used by the model).

    Returns
    -------
    data : the processed frame
    stats : dict mapping column -> (mean, std) of the column *before* scaling,
        so the very same transform can be re-applied to unseen data
    LE : the fitted ``LabelEncoder`` for ``primary_use``
    """
    data = data.drop(cols_to_drop, axis=1)
    data = data.dropna(axis=0)

    LE = LabelEncoder()
    data['primary_use'] = LE.fit_transform(data['primary_use'])

    stats = {}
    for col in cols_to_scale:
        # Capture the statistics BEFORE overwriting the column. Measuring them
        # after scaling would always yield ~(0, 1), making `stats` useless for
        # transforming the test set.
        col_mean = data[col].mean()
        col_std = data[col].std()
        stats[col] = (col_mean, col_std)
        data[col] = (data[col] - col_mean) / col_std

    data = time_features(data)

    return data , stats , LE

In [None]:
# Attach building metadata and weather observations to every meter reading.
train = join_data(train , weather , building)

# Model the target on a log scale: log(reading + 1). Predictions are mapped back
# with exp(pred) - 1 in the inference loop at the end of the notebook.
train['log_meter_reading'] = np.log(train['meter_reading'] + 1)


# The source frames are no longer referenced after the merge; release their memory.
del weather, building
gc.collect()

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 4.51 MB
Decreased by 53.0%
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%


35

In [None]:
#print(train.columns)
#print('--------------------------')
#for col in train.columns:
#  if train[col].isnull().values.any():
#    print('Dans la variable', '(',col,')' , '-- il y a', train[col].isnull().values.sum() , ' Nan ')

## Drop the variables that contain more than 150,000 missing values.

In [None]:
# The raw target is dropped; only its log-transformed version is kept for training.
train = train.drop([ 'meter_reading' ] , axis= 1)
# Columns removed before modelling (see the missing-value note above).
cols_to_drop = ['year_built' , 'floor_count' , 'cloud_coverage' , 'precip_depth_1_hr' , 'wind_direction' , 'sea_level_pressure']
# Numerical columns to standardise; the same list is reused for the test set.
cols_to_scale = ['square_feet' , 'air_temperature' , 'dew_temperature' , 'wind_speed']

# `stats` and `label_encoder` are kept so the test set can be transformed identically.
train , stats, label_encoder = preprocess_data(train, cols_to_scale, cols_to_drop)

In [None]:
# Sanity check: list every column that still holds missing values, with its count.
missing_per_column = train.isnull().sum()
for col, n_missing in missing_per_column[missing_per_column > 0].items():
    print(col , 'il y a', n_missing , ' Nan ')

In [None]:
# The raw timestamp is not a model input, so it is removed here.
train = train.drop([ 'timestamp' ] , axis= 1)

In [None]:
# Shuffle the rows with a fixed seed so the resulting order is reproducible.
train = shuffle(train, random_state=0)

In [None]:
def residual_block(input_x, nodes):
    """Two stacked ReLU dense layers summed with a ReLU dense projection of the input.

    The projection brings ``input_x`` to ``nodes`` units so both branches have
    matching widths for the element-wise ``Add``.
    """
    hidden = Dense(nodes, activation='relu')(input_x)
    deep_branch = Dense(nodes, activation='relu')(hidden)

    shortcut = Dense(nodes, activation='relu')(input_x)

    return Add()([shortcut, deep_branch])

def create_model(data , categorical_vars , numerical_vars):
    """Build the embedding + residual-MLP regression network.

    Parameters
    ----------
    data : frame holding every column named in ``categorical_vars``; it is only
        used to size the embedding tables.
    categorical_vars : names of integer-coded columns; each gets its own scalar
        ``Input`` and a 3-dimensional ``Embedding``.
    numerical_vars : names of numerical columns; each gets its own scalar ``Input``.

    Returns
    -------
    An uncompiled ``Model`` whose inputs are the categorical inputs followed by
    the numerical inputs (in the given order) and whose output is one linear unit.

    NOTE(review): sizing the embeddings from the largest code can change the
    weight shapes relative to a model built with the previous sizing; weights
    saved separately from the architecture would then need retraining.
    """
    from keras.layers import Concatenate  # only needed inside this function

    def _merge(tensors):
        # Concatenate layers expect several inputs; one tensor passes through as is.
        return tensors[0] if len(tensors) == 1 else Concatenate()(tensors)

    categ_inputs = []
    embed_layers = []
    for col in categorical_vars:
        # Embedding indices must lie in [0, input_dim). Counting distinct values
        # under-sizes the table as soon as the codes have gaps (for example when
        # all rows of one id were dropped), so also honour the largest code.
        num_classes = max(len(np.unique(data[col])), int(data[col].max()) + 1)

        input_x = Input(shape = [1] , name = col)
        categ_inputs.append(input_x)
        embed_layers.append(Embedding(num_classes, 3)(input_x))

    num_inputs = [Input(shape=[1] , name = col) for col in numerical_vars]

    # Use a Concatenate layer rather than a backend op so every node of the
    # functional graph is a proper Keras layer.
    concat_emb = _merge([Flatten()(emb_layer) for emb_layer in embed_layers])

    d1 = residual_block(concat_emb , 512)
    main = _merge(num_inputs + [d1])

    d3 = residual_block(main , 512)
    d6 = residual_block(d3 , 8)

    output = Dense(1)(d6)
    model = Model(categ_inputs + num_inputs, output)

    return model


def flatten_cols(X):
    """Split a 2-D array into a list of 1-D column arrays.

    The result has one independent (copied) 1-D array per column of ``X``, in
    column order, which is the layout a multi-input Keras model expects.
    The column count is read from ``X.shape`` so that an array with zero rows
    yields a list of empty columns instead of raising ``IndexError``.
    """
    X = np.asarray(X)
    return [X[:, j].flatten() for j in range(X.shape[1])]

def split_data(data, categorical_vars, numerical_vars, validation_split = 0.2, target_var = 'log_meter_reading'):
    """Extract the feature matrix and target column, optionally holding out a validation set.

    Features are the categorical columns followed by the numerical ones; the
    target is returned as an ``(n, 1)`` array.

    Returns ``(X, y)`` when ``validation_split`` is 0, otherwise
    ``(X_train, X_val, y_train, y_val)`` from a seeded ``train_test_split``.
    """
    target = data[target_var].values
    target = target.reshape((len(target), 1))

    features = data[categorical_vars + numerical_vars].values

    if validation_split != 0:
        X_train, X_val, y_train, y_val = train_test_split(
            features, target, test_size=validation_split, random_state=42
        )
        return X_train, X_val , y_train , y_val

    return features, target

def rmse(y_true, y_pred):
    """Root mean squared error between predictions and targets (Keras backend ops)."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Integer-coded columns: each one is fed through its own embedding in create_model.
categorical_vars = ['building_id' , 'meter' , 'site_id' , 'primary_use' , 'hour' , 'dayofweek' , 'month']
# Standardised numerical columns: fed to the network as scalar inputs.
numerical_vars = ['square_feet' , 'air_temperature' , 'dew_temperature' , 'wind_speed']
# NOTE: `model` is reassigned by load_model() in the next cell.
model = create_model( train, categorical_vars ,  numerical_vars)

# validation_split = 0 returns the full data set as (X, y) with no hold-out.
X , y = split_data(train , categorical_vars , numerical_vars , validation_split = 0)

# One 1-D array per model input, ordered as categorical_vars + numerical_vars.
X = flatten_cols(X)

# The frame is no longer needed once the arrays are extracted.
del train
gc.collect()

526

In [None]:
# Load the saved model from Drive; this replaces the `model` built by create_model().
# The custom `rmse` function is passed via custom_objects so Keras can resolve it by name.
model = load_model('/content/drive/MyDrive/model_ann.h5' , 
                   custom_objects={ "rmse": rmse})

In [None]:
# Training is switched off: the condition is always false, so this body never runs
# and the model loaded from disk above is used instead. Change the condition to retrain.
if 1==0:
    # Keep only the best weights seen so far. No validation data is passed to fit(),
    # so 'loss' here is the training loss.
    checkpoint = ModelCheckpoint( '/content/drive/MyDrive/model_ann.h5', 
                                monitor='loss', 
                                verbose=1, 
                                save_best_only=True, 
                                mode='min')

    # Stop once the training "rmse" metric has not improved for 5 epochs.
    early_stopping = EarlyStopping(patience = 5 ,monitor = "rmse")
    callbacks_list = [checkpoint , early_stopping]

    # NOTE(review): newer Keras releases spell this optimizer argument `learning_rate`;
    # confirm against the installed version before re-enabling training.
    model.compile(optimizer = Adam(lr=1e-4), loss = 'mse',  metrics = [rmse])
    history_1 = model.fit(X, y, epochs = 100, batch_size = 2048, callbacks = callbacks_list)

In [None]:
# Training arrays are not needed for inference; free them before loading the test set.
del X,y
gc.collect()

2300

In [None]:
# Load the test-time CSVs; building metadata is re-read because it was deleted earlier.
building = pd.read_csv('/content/building_metadata.csv')
weather = pd.read_csv('/content/weather_test.csv')
test = pd.read_csv('/content/test.csv')

# Same join and column removal as for training. Unlike preprocess_data, rows with
# missing values are NOT dropped here, so NaNs can remain in the test features.
test = join_data(test , weather , building)
test = test.drop(cols_to_drop, axis= 1)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 8.96 MB
Decreased by 53.0%
Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%


In [None]:
# The source frames are merged into `test`; release their memory.
del building,weather
gc.collect()

53

In [None]:
# Reuse the encoder fitted on the training data so the integer codes match the model's.
test['primary_use'] = label_encoder.transform(test['primary_use'])

In [None]:
# Apply the (mean, std) pairs returned by preprocess_data to the test columns.
for col in cols_to_scale:
    ref_mean, ref_std = stats[col]
    test[col] = (test[col] - ref_mean) / ref_std

# Same time-feature helper as used for the training frame.
test = time_features(test)

In [None]:
# Keep the row identifiers and meter type aside before reducing `test` to model inputs.
row_id = test['row_id']
meter = test['meter']
test = test.drop(['row_id' , 'timestamp'] , axis=1)

In [None]:
# Same column order as used for training: categorical inputs first, then numerical.
co_vars = categorical_vars + numerical_vars
# From here on `test` is a 2-D NumPy array, no longer a DataFrame.
test = test[co_vars].values

In [None]:
# One 1-D array per model input; `test` is now a list of arrays.
test = flatten_cols(test)

In [None]:
# Debugging aid: display the per-input arrays to check their values and spot NaNs.
test

[array([0.000e+00, 1.000e+00, 2.000e+00, ..., 1.446e+03, 1.447e+03,
        1.448e+03]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([ 0.,  0.,  0., ..., 15., 15., 15.]),
 array([0., 0., 0., ..., 1., 4., 6.]),
 array([0., 0., 0., ..., 7., 7., 7.]),
 array([6., 6., 6., ..., 2., 2., 2.]),
 array([0., 0., 0., ..., 4., 4., 4.]),
 array([ 7432.        ,  2720.        ,  5376.        , ...,
        11265.        , 29775.        , 92270.99999999]),
 array([17.51616042, 17.51616042, 17.51616042, ...,         nan,
                nan,         nan]),
 array([11.52454959, 11.52454959, 11.52454959, ...,         nan,
                nan,         nan]),
 array([3.65176807, 3.65176807, 3.65176807, ...,        nan,        nan,
               nan])]

In [None]:
from tqdm import tqdm

# Predict in fixed-size chunks so that only `step_size` rows are passed to
# model.predict() at a time.
step_size = 10000
n_rows = test[0].shape[0]

# Collect the per-chunk predictions in a list and join them once at the end;
# growing an array with np.append inside the loop copies it on every iteration.
chunks = []
for start in tqdm(range(0, n_rows, step_size)):
    batch = [v[start:start + step_size] for v in test]
    # Invert the log(reading + 1) transform applied to the training target.
    chunks.append(np.expm1(model.predict(batch)).ravel())

# 1-D float64 array with one prediction per test row (empty when there are no rows).
res = np.concatenate(chunks).astype(np.float64) if chunks else np.array([])

10000
[[          inf]
 [2.3154978e+08]
 [2.0914330e+36]
 ...
 [          inf]
 [          inf]
 [          inf]]



overflow encountered in exp

