In [74]:
import pandas as pd
df = pd.read_csv('train_features.csv')
df.head()

Unnamed: 0,building_id,timestamp,meter_reading,anomaly,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,gte_meter,gte_meter_hour,gte_meter_weekday,gte_meter_month,gte_meter_building_id,gte_meter_primary_use,gte_meter_site_id,gte_meter_building_id_hour,gte_meter_building_id_weekday,gte_meter_building_id_month
0,1,2016-01-01 00:00:00,,0,0,Education,2720,104,0,19.4,...,4.116,3.981,4.146,4.12,3.569,4.489,4.079,3.554,3.538,3.845
1,32,2016-01-01 00:00:00,,0,0,Office,48392,105,0,19.4,...,4.116,3.981,4.146,4.12,4.513,4.211,4.079,4.128,4.579,4.317
2,41,2016-01-01 00:00:00,,0,0,Office,93860,68,0,19.4,...,4.116,3.981,4.146,4.12,4.108,4.211,4.079,4.103,4.14,4.114
3,55,2016-01-01 00:00:00,,0,0,Office,16726,111,0,19.4,...,4.116,3.981,4.146,4.12,3.506,4.211,4.079,3.308,3.565,3.813
4,69,2016-01-01 00:00:00,,0,0,Parking,387638,100,0,19.4,...,4.116,3.981,4.146,4.12,3.035,3.623,4.079,3.892,3.09,3.577


In [75]:
#Creating NN on training data
df['meter_reading'].fillna((df['meter_reading'].mean()), inplace=True)

In [76]:
df.head()

Unnamed: 0,building_id,timestamp,meter_reading,anomaly,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,gte_meter,gte_meter_hour,gte_meter_weekday,gte_meter_month,gte_meter_building_id,gte_meter_primary_use,gte_meter_site_id,gte_meter_building_id_hour,gte_meter_building_id_weekday,gte_meter_building_id_month
0,1,2016-01-01 00:00:00,179.901838,0,0,Education,2720,104,0,19.4,...,4.116,3.981,4.146,4.12,3.569,4.489,4.079,3.554,3.538,3.845
1,32,2016-01-01 00:00:00,179.901838,0,0,Office,48392,105,0,19.4,...,4.116,3.981,4.146,4.12,4.513,4.211,4.079,4.128,4.579,4.317
2,41,2016-01-01 00:00:00,179.901838,0,0,Office,93860,68,0,19.4,...,4.116,3.981,4.146,4.12,4.108,4.211,4.079,4.103,4.14,4.114
3,55,2016-01-01 00:00:00,179.901838,0,0,Office,16726,111,0,19.4,...,4.116,3.981,4.146,4.12,3.506,4.211,4.079,3.308,3.565,3.813
4,69,2016-01-01 00:00:00,179.901838,0,0,Parking,387638,100,0,19.4,...,4.116,3.981,4.146,4.12,3.035,3.623,4.079,3.892,3.09,3.577


In [77]:
#Classes unbalanced
count_norm = len(df[df['anomaly']==0])
count_anom = len(df[df['anomaly']==1])
pct_norm = count_norm/(count_norm+count_anom)
print('percentage of normal instances is ', pct_norm*100)
pct_anom = count_anom/(count_anom+count_norm)
print('percentage of anomaly instances is ', pct_anom*100)

percentage of normal instances is  97.86818360051535
percentage of anomaly instances is  2.131816399484651


In [78]:
#Convert categorical dtypes
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])

In [79]:
cols = df.columns.tolist()
y = df.iloc[:,3:4]
y.shape

(1749494, 1)

In [80]:
df.drop(['anomaly'], axis=1, inplace=True)
x = df.iloc[:,0:56]
x.shape

(1749494, 56)

In [81]:
#Oversampling anomaly instances to create more accurate thresholds for network

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [82]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)
columns = x_train.columns

In [83]:

#Oversampling with smote
oversample = SMOTE()
x1, y1 = oversample.fit_resample(x_train, y_train)
counter = Counter(y1)
print(counter)

Counter({'anomaly': 1})


In [84]:
y1.value_counts()

anomaly
0          1147121
1          1147121
dtype: int64

In [85]:
#Scaling feature data for NN

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [86]:
#Scale oversampled data

X_train = scaler.fit_transform(x1)
X_test = scaler.fit_transform(x_test)

In [87]:
from tensorflow import keras

In [88]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU

In [89]:
from keras import callbacks

In [90]:
#Early stopping callback to help overfit 
earlystopping = callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=5, restore_best_weights=True)

In [91]:
#Neural network for anomalies in training data - simple to start
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.25))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 1024)              58368     
                                                                 
 dropout_9 (Dropout)         (None, 1024)              0         
                                                                 
 dense_19 (Dense)            (None, 512)               524800    
                                                                 
 dropout_10 (Dropout)        (None, 512)               0         
                                                                 
 dense_20 (Dense)            (None, 256)               131328    
                                                                 
 dropout_11 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 1)                

In [95]:
hist = model.fit(X_train, y1, validation_data=(X_test, y_test), epochs=100, batch_size=len(X_train[1]), callbacks=[earlystopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


In [113]:
from sklearn.metrics import confusion_matrix

y_predicted = model.predict(X_test) > 0.5
mat = confusion_matrix(y_test, y_predicted)
labels = ['Normal', 'Anomaly']



In [106]:
mat

array([[555968,   9109],
       [  1200,  11057]], dtype=int64)

In [98]:
#Saving model
from pathlib import Path
model_structure = model.to_json()
f = Path('model_structure_anom.json')
f.write_text(model_structure)

#Saving weights
model.save_weights('model_weights_anom.h5')

In [99]:
#Importing test data

import pandas as pd
df = pd.read_csv('test_features.csv')
df.head()

Unnamed: 0,row_id,building_id,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,gte_meter,gte_meter_hour,gte_meter_weekday,gte_meter_month,gte_meter_building_id,gte_meter_primary_use,gte_meter_site_id,gte_meter_building_id_hour,gte_meter_building_id_weekday,gte_meter_building_id_month
0,0,18,2016-01-01 00:00:00,,0,Education,111891,96,0,19.4,...,4.116,3.981,4.146,4.12,6.389,4.489,4.079,6.37,6.392,5.254
1,1,19,2016-01-01 00:00:00,,0,Office,18717,104,0,19.4,...,4.116,3.981,4.146,4.12,4.098,4.211,4.079,4.031,4.125,4.109
2,2,26,2016-01-01 00:00:00,,0,Office,26953,105,0,19.4,...,4.116,3.981,4.146,4.12,3.785,4.211,4.079,3.046,3.804,3.953
3,3,38,2016-01-01 00:00:00,,0,Office,12769,113,0,19.4,...,4.116,3.981,4.146,4.12,4.961,4.211,4.079,4.949,4.964,4.541
4,4,39,2016-01-01 00:00:00,,0,Office,64619,69,0,19.4,...,4.116,3.981,4.146,4.12,4.584,4.211,4.079,4.472,4.6,4.352


In [100]:
df['meter_reading'].fillna((df['meter_reading'].mean()), inplace=True)

In [101]:
cat_columns = df.select_dtypes(['object']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])

In [102]:
x = df.iloc[:,0:56]
x.shape

(1800567, 56)

In [110]:
x

Unnamed: 0,row_id,building_id,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,gte_site_id,gte_meter,gte_meter_hour,gte_meter_weekday,gte_meter_month,gte_meter_building_id,gte_meter_primary_use,gte_meter_site_id,gte_meter_building_id_hour,gte_meter_building_id_weekday
0,0,18,0,174.987865,0,0,111891,96,0,19.4,...,4.703,4.116,3.981,4.146,4.120,6.389,4.489,4.079,6.370,6.392
1,1,19,0,174.987865,0,1,18717,104,0,19.4,...,4.703,4.116,3.981,4.146,4.120,4.098,4.211,4.079,4.031,4.125
2,2,26,0,174.987865,0,1,26953,105,0,19.4,...,4.703,4.116,3.981,4.146,4.120,3.785,4.211,4.079,3.046,3.804
3,3,38,0,174.987865,0,1,12769,113,0,19.4,...,4.703,4.116,3.981,4.146,4.120,4.961,4.211,4.079,4.949,4.964
4,4,39,0,174.987865,0,1,64619,69,0,19.4,...,4.703,4.116,3.981,4.146,4.120,4.584,4.211,4.079,4.472,4.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1800562,1800562,1320,8783,74.552000,14,5,34565,255,0,-29.9,...,5.148,4.116,4.009,4.012,4.056,4.810,3.532,4.710,4.987,4.751
1800563,1800563,1321,8783,584.000000,14,5,370024,255,0,-29.9,...,5.148,4.116,4.009,4.012,4.056,6.013,3.532,4.710,6.142,5.958
1800564,1800564,1322,8783,131.596000,14,5,166489,255,0,-29.9,...,5.148,4.116,4.009,4.012,4.056,4.824,3.532,4.710,4.823,4.803
1800565,1800565,1384,8783,1.200000,15,0,56969,67,0,-29.9,...,4.331,4.116,4.009,4.012,4.056,0.848,4.489,4.188,0.810,0.817


In [103]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [107]:
X_pred = scaler.fit_transform(x)

In [109]:
X_pred

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        6.90686275e-01, 8.08940159e-01, 8.15058480e-01],
       [5.55380919e-07, 7.10732054e-04, 0.00000000e+00, ...,
        6.90686275e-01, 4.71665465e-01, 4.83625731e-01],
       [1.11076184e-06, 5.68585643e-03, 0.00000000e+00, ...,
        6.90686275e-01, 3.29632300e-01, 4.36695906e-01],
       ...,
       [9.99998889e-01, 9.26794598e-01, 1.00000000e+00, ...,
        1.00000000e+00, 5.85868782e-01, 5.82748538e-01],
       [9.99999445e-01, 9.70859986e-01, 1.00000000e+00, ...,
        7.44117647e-01, 7.20980534e-03, 0.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        7.44117647e-01, 2.93294881e-01, 4.37573099e-01]])

In [206]:
predictions = (model.predict(X_pred) > 0.5).astype('int32')
print(predictions)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [207]:
predictions.shape

(1800567, 1)

In [211]:
#Converting predictions and rows to array for concatentation
row_id = df.iloc[:,0:1].values

In [215]:
predictions

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [216]:
row_id

array([[      0],
       [      1],
       [      2],
       ...,
       [1800564],
       [1800565],
       [1800566]], dtype=int64)

In [217]:
#Concatenating the array
import numpy as np
pred1 = np.concatenate((row_id, predictions), axis=1)

In [218]:
pred1

array([[      0,       0],
       [      1,       0],
       [      2,       0],
       ...,
       [1800564,       0],
       [1800565,       0],
       [1800566,       0]], dtype=int64)

In [219]:
pred_df = pd.DataFrame(pred1, columns=['row_id', 'anomaly'])

In [220]:
#Checking results
pred_df

Unnamed: 0,row_id,anomaly
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1800562,1800562,0
1800563,1800563,0
1800564,1800564,0
1800565,1800565,0


In [221]:
#Converting to csv
pred_df.to_csv('Submission_B1', index=False)

In [222]:
pred_df['anomaly'].value_counts()

0    1797412
1       3155
Name: anomaly, dtype: int64