# Tutorial 1 - Autoencoder

We will build an ANOMALY detector. We will use the airbnb data set

**The unit of analysis is a single housing district**

**We will train an autoencoder on the `inland` districts and consider them the "normal" data. Then, we will reconstruct the `near ocean` districts to see if we can identify them as anomalies.**

I already created two files:<br>
`inland.csv`: includes only the inland districts<br>
`near ocean.csv`: includes only the near ocean districts

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

random_state=42

# Get the data

In [2]:
inland = pd.read_csv("inland.csv")

ocean = pd.read_csv("near ocean.csv")


# Data Prep

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

##  Identify the numerical and categorical columns

In [4]:
inland.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
median_house_value      int64
dtype: object

In [5]:
# Identify the numerical columns
numeric_columns = inland.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = inland.select_dtypes('object').columns.to_list()

In [6]:
numeric_columns

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value']

In [7]:
categorical_columns

[]

# Pipeline

In [8]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [9]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns)],
        remainder='passthrough')
    
#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for NORMAL data

In [10]:
#Fit and transform the train data
normal_x = preprocessor.fit_transform(inland)

normal_x

array([[-1.15057049,  0.42921038,  1.80810087, ..., -0.68697691,
         1.34385936,  2.72553739],
       [-1.14004863,  0.43866257, -0.52191175, ...,  1.62548807,
         2.28920657,  2.8312478 ],
       [-1.12952677,  0.44811477, -0.10583807, ..., -0.48811002,
         2.86916053,  3.03409751],
       ...,
       [-0.78230538,  1.27518142, -0.60512649, ..., -0.11332242,
        -1.04984216, -0.46434573],
       [-0.83491468,  1.27518142, -0.52191175, ..., -0.32748677,
        -0.93351742, -0.57291318],
       [-0.79282724,  1.24682485, -0.68834122, ...,  0.1339864 ,
        -0.57076791, -0.50577278]])

In [11]:
normal_x.shape

(6551, 9)

# Tranform: transform() for ANOMALOUS DATA

In [12]:
# Transform the test data
anomaly_x = preprocessor.transform(ocean)

anomaly_x

array([[-2.33427979,  2.39526597, -0.68834122, ..., -0.20765672,
         2.99327735,  0.87389095],
       [-2.40267188,  2.39526597, -0.43869701, ..., -0.27904483,
         1.21073293,  0.17820216],
       [-2.36584537,  2.37163549, -1.10441491, ..., -0.20000799,
         2.94464637, -0.21435489],
       ...,
       [ 0.83806112, -1.39978844,  2.22417455, ...,  0.09574277,
         5.04642289,  6.54439745],
       [ 0.83280019, -1.39978844,  2.30738929, ..., -0.46006469,
         4.15993377,  8.90831083],
       [ 0.83280019, -1.39978844,  1.39202719, ...,  0.56231509,
         4.96815415,  6.4691145 ]])

In [13]:
anomaly_x.shape

(350, 9)

# Autoencoder

In [14]:
import tensorflow as tf
from tensorflow import keras

In [20]:
model = keras.models.Sequential()

#Encoder
model.add(keras.layers.InputLayer(input_shape=(9,)))
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(7, activation='relu'))

#Decoder
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(9, activation=None))

model.summary()

In [21]:
adam = keras.optimizers.Adam(learning_rate=0.001)


model.compile(loss='mse', optimizer=adam, metrics=['mean_squared_error'])

In [22]:
from tensorflow.keras.callbacks import EarlyStopping

earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')

callback = [earlystop]

In [23]:
# Be careful: both input and output are "housing_normal_std" while training the autoencoder

model.fit(normal_x, normal_x, 
          validation_data = (normal_x, normal_x),
          epochs=100, batch_size=100, callbacks=callback)

Epoch 1/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 1.0848 - mean_squared_error: 1.0848 - val_loss: 0.9457 - val_mean_squared_error: 0.9477
Epoch 2/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.9320 - mean_squared_error: 0.9321 - val_loss: 0.7969 - val_mean_squared_error: 0.7981
Epoch 3/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.7534 - mean_squared_error: 0.7535 - val_loss: 0.5902 - val_mean_squared_error: 0.5907
Epoch 4/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.5525 - mean_squared_error: 0.5525 - val_loss: 0.4467 - val_mean_squared_error: 0.4470
Epoch 5/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.4223 - mean_squared_error: 0.4222 - val_loss: 0.3808 - val_mean_squared_error: 0.3810
Epoch 6/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.

[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0307 - mean_squared_error: 0.0307 - val_loss: 0.0322 - val_mean_squared_error: 0.0324
Epoch 47/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0327 - mean_squared_error: 0.0326 - val_loss: 0.0317 - val_mean_squared_error: 0.0318
Epoch 48/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0323 - mean_squared_error: 0.0323 - val_loss: 0.0315 - val_mean_squared_error: 0.0317
Epoch 49/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0312 - mean_squared_error: 0.0312 - val_loss: 0.0307 - val_mean_squared_error: 0.0309
Epoch 50/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0310 - mean_squared_error: 0.0310 - val_loss: 0.0304 - val_mean_squared_error: 0.0306
Epoch 51/100
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0331 - 

<keras.src.callbacks.history.History at 0x21484f46cd0>

### Check the average MSE on the "normal" data

In [24]:
model.evaluate(normal_x, normal_x)

[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0236 - mean_squared_error: 0.0236


[0.025480037555098534, 0.02550158090889454]

In [25]:
#Multiply by 100 to make sense of the error term:

model.evaluate(normal_x, normal_x)[0]*100

[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0236 - mean_squared_error: 0.0236


2.5480037555098534

### Check the average MSE on the "anomalous" data

In [26]:
model.evaluate(anomaly_x, anomaly_x)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0532 - mean_squared_error: 0.0532 


[0.07203206419944763, 0.07191478461027145]

In [27]:
#Multiply by 100 to make sense of the error term:

model.evaluate(anomaly_x, anomaly_x)[0]*100

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0532 - mean_squared_error: 0.0532 


7.203206419944763

## Predict first 20 in normal data

In [28]:
from sklearn.metrics import mean_squared_error

for i in range(0,20):
    prediction = model.predict(normal_x[i:i+1])
    print((mean_squared_error(normal_x[i:i+1], prediction))*100)

    
#Error terms are multiplied by 100 to make sense of the numbers

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
0.9422252017562083
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
1.962576843684516
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
0.9695959210294126
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
1.2241054810177783
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
1.2099475659370762
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
1.807564880797434
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
1.5128933763673726
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
3.306577874140541
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
2.0742214547792748
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
1.1158454367336055
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
6.338875467136757
[1

## Predict first 20 in anomalous data


In [29]:
for i in range(0,20):
    prediction = model.predict(anomaly_x[i:i+1])
    print((mean_squared_error(anomaly_x[i:i+1], prediction))*100)

    
#Error terms are multiplied by 100 to make sense of the numbers

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
2.0557721829410305
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
1.5690197195671187
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
10.461368571586888
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
2.03368230730479
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
1.5598813868932455
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
1.6486709270473885
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
1.38973560036345
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
3.2349425644504453
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
3.00374718450898
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
1.0659100961599544
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
1.1835898827916116
[1m1

In [None]:
new_data= [()] # data output from tutorial 2

In [None]:
model.predict(new_data);