<a href="https://colab.research.google.com/github/Ditsuhi/Outlier_Detection/blob/main/IsolationForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import all required libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [None]:
# The final version of the preprocessed dataset is available at https://doi.org/10.5281/zenodo.6542247.
# Adjust the two paths below to match where the CSV files are stored.

data = pd.read_csv('/content/Madrid_Stations_2019.csv', header=None)
data_test = pd.read_csv('/content/Madrid_Stations_2020.csv', header=None)

# Fold each flat table into a 3-D array whose last two axes have sizes 24 and 20
# (presumably 24 time steps x 20 features per sample -- confirm against the dataset description).
fin_data_2019 = np.reshape(data.to_numpy(), (-1, 24, 20))
fin_data_2020 = np.reshape(data_test.to_numpy(), (-1, 24, 20))
fin_data_2020.shape

(4367, 24, 20)

In [None]:
# Keep only the features that go through outlier detection: the first 10 entries of the last axis.
# Wind direction is left out because it has already been one-hot encoded, i.e. it is binary data.

fin_data = fin_data_2019[..., :10]
fin_data_test = fin_data_2020[..., :10]

In [None]:
# Sanity check: the 2019 slice should keep the first two axes unchanged and have 10 entries on the last axis.
fin_data.shape

(4344, 24, 10)

In [None]:
# detect outliers using Isolation Forest

N_ESTIMATORS = 100    # number of isolation trees per model
CONTAMINATION = 0.05  # proportion of each series that is treated as outliers (5%)
BFILL_LIMIT = 3       # at most this many leading gaps are back-filled after interpolation
RANDOM_STATE = 42     # fixed seed so that the flagged outliers are reproducible


def _replace_outliers(values):
  """Flag outliers in one univariate series and fill them from their neighbours.

  A fresh IsolationForest is fitted on the series itself. Every observation the
  model labels as -1 is blanked out and then filled by linear interpolation;
  gaps that are still open afterwards (e.g. at the very start of the series,
  which interpolation does not reach) are back-filled, up to BFILL_LIMIT values.

  Parameters
  ----------
  values : array-like, 1-D
      Observations of a single feature across all samples.

  Returns
  -------
  numpy.ndarray
      1-D float array of the same length. It can still contain NaN when a run
      of unfilled values is longer than BFILL_LIMIT.
  """
  series = pd.Series(np.asarray(values, dtype=float))
  column = series.to_numpy().reshape(-1, 1)  # scikit-learn expects a 2-D (n_samples, 1) input

  # max_samples is the length of this series, so every tree really sees all the
  # samples regardless of how many rows the dataset has.
  # max_features=1.0 uses the full (single-column) feature set.
  model = IsolationForest(
    n_estimators=N_ESTIMATORS,
    max_samples=len(series),
    contamination=CONTAMINATION,
    max_features=1.0,
    random_state=RANDOM_STATE,
  )
  model.fit(column)
  labels = model.predict(column)  # -1 marks an outlier, 1 an inlier

  series.loc[labels == -1] = np.nan
  series = series.interpolate()
  if series.isna().any():
    series = series.bfill(limit=BFILL_LIMIT)
  return series.to_numpy()


# Every (second-axis index, feature) pair is cleaned as an independent series.
n_steps, n_features = fin_data.shape[1], fin_data.shape[2]

cleaned_series = []
cleaned_series_test = []

for i in range(n_steps):
  for j in range(n_features):
    cleaned_series.append(_replace_outliers(fin_data[:, i, j]))
    cleaned_series_test.append(_replace_outliers(fin_data_test[:, i, j]))

# Flat layout expected downstream: the series are laid end to end, ordered by i first and j second.
# Joining once at the end avoids re-copying the growing array on every iteration.
combined_array = np.concatenate(cleaned_series) if cleaned_series else np.array([])
combined_array_test = np.concatenate(cleaned_series_test) if cleaned_series_test else np.array([])


In [None]:
# combine two separated datasets: the part that went through outlier detection and the wind direction part that went through One Hot Encoder

# Take every dimension from the source arrays instead of hard-coding it, so the cell
# keeps working (rather than silently scrambling the layout) when the input files change.
n_samples_2019 = fin_data_2019.shape[0]
n_samples_2020 = fin_data_2020.shape[0]
n_steps = fin_data_2019.shape[1]
n_cleaned_features = fin_data.shape[2]  # number of features that went through outlier detection

assert combined_array.size == n_samples_2019 * n_steps * n_cleaned_features, \
  "combined_array does not match the shape of the 2019 data"
assert combined_array_test.size == n_samples_2020 * n_steps * n_cleaned_features, \
  "combined_array_test does not match the shape of the 2020 data"

# The flat arrays hold one full series (all samples) per (step, feature) pair, laid end to end.
# Reshape to one row per series, then transpose so that samples come first again.
combined_array_resh = combined_array.reshape(-1, n_samples_2019)
combined_array_test_resh = combined_array_test.reshape(-1, n_samples_2020)
combined_array_resh_trans = np.transpose(combined_array_resh).reshape(-1, n_steps, n_cleaned_features)
combined_array_test_resh_trans = np.transpose(combined_array_test_resh).reshape(-1, n_steps, n_cleaned_features)

# Append the untouched columns 12..19 of the raw arrays behind the cleaned features.
# NOTE(review): raw columns 10 and 11 are neither cleaned nor copied here, so the result has
# 18 entries on the last axis instead of 20 -- confirm that dropping them is intentional.
data_2019_outlier = np.concatenate((combined_array_resh_trans, fin_data_2019[:, :, 12:20]), axis=2)
data_2020_outlier = np.concatenate((combined_array_test_resh_trans, fin_data_2020[:, :, 12:20]), axis=2)
data_2020_outlier_resh = data_2020_outlier.reshape(-1)
data_2019_outlier_resh = data_2019_outlier.reshape(-1)

# save the results as csv files (the arrays are flattened, so each file holds one value per line)
np.savetxt('Madrid_Stations_2020_WithoutOutlier_iTree.csv', data_2020_outlier_resh, delimiter=',')
np.savetxt('Madrid_Stations_2019_WithoutOutlier_iTree.csv', data_2019_outlier_resh, delimiter=',')

In [None]:
# Check whether any NaN value is left in the flattened 2019 dataset after outlier detection.
# NOTE(review): only the 2019 array is inspected here; `data_2020_outlier_resh` is not checked -- confirm whether it should be.

np.isnan(data_2019_outlier_resh).any()

In [None]:
# If any NaN values remain, list their positions as index triples into the 3-D 2019 array;
# an empty result means no value is missing.

np.argwhere(np.isnan(data_2019_outlier))

array([], shape=(0, 3), dtype=int64)