# **Autoencoder Model - Anomaly Detection**
### Group 8

### **__Comments__**

- The data-loading step (the tensors fed to the `DataLoader`) still needs to be fixed

- **Model without finetuning**

- Model was a basic autoencoder architecture (could be tuned further)

- Evaluation methods need to be explored further

- Batch size was set to 64 because it is a common default; it should be revisited

- We need to create the (normalized) score values for the logs

In [16]:
# Libraries
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset

# Random state/seed
random_state = 42

# Apply the seed to the libraries that draw random numbers in this notebook,
# so weight initialisation and DataLoader shuffling repeat on a fresh kernel.
np.random.seed(random_state)
torch.manual_seed(random_state)

In [17]:
# Select the compute device: CUDA when PyTorch reports a GPU, otherwise the CPU.
# NOTE(review): this line only picks the device; confirm that the model and the
# tensors are actually moved to it with `.to(device)` before relying on GPU training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
# Load the json file
# The context manager closes the handle as soon as parsing finishes, and the
# encoding is pinned so the result does not depend on the machine's locale.
with open("/content/features.json", encoding="utf-8") as features:
    data_dict = json.load(features)

**Data-frame and exploration**

In [19]:
def time_diff(features):
    """Annotate every log entry with the time elapsed since the previous entry.

    ``features`` maps a key to a list of log dicts. Each dict's ``"log_time"``
    is read at indices 0, 1 and 2, which are weighted by 3600, 60 and 1
    respectively (i.e. an hour/minute/second layout is assumed).

    Every dict receives a ``"time_diff"`` key holding:

    * ``-1`` for the first entry of a list,
    * ``-1`` when component 0 is smaller than the previous entry's component 0,
    * ``-1`` when the weighted difference is negative,
    * the weighted difference otherwise.

    The mapping is modified in place and returned.
    """
    for entries in features.values():
        previous = None
        for entry in entries:
            current = entry["log_time"]
            if previous is None or (current[0] - previous[0]) < 0:
                gap = -1
            else:
                # Plain weighted difference; borrowing between components
                # does not change this sum.
                elapsed = (
                    (current[2] - previous[2])
                    + (current[1] - previous[1]) * 60
                    + (current[0] - previous[0]) * 3600
                )
                gap = -1 if elapsed < 0 else elapsed
            entry["time_diff"] = gap
            previous = current
    return features

In [20]:
# Add a "time_diff" field to every log entry; time_diff() updates the dict in place and returns it
data_dict=time_diff(data_dict)

In [21]:
# Flatten the per-IP log lists into one (ip, log) pair per log entry, keeping
# the dictionary's iteration order, then pull each field into its own list.
flat_logs = [(ip, log) for ip, logs in data_dict.items() for log in logs]

l_ip = [ip for ip, _ in flat_logs]
l_lat = [log["coords"][0] for _, log in flat_logs]
l_lon = [log["coords"][1] for _, log in flat_logs]
l_time = [log["log_time"] for _, log in flat_logs]
l_time_diff = [log["time_diff"] for _, log in flat_logs]
l_inst = [log["instruction"] for _, log in flat_logs]
l_url = [log["url"] for _, log in flat_logs]
l_response = [log["response"] for _, log in flat_logs]
l_weight = [log["response_weight"] for _, log in flat_logs]

In [22]:
# Keep only the first component of each log time. Entries that are already a
# scalar pass through unchanged, so re-running this cell does not raise.
l_time = [t[0] if isinstance(t, (list, tuple)) else t for t in l_time]

In [23]:
# Wrap the collected columns in pandas Series
l_ip, l_lat, l_lon, l_time = (
    pd.Series(column) for column in (l_ip, l_lat, l_lon, l_time)
)
l_inst, l_url, l_response, l_weight = (
    pd.Series(column) for column in (l_inst, l_url, l_response, l_weight)
)

# Some data exploration: first value of every column, then every dtype
inspected = (l_ip, l_lat, l_lon, l_time, l_inst, l_url, l_response, l_weight)

for series in inspected:
    print(series[0])

for series in inspected:
    print(series.dtype)

47.76.35.19
22.3193
114.169
6
HEAD
/fr/pag492/explora-platges-i-ports-2/id12/les-anquines.htm HTTP/1.1
301
4840
object
float64
float64
int64
object
object
object
object


In [24]:
# Data-frame creation: one column per collected field, one row per log entry
column_names = ["IP", "Lat", "Lon", "Time", "Time Diff", "Instruction", "URL", "Response", "Weight"]
column_values = [l_ip, l_lat, l_lon, l_time, l_time_diff, l_inst, l_url, l_response, l_weight]
frame = dict(zip(column_names, column_values))
data = pd.DataFrame(frame)

In [25]:
# Preview the assembled frame
data

Unnamed: 0,IP,Lat,Lon,Time,Time Diff,Instruction,URL,Response,Weight
0,47.76.35.19,22.3193,114.1690,6,-1,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,301,4840
1,47.76.35.19,22.3193,114.1690,6,1,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,200,5223
2,47.76.35.19,22.3193,114.1690,6,5,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,301,4840
3,47.76.35.19,22.3193,114.1690,6,1,HEAD,/fr/pag492/explora-platges-i-ports-2/id12/les-...,200,5260
4,47.76.35.19,22.3193,114.1690,6,1,GET,/fr/pag492/explora-platges-i-ports-2/id12/les-...,301,661
...,...,...,...,...,...,...,...,...,...
1077388,64.124.8.96,33.2749,-111.8870,2,1,GET,/media/site1/cache/images/port-aiguadolc-1.jpg...,200,278950
1077389,98.64.96.26,52.3667,4.9000,11,-1,GET,/nl/pl330/nieuws/kalender/id1113/he-mort-el-po...,404,7423
1077390,98.64.96.26,52.3667,4.9000,11,27,GET,/nl/pl330/nieuws/kalender/id1113/he-mort-el-po...,404,2815
1077391,90.74.194.156,39.5638,-0.3368,11,-1,GET,/media/site1/cache/images/dji-0077-2-redim-enc...,200,227390


**Model Pipeline & Split**

In [None]:
# Define categorical and numerical features
categorical_features = ["Instruction", "URL", "Response"]
numerical_features = ["Time", "Weight", "Time Diff", "Lat", "Lon"]

# Preprocessing: numeric columns are rescaled to the [0, 1] range seen in the
# training rows and categorical columns are one-hot encoded, so every input
# feature lives in the range the autoencoder's sigmoid output can reproduce.
# NOTE(review): "Weight" has dtype object in the dtype printout above; the
# scaler is expected to cast it to float and fail on non-numeric values - confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Split the raw rows first so the preprocessor is fitted on training rows only
# (no test-set information leaks into the scaling or the category vocabulary).
train_frame, test_frame = train_test_split(data, test_size=0.2, random_state=random_state)

# Fit on the training rows, then apply the fitted transform to the test rows.
# Categories unseen during fitting are encoded as all zeros (handle_unknown='ignore').
train_features = preprocessor.fit_transform(train_frame)
test_features = preprocessor.transform(test_frame)


def _to_float_tensor(matrix):
    """Return ``matrix`` (sparse or dense) as a dense float32 CPU tensor."""
    # Cast before densifying so the dense copy is float32 rather than float64.
    compact = matrix.astype(np.float32)
    dense = compact.toarray() if hasattr(compact, "toarray") else np.asarray(compact)
    return torch.from_numpy(dense)


# Convert data to PyTorch tensors
# NOTE(review): this materialises every one-hot column densely; with a
# high-cardinality "URL" column that may not fit in memory - confirm.
train_data_tensor = _to_float_tensor(train_features)
test_data_tensor = _to_float_tensor(test_features)

**Model**

In [28]:
# Define the autoencoder architecture (very simple just basic model)
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.

    A ReLU follows every hidden layer except the 16-unit bottleneck, and the
    last layer is passed through a sigmoid, so reconstructions lie between 0 and 1.
    """

    def __init__(self, input_dim):
        """Build the encoder and decoder stacks for ``input_dim`` input features."""
        super().__init__()

        # Layers are created encoder-first so parameter initialisation order
        # (and therefore seeded weights) is unchanged.
        encoder_layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
        ]
        decoder_layers = [
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the 16-unit code and decode it back to ``input_dim`` features."""
        return self.decoder(self.encoder(x))

**Model Initialization**

In [None]:
# Initialize the autoencoder model
# The network width is the number of columns of the preprocessed training
# tensor (`train_data_tensor`, built in the split cell).
input_dim = train_data_tensor.shape[1]
autoencoder = Autoencoder(input_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

**Training**

In [None]:
# Train the model
num_epochs = 10
batch_size = 64  # conventional starting value, not tuned

# Mini-batch iterator over the training rows. A 2-D tensor can be used directly
# as a map-style dataset: indexing yields one row and len() the row count.
train_loader = DataLoader(train_data_tensor, batch_size=batch_size, shuffle=True)

# Batches are sent to wherever the model's parameters currently live.
model_device = next(autoencoder.parameters()).device

autoencoder.train()
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of the per-batch mean losses for this epoch
    # The loop variable is `batch`, not `data`, so the DataFrame named `data`
    # is not overwritten by a tensor.
    for batch in train_loader:
        batch = batch.to(model_device)
        optimizer.zero_grad()
        outputs = autoencoder(batch)
        loss = criterion(outputs, batch)  # reconstruction target is the input itself
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")

**Evaluation & Visualization**

In [None]:
# Evaluate the model (we can use reconstruction error as anomaly score, but we still have to normalize it etc )
autoencoder.eval()  # inference mode for any layers that behave differently in training
model_device = next(autoencoder.parameters()).device
with torch.no_grad():
    # Run the forward pass on the model's device, then bring the result back to
    # the CPU so it can be compared with `test_data_tensor` and converted to NumPy.
    reconstructions = autoencoder(test_data_tensor.to(model_device)).cpu()
    reconstruction_loss = criterion(reconstructions, test_data_tensor)
    print(f"Reconstruction Loss: {reconstruction_loss.item()}")

In [None]:
# Some visualization

# Per-row reconstruction error: mean squared difference across the feature axis
squared_diff = (reconstructions - test_data_tensor) ** 2
reconstruction_errors = squared_diff.mean(dim=1).detach().numpy()

# Histogram of the per-row errors with the mean marked as a dashed line
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(reconstruction_errors, bins=50, alpha=0.5, color='blue', label="Reconstruction Errors")
ax.axvline(np.mean(reconstruction_errors), color='red', linestyle='dashed', linewidth=1, label="Mean Error")
ax.set_xlabel("Reconstruction Error")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Log Reconstruction Errors")
ax.legend()
ax.grid(True)
plt.show()