# **Autoencoder Model - Anomaly Detection**
### Group 8

### **__Comments__**

- Need to fix the DataLoading tensors

- **Model without finetuning**

- Model was a basic autoencoder architecture (could be tuned further)

- Evaluation methods need to be explored further

- Batch size was set to 64 because it is a common default; it should be tuned

- We need to create the (normalized) score values for the logs

In [12]:
# Libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

In [13]:
# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [15]:
# Load the json file
# The context manager closes the handle even when json.load raises, and the
# explicit encoding keeps the read independent of the platform default.
# NOTE(review): the saved output of this cell is a JSONDecodeError
# ("Unterminated string ..."), which presumably means the uploaded file was
# truncated - verify the upload before re-running.
with open("/content/features.json", encoding="utf-8") as features:
    data_dict = json.load(features)

JSONDecodeError: Unterminated string starting at: line 1 column 57671678 (char 57671677)

**Data-frame and exploration**

In [None]:
def time_diff(features):
    """Annotate every log entry with the time elapsed since the previous one.

    ``features`` maps a key (an IP in this notebook) to an ordered list of
    log dicts. Each dict's ``"log_time"`` is indexed at positions 0, 1 and 2,
    which are weighted by 3600, 60 and 1, i.e. treated as hours, minutes and
    seconds.

    A ``"time_diff"`` key is written into every log dict **in place**:

    * ``-1`` for the first entry of each list,
    * ``-1`` when the entry is earlier than its predecessor (negative gap),
    * otherwise the gap to the predecessor, in seconds.

    Returns the same ``features`` object that was passed in.
    """
    for logs in features.values():
        previous = None
        for entry in logs:
            current = list(entry["log_time"])
            if previous is None:
                # First entry of this key: there is nothing to compare against.
                gap = -1
            elif current[0] < previous[0]:
                # Hour went backwards: flagged without reading minutes/seconds.
                gap = -1
            else:
                elapsed = (
                    (current[0] - previous[0]) * 3600
                    + (current[1] - previous[1]) * 60
                    + (current[2] - previous[2])
                )
                gap = -1 if elapsed < 0 else elapsed
            entry["time_diff"] = gap
            previous = entry["log_time"]
    return features

In [None]:
# Add the "time_diff" field to every log entry (time_diff also updates the
# dictionaries inside data_dict in place and returns the same object).
data_dict = time_diff(data_dict)

In [None]:
# One flat list per future data-frame column; every log of every IP
# contributes exactly one value to each list.
l_ip, l_lon, l_lat = [], [], []
l_time, l_time_diff = [], []
l_inst, l_url = [], []
l_response, l_weight = [], []

for ip_address, ip_logs in data_dict.items():
    for entry in ip_logs:
        l_ip.append(ip_address)
        # coords[0] goes to the latitude list, coords[1] to the longitude list.
        l_lat.append(entry["coords"][0])
        l_lon.append(entry["coords"][1])
        l_time.append(entry["log_time"])
        l_time_diff.append(entry["time_diff"])
        l_inst.append(entry["instruction"])
        l_url.append(entry["url"])
        l_response.append(entry["response"])
        l_weight.append(entry["response_weight"])

In [None]:
# Keep only the first component of every log_time entry (the hour - it is the
# component time_diff weights by 3600).
# The isinstance guard makes the cell safe to re-run: values that were already
# reduced to a scalar are passed through instead of raising
# "TypeError: 'int' object is not subscriptable".
l_time = [
    stamp[0] if isinstance(stamp, (list, tuple)) else stamp
    for stamp in l_time
]

In [None]:
# Wrap the per-log lists in pandas Series (l_time_diff is left as a plain list).
l_ip, l_lat, l_lon, l_time = (
    pd.Series(values) for values in (l_ip, l_lat, l_lon, l_time)
)
l_inst, l_url, l_response, l_weight = (
    pd.Series(values) for values in (l_inst, l_url, l_response, l_weight)
)

_explored = (l_ip, l_lat, l_lon, l_time, l_inst, l_url, l_response, l_weight)

# Some data exploration: first value of every series.
# Output recorded by the authors, in print order:
#   35.170.74.25
#   22.3193
#   40.7128
#   18
#   HEAD
#   /fr/que-faire/que-fer-sitges-de-nit.htm HTTP/2.0
#   200
#   9037
for series in _explored:
    print(series[0])

# dtype of every series.
# Output recorded by the authors, in print order:
#   object, float64, float64, int64, object, object, object, object
for series in _explored:
    print(series.dtype)

In [None]:
# Data-frame creation: pair each column label with its values, in display order.
_column_names = ["IP", "Lat", "Lon", "Time", "Time Diff", "Instruction", "URL", "Response", "Weight"]
_column_values = [l_ip, l_lat, l_lon, l_time, l_time_diff, l_inst, l_url, l_response, l_weight]
frame = dict(zip(_column_names, _column_values))
data = pd.DataFrame(frame)

In [None]:
# Show the assembled data-frame as this cell's output.
data

**Model Pipeline & Split**

In [None]:
# One-hot encode categorical features and standardize numerical features.
#
# sparse_threshold=0 forces ColumnTransformer to return a dense ndarray.
# With the default threshold the one-hot columns make the result a SciPy sparse
# matrix, which pd.DataFrame(...) does not expand into numeric columns, so the
# later torch.tensor(...) conversion cannot work.
# NOTE(review): a dense one-hot encoding of "URL" can be very large if there
# are many distinct URLs - watch memory, or reduce the URL cardinality first.
# NOTE(review): "Response" and "Weight" were printed as object dtype in the
# exploration cell; StandardScaler needs them to be convertible to float.
# NOTE(review): the transformer is fitted on the full data before the
# train/test split, so test rows influence the scaling statistics.
ct = ColumnTransformer(
    transformers=[
        ("encoder", OneHotEncoder(), ["Instruction", "URL"]),
        ("scaler", StandardScaler(), ["Lat", "Lon", "Time", "Time Diff", "Response", "Weight"]),
    ],
    sparse_threshold=0,
)

data_encoded = pd.DataFrame(ct.fit_transform(data))

In [None]:
# Hold out 20% of the encoded rows for testing (80/20 split, fixed seed).
train_data, test_data = train_test_split(
    data_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the autoencoder architecture
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.

    Args:
        input_dim: number of features per sample (size of the last input axis).

    The decoder ends in a plain linear layer. The previous final ``Sigmoid``
    limited every reconstructed value to (0, 1), but the inputs built in this
    notebook contain StandardScaler columns, which are unbounded and frequently
    negative, so those columns could never be reconstructed and the MSE had a
    floor unrelated to how anomalous a sample is. Parameter names in the
    ``state_dict`` are unchanged because ``Sigmoid`` carried no parameters.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Compress the input down to a 16-dimensional code.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        # Mirror of the encoder; linear output so any real value can be produced.
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same trailing size as ``x``)."""
        x = self.encoder(x)
        x = self.decoder(x)
        return x

**Dataloader**

In [None]:
# Turn both splits into float32 PyTorch tensors.
train_data_tensor, test_data_tensor = (
    torch.tensor(split.values, dtype=torch.float32)
    for split in (train_data, test_data)
)

# Mini-batch iterator over the training tensor, reshuffled on every pass.
batch_size = 64
train_loader = DataLoader(
    train_data_tensor,
    batch_size=batch_size,
    shuffle=True,
)

**Model Initialization**

In [None]:
# Seed torch's global RNG so weight initialisation (and anything else drawn
# from that RNG afterwards) is repeatable across Restart & Run All.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Initialize the autoencoder model; one input unit per encoded feature column.
input_dim = train_data.shape[1]
autoencoder = Autoencoder(input_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

**Training**

In [None]:
# Train the model
# The loop variable is called `batch`, not `data`, so it no longer overwrites
# the `data` DataFrame that earlier cells need when they are re-run.
num_epochs = 10
num_train_samples = len(train_loader.dataset)
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = autoencoder(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # (possibly smaller) last batch is accounted for correctly.
        running_loss += loss.item() * batch.size(0)
    # Report the mean loss per sample, which is on the same scale as the
    # reconstruction loss printed in the evaluation cell.
    epoch_loss = running_loss / num_train_samples
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

**Evaluation & Visualization**

In [None]:
# Evaluate on the held-out split without tracking gradients. The reconstruction
# error can serve as the anomaly score, but it still has to be normalized.
with torch.no_grad():
    reconstructions = autoencoder(test_data_tensor)
    reconstruction_loss = criterion(reconstructions, test_data_tensor)

_test_loss_value = reconstruction_loss.item()
print(f"Reconstruction Loss: {_test_loss_value}")

In [None]:
# Some visualization

# Per-sample reconstruction error: squared difference averaged over the features.
_squared_diff = (reconstructions - test_data_tensor) ** 2
reconstruction_errors = torch.mean(_squared_diff, dim=1).detach().numpy()

# Histogram of the errors with a dashed marker at their mean.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(reconstruction_errors, bins=50, alpha=0.5, color='blue', label="Reconstruction Errors")
ax.axvline(np.mean(reconstruction_errors), color='red', linestyle='dashed', linewidth=1, label="Mean Error")
ax.set_xlabel("Reconstruction Error")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Log Reconstruction Errors")
ax.legend()
ax.grid(True)
plt.show()