In [1]:
import pandas as pd
import numpy as np
import pathlib
import os

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Seed NumPy's global random generator for repeatable runs.
np.random.seed(0)

In [2]:
# Directory holding the wine benchmark CSV files. The WINE_BENCHMARKS_DIR
# environment variable overrides the default, so the notebook can run on machines
# that do not share this local Windows layout.
dataset_path = os.environ.get(
    "WINE_BENCHMARKS_DIR",
    "D:\\Storage\\AnomalyDetection\\wine\\benchmarks",
)

In [3]:
# Sort the matches: glob order depends on the filesystem, and a later cell reads
# only a leading slice of this list, so an unsorted list would make that subset
# vary between machines.
file_list = sorted(pathlib.Path(dataset_path).glob('*.csv'))
print(f"Number of files: {len(file_list)}")
if not file_list:
    # Fail with an actionable message instead of an IndexError on file_list[0].
    raise FileNotFoundError(f"No CSV files found in {dataset_path!r}")
print(f"Example path: {file_list[0]}")

Number of files: 1210
Example path: D:\Storage\AnomalyDetection\wine\benchmarks\wine_benchmark_0001.csv


In [4]:
# Number of benchmark files to load; only this leading subset of file_list is read.
MAX_FILES = 50

# Read each selected CSV once, then stack the rows into one frame with a fresh index.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in file_list[:MAX_FILES]]
df = pd.concat(li, axis=0, ignore_index=True)

df.shape
df.describe()

(185150, 50)

Unnamed: 0,original.label,diff.score,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,...,noise..24,noise..25,noise..26,noise..27,noise..28,noise..29,noise..30,noise..31,noise..32,noise..33
count,185150.0,185150.0,185150.0,185150.0,185150.0,185150.0,185150.0,185150.0,185150.0,185150.0,...,37030.0,37030.0,37030.0,37030.0,37030.0,37030.0,37030.0,37030.0,37030.0,37030.0
mean,5.82182,0.18406,-0.003067,-0.001873,-0.004124,0.000977,-0.005601,0.000519,-0.000257,-0.002645,...,-0.002497,-0.001473,0.004486,-0.002218,0.004315,-0.006058,-0.006279,-0.004379,0.000239,-0.009774
std,0.873348,0.179302,0.994429,0.994004,0.99816,0.99986,0.975796,0.9946,0.998973,1.001061,...,1.010434,0.997907,0.998337,1.000018,1.016338,0.992243,1.007432,1.003568,1.003339,0.993068
min,3.0,0.002164,-2.634386,-1.577208,-2.192664,-1.017956,-1.342536,-1.663455,-1.941631,-2.529997,...,-2.975988,-2.789406,-2.433288,-2.634386,-2.557251,-3.100376,-2.975988,-3.100376,-3.100376,-3.100376
25%,5.0,0.061595,-0.628884,-0.66611,-0.472297,-0.765739,-0.514759,-0.762016,-0.667787,-0.792562,...,-0.702685,-0.649336,-0.68054,-0.628884,-0.706019,-0.737004,-0.67481,-0.706019,-0.609926,-0.68054
50%,6.0,0.127601,-0.166076,-0.301671,-0.059409,-0.513522,-0.257863,-0.085936,0.039904,0.061149,...,-0.177258,-0.200775,-0.119451,-0.166076,-0.166076,-0.177258,-0.115064,-0.166076,-0.128224,-0.142276
75%,6.0,0.241031,0.373866,0.366468,0.491108,0.558401,0.25593,0.590143,0.71221,0.758124,...,0.528136,0.477463,0.593772,0.451001,0.52909,0.600438,0.533803,0.528136,0.559923,0.516365
max,9.0,0.996894,6.69891,7.533774,9.23057,12.685846,15.840967,14.562446,5.736815,14.767654,...,15.840967,15.840967,12.685846,9.23057,15.840967,14.767654,9.870119,9.870119,15.840967,14.767654


In [5]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,...,noise..24,noise..25,noise..26,noise..27,noise..28,noise..29,noise..30,noise..31,noise..32,noise..33
0,wine_point_3594,wine,regression,7,0.050492,nominal,-1.245962,-0.362411,-0.265853,-0.261304,...,,,,,,,,,,
1,wine_point_5089,wine,regression,5,0.082237,anomaly,0.75954,0.973867,0.215849,-0.53454,...,,,,,,,,,,
2,wine_point_1912,wine,regression,6,0.290201,nominal,-0.088942,-0.969809,-0.403482,-0.870829,...,,,,,,,,,,
3,wine_point_4908,wine,regression,5,0.053559,anomaly,0.219597,0.973867,0.284664,0.138039,...,,,,,,,,,,
4,wine_point_2246,wine,regression,7,0.4203,nominal,0.219597,-0.180191,-0.541112,0.34822,...,,,,,,,,,,


In [6]:
# Keep only the label and the wine feature columns: remove the benchmark metadata
# and every column whose name matches 'noise'.
metadata_columns = ["point.id", "motherset", "origin", "original.label", "diff.score"]
noise_columns = list(df.filter(regex='noise').columns)
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=metadata_columns + noise_columns, errors="ignore")

In [7]:
# Count missing values per column.
df.isnull().sum()

ground.truth            0
fixed.acidity           0
volatile.acidity        0
citric.acid             0
residual.sugar          0
chlorides               0
free.sulfur.dioxide     0
total.sulfur.dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [8]:
# Take a copy of df and show its shape.
# NOTE(review): `data` is reassigned from `df` again in a later cell, after the
# label column has been encoded, so this copy is superseded there.
data = df.copy()
data.shape

(185150, 12)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in "ground.truth" with integer codes; this overwrites
# the column in df.
# NOTE(review): LabelEncoder presumably numbers classes in sorted order, which would
# give 'anomaly' -> 0 and 'nominal' -> 1 — confirm that is the intended positive class.
enc = LabelEncoder()
df["ground.truth"] = enc.fit_transform(df["ground.truth"])

In [10]:
# Copy df again so `data` carries the current contents of "ground.truth".
data = df.copy()

In [11]:
# Separate the feature columns from the "ground.truth" target column.
data_x = data.drop(columns="ground.truth")
data_y = data["ground.truth"]

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=1 fixes the shuffle.
# NOTE(review): no `stratify` argument is passed — confirm an unstratified split is
# acceptable if the label classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=1)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from typing import Dict, List, Tuple


In [16]:
class DataHandler:
	"""Builds the training/validation datasets and their DataLoaders.

	The datasets are created from the notebook-level ``X_train``/``y_train`` and
	``X_test``/``y_test`` variables, so those must exist before instantiation.

	Args:
		run_config: Object exposing ``batch_size`` and ``workers`` attributes,
			which are forwarded to every DataLoader.
	"""

	def __init__(self, run_config):
		self._training_dataset = None
		self._validation_dataset = None
		self._run_config = run_config

		self.load_datasets()

	def load_datasets(self):
		"""Wrap the notebook-level train/test splits in CustomDataset objects."""
		self._training_dataset = CustomDataset(X_train, y_train)
		self._validation_dataset = CustomDataset(X_test, y_test)

	def _make_loader(self, dataset: Dataset, shuffle: bool) -> DataLoader:
		"""Create a DataLoader for ``dataset`` from the run configuration."""
		return DataLoader(
			dataset,
			batch_size=self._run_config.batch_size,
			shuffle=shuffle,
			num_workers=self._run_config.workers,
			pin_memory=True,
		)

	def get_data_loaders(self) -> Tuple[DataLoader, DataLoader]:
		"""Return ``(training_loader, validation_loader)``.

		Only the training loader shuffles. The validation loader keeps dataset
		order so evaluation batches are the same on every run.
		"""
		return (
			self._make_loader(self._training_dataset, shuffle=True),
			self._make_loader(self._validation_dataset, shuffle=False),
		)

	def get_datasets(self) -> Tuple[Dataset, Dataset]:
		"""Return ``(training_dataset, validation_dataset)``."""
		return self._training_dataset, self._validation_dataset

	def get_datasets_sizes(self) -> Tuple[int, int]:
		"""Return the number of samples in the training and validation datasets."""
		return len(self._training_dataset), len(self._validation_dataset)


class CustomDataset(Dataset):
	"""Map-style dataset over a feature table and a matching label column.

	Args:
		X: Feature table exposing ``to_numpy()`` (e.g. a pandas DataFrame), one
			row per sample. Values must be numeric.
		Y: Labels exposing ``to_numpy()`` (e.g. a pandas Series), aligned with ``X``.
	"""

	def __init__(self, X, Y):
		# Store features as float32 so batches match the default dtype of
		# torch.nn.Linear parameters; pandas floats are float64 by default.
		self.X = X.to_numpy().astype(np.float32)
		self.Y = Y.to_numpy()

	def __getitem__(self, idx):
		"""Return the ``(features, label)`` pair at position ``idx``."""
		return self.X[idx], self.Y[idx]

	def __len__(self):
		"""Return the number of samples, i.e. the number of rows in ``X``."""
		return len(self.X)

In [21]:
class LinearModel(nn.Module):
    """Three-layer fully connected network with a sigmoid output.

    Args:
        in_size: Number of input features per sample.
        out_size: Number of output units.
    """

    def __init__(self, in_size=11, out_size=1):
        super().__init__()
        self.linear_1 = nn.Linear(in_size, 256)
        self.linear_2 = nn.Linear(256, 256)
        self.linear_3 = nn.Linear(256, out_size)

    def forward(self, x):
        """Run the three layers in order and return sigmoid activations.

        Args:
            x: Float tensor whose last dimension is ``in_size``.

        Returns:
            Tensor whose last dimension is ``out_size``, with values in [0, 1].
        """
        out = F.leaky_relu(self.linear_1(x))
        out = F.leaky_relu(self.linear_2(out))
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(self.linear_3(out))

In [22]:
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LinearModel()
model.to(device)
# Adam over all model parameters, with weight decay.
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)

LinearModel(
  (linear_1): Linear(in_features=11, out_features=256, bias=True)
  (linear_2): Linear(in_features=256, out_features=256, bias=True)
  (linear_3): Linear(in_features=256, out_features=1, bias=True)
)

NameError: name 'optim' is not defined