In [None]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, Adam, SGD
import random


class MyDataset:
    """Map-style dataset of min-max normalized feature rows with random masking.

    Every record of the main JSON file is converted to a list of its values and
    rescaled in place with the per-feature bounds read from the two auxiliary
    JSON files.  Indexing returns a randomly masked *copy* of the stored row as
    a ``float32`` tensor; the stored rows themselves are never altered by
    indexing.
    """

    def __init__(self, main_json_path, min_json_path, max_json_path):
        """Load the records and the per-feature bounds, then normalize the records.

        Args:
            main_json_path: JSON file holding a sequence of objects, one per row.
            min_json_path: JSON object whose values are the per-feature minima.
            max_json_path: JSON object whose values are the per-feature maxima.
        """
        # Load main JSON data
        with open(main_json_path, 'r') as file:
            self.data = [list(entry.values()) for entry in json.load(file)]

        # Load min and max values JSON data
        with open(min_json_path, 'r') as file:
            self.min_values = list(json.load(file).values())

        with open(max_json_path, 'r') as file:
            self.max_values = list(json.load(file).values())

        # normalize() rescales the row in place, so its return value is not needed.
        for entry in self.data:
            self.normalize(entry)

    def normalize(self, entry):
        """Min-max scale ``entry`` in place and return it.

        Only the first ``len(self.min_values)`` positions are touched.  A feature
        whose minimum equals its maximum is set to ``0.0`` to avoid dividing by
        zero.
        """
        for k, min_val in enumerate(self.min_values):
            span = self.max_values[k] - min_val
            entry[k] = (entry[k] - min_val) / span if span != 0 else 0.0
        return entry

    def denormalize(self, entry):
        """Undo :meth:`normalize` in place and return ``entry``.

        A feature whose minimum equals its maximum is restored to that minimum.
        """
        for k, min_val in enumerate(self.min_values):
            span = self.max_values[k] - min_val
            entry[k] = entry[k] * span + min_val if span != 0 else min_val
        return entry

    def __getitem__(self, index):
        """Return a randomly masked copy of row ``index`` as a float32 tensor.

        Each of the first 14 positions is zeroed independently whenever
        ``random.random() > 0.66``; the remaining positions are zeroed in runs
        of four under the same condition.
        """
        # Mask a copy.  Writing zeros into self.data[index] itself would make the
        # masking permanent and accumulate on every access, so the stored rows
        # would drift towards all zeros as training proceeds.
        entry = list(self.data[index])
        for k in range(14):
            if random.random() > 0.66:
                entry[k] = 0
        for k in range(14, len(entry), 4):
            if random.random() > 0.66:
                # NOTE(review): because of the "- 1" each run covers k-1 .. k+2,
                # so index 13 can be zeroed here too and, for rows of length
                # 14 + 4*m, the last position is never part of a run.  This
                # looks like an off-by-one; confirm the intended feature layout
                # before changing it.
                for j in range(4):
                    entry[k + j - 1] = 0
        return torch.tensor(entry, dtype=torch.float32)

    def __len__(self):
        """Return the number of rows loaded from the main JSON file."""
        return len(self.data)

class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_dim -> input_dim // 2 -> bottleneck_dim`` and the
    decoder mirrors it back to ``input_dim``.  Every linear layer is followed by
    ReLU, batch normalization and dropout; the decoder additionally ends with a
    sigmoid, so reconstructions lie in ``[0, 1]``.
    """

    def __init__(self, input_dim, bottleneck_dim, dropout_prob=0.1):
        """Build the encoder and decoder stacks.

        Args:
            input_dim: Number of features per sample.
            bottleneck_dim: Width of the encoded representation.
            dropout_prob: Drop probability used by every dropout layer.
        """
        super().__init__()
        hidden_dim = input_dim // 2

        # Compress: input -> hidden -> bottleneck.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p=dropout_prob),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.BatchNorm1d(bottleneck_dim),
            nn.Dropout(p=dropout_prob),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Expand: bottleneck -> hidden -> input, squashed into [0, 1] at the end.
        decoder_layers = [
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(p=dropout_prob),
            nn.Linear(hidden_dim, input_dim),
            nn.ReLU(),
            nn.BatchNorm1d(input_dim),
            nn.Dropout(p=dropout_prob),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))
    
def train_autoencoder(model, train_dataloader, val_dataloader, criterion, optimizer, num_epochs=10, device="cpu", save_path="best_model.pth"):
    """Train ``model`` to reconstruct its input and checkpoint the best epoch.

    Each epoch runs one optimization pass over ``train_dataloader`` followed by a
    gradient-free pass over ``val_dataloader``.  The loss always compares the
    model output with the very batch that was fed in.  Whenever the mean
    validation loss improves on the best value seen so far, the model's
    ``state_dict`` is written to ``save_path``.

    Args:
        model: Module to train; moved to ``device`` before training starts.
        train_dataloader: Sized iterable of training batches (tensors).
        val_dataloader: Sized iterable of validation batches (tensors).
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over the training data.
        device: Device on which the model and the batches are placed.
        save_path: Destination of the best checkpoint.

    Raises:
        ZeroDivisionError: If either loader reports a length of zero.
    """
    model.to(device)

    best_val_loss = float('inf')  # Any finite validation loss beats this
    for epoch in range(num_epochs):
        # Training
        model.train()
        running_loss = 0.0
        for k, data in enumerate(train_dataloader):
            data = data.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, data)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if k % 100 == 0:
                print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Step [{k}], Loss: {loss.item():.4f}")

        # Calculate average training loss for the epoch
        average_train_loss = running_loss / len(train_dataloader)
        print(f"Training - Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_train_loss:.4f}")

        # Validation
        model.eval()
        val_running_loss = 0.0
        with torch.no_grad():
            for val_data in val_dataloader:
                val_data = val_data.to(device)
                val_outputs = model(val_data)
                val_loss = criterion(val_outputs, val_data)
                val_running_loss += val_loss.item()

        # Calculate average validation loss for the epoch
        average_val_loss = val_running_loss / len(val_dataloader)
        print(f"Validation - Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_val_loss:.8f}")

        # Save the best model to the location the caller asked for
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with validation loss: {best_val_loss:.4f} at epoch {epoch + 1}")

    print("Training complete.")



# Hyperparameters
num_epochs = 200
batch_size = 16
learning_rate = 5e-4
bottleneck_dim = 2

# Input files: one record file per split plus the shared per-feature bounds
train_json_path = "/kaggle/input/parsed-veridion/train_data.json"
val_json_path = "/kaggle/input/parsed-veridion/val_data.json"
min_json_path = "/kaggle/input/parsed-veridion/min_values.json"
max_json_path = "/kaggle/input/parsed-veridion/max_values.json"

# Both splits are normalized with the same min/max files
train_dataset = MyDataset(
    main_json_path=train_json_path,
    min_json_path=min_json_path,
    max_json_path=max_json_path,
)
val_dataset = MyDataset(
    main_json_path=val_json_path,
    min_json_path=min_json_path,
    max_json_path=max_json_path,
)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=True)

# The model width follows the number of per-feature bounds
input_dim = len(train_dataset.min_values)
autoencoder = Autoencoder(input_dim=input_dim, bottleneck_dim=bottleneck_dim)

# Reconstruction loss and optimizer
criterion = nn.MSELoss()
optimizer = Adam(params=autoencoder.parameters(), lr=learning_rate)

# Run training with the default device and checkpoint path
train_autoencoder(
    model=autoencoder,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)
