In [None]:
import os
from copy import copy
import numpy as np
import pandas as pd
import itertools
from string import ascii_lowercase
from sklearn.model_selection import KFold

import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset

# Analyze the categorical variables

In [None]:
# Gather every label that occurs in columns X1..X8 of either split and, for each
# column, show the labels that occur in only one of the two splits.
df_train = pd.read_csv('./Dataset_Siemens/FinalHackathonData/train/train.csv')
df_test = pd.read_csv('./Dataset_Siemens/FinalHackathonData/test/test.csv')

total = set()
for col_no in range(1, 9):
    column = 'X{}'.format(col_no)
    train_labels = set(df_train[column].unique())
    test_labels = set(df_test[column].unique())

    # Grow the overall label set: train labels first, then test labels.
    total = total.union(train_labels)
    total = total.union(test_labels)

    # Labels seen in exactly one of the two splits for this column.
    only_in_one_split = train_labels.symmetric_difference(test_labels)
    print('X{}: '.format(col_no), only_in_one_split)

print(len(total), total)

In [None]:
# Lookup table from lowercase label ('a'..'z', 'aa', 'ab', ...) to integer code.
def iter_all_strings():
    """Yield lowercase strings without end, shortest first.

    All one-letter strings come first ('a'..'z'), then all two-letter strings
    ('aa', 'ab', ...), and so on; within one length the order is that of
    `itertools.product` over `ascii_lowercase`.
    """
    length = 1
    while True:
        for letters in itertools.product(ascii_lowercase, repeat=length):
            yield "".join(letters)
        length += 1


# The first 55 generated strings receive the codes 0..54 in generation order.
categorical_dict = dict(
    zip(itertools.islice(iter_all_strings(), 55), itertools.count())
)

# Build Dataset

In [None]:
class ASMEDataset(Dataset):
    """Map-style dataset backed by a CSV file held in memory.

    Rows are read by column *position*, not by column name: position 1 is the
    regression target and positions 2 onward are the features. The first
    `NUM_CATEGORICAL` features are categorical labels that are encoded to
    integers through the module-level `categorical_dict`. Position 0 is unused.
    """

    # Number of leading feature columns that hold categorical labels.
    NUM_CATEGORICAL = 8

    def __init__(self, csv_file):
        """Load the CSV at `csv_file` into `self.df`."""
        super(ASMEDataset, self).__init__()
        self.df = pd.read_csv(csv_file)

    def __getitem__(self, idx):
        """Return `(features, target)` for the row at position `idx`.

        Returns:
            features: 1-D float32 tensor with the encoded categorical values
                followed by the remaining feature columns.
            target: the value stored at column position 1, returned as read
                from the row (not converted to a tensor).

        Raises:
            IndexError: if `idx` is out of range. This also lets plain
                iteration over the dataset terminate normally.
            KeyError: if a categorical label is missing from `categorical_dict`.
        """
        # Samplers and `Subset` hand over row positions, so index by position.
        # `.iloc` also avoids integer keys on a label-indexed Series, whose
        # positional interpretation pandas has deprecated.
        row = self.df.iloc[idx]
        features = row.iloc[2:].to_list()
        target = row.iloc[1]
        for i in range(self.NUM_CATEGORICAL):
            features[i] = categorical_dict[features[i]]
        return torch.from_numpy(np.array(features, dtype=np.float32)), target

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

# Build model

In [None]:
class Net(nn.Module):
    """Fully connected regressor: two ReLU hidden layers and one scalar output.

    Args:
        in_features: width of the input vector (default 35).
        hidden_features: width of each of the two hidden layers (default 100).

    The layer attributes keep the names `fc1`, `fc2` and `fc3`, so the
    `state_dict` keys do not depend on the chosen sizes.
    """

    def __init__(self, in_features=35, hidden_features=100):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, hidden_features)
        self.fc3 = nn.Linear(hidden_features, 1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Map a `(..., in_features)` tensor to a `(..., 1)` prediction."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        # No activation on the last layer: the output is an unbounded real value.
        return self.fc3(x)

In [None]:
# Training configuration.
N_SPLITS = 5
LEARNING_RATE = 0.001
TRAIN_BATCH_SIZE = 8
NUM_EPOCHS = 200
LOG_EVERY = 400       # report the mean training loss every LOG_EVERY mini-batches
VALIDATE_EVERY = 10   # validate on epochs 0, VALIDATE_EVERY, 2 * VALIDATE_EVERY, ...

criterion = nn.MSELoss()
dataset = ASMEDataset('./Dataset_Siemens/FinalHackathonData/train/train.csv')
model_path = './model'

# K-fold cross-validation
kf = KFold(n_splits=N_SPLITS)
folds = kf.split(dataset)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(model_path, exist_ok=True)

for ifold, (train_idx, val_idx) in enumerate(folds):
    print('Fold: %d' % ifold)

    # Every fold starts from a freshly initialised network and optimizer.
    net = Net()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    train_dataset = Subset(dataset, train_idx)
    val_dataset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # Lowest validation error seen so far in this fold.
    best_val = float('inf')

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        net.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            # The labels are collated from raw CSV values, so their dtype may
            # differ from the network output (e.g. float64 or int64 versus
            # float32). Cast them so the loss and its backward pass see
            # matching dtypes.
            targets = labels.view(-1, 1).to(outputs.dtype)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % LOG_EVERY == LOG_EVERY - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / LOG_EVERY))
                running_loss = 0.0

        # Validation
        if epoch % VALIDATE_EVERY == 0:
            # Sum of the per-batch MSE values over the validation fold; with a
            # validation batch size of 1 this is the sum of squared errors.
            running_diff = 0.0
            net.eval()
            with torch.no_grad():
                for inputs, labels in val_loader:
                    outputs = net(inputs)
                    targets = labels.view(-1, 1).to(outputs.dtype)
                    running_diff += F.mse_loss(outputs, targets).item()
                print('Validation error: %3.4f' % running_diff)
            # Keep only the checkpoint with the lowest validation error per fold.
            if running_diff < best_val:
                best_val = running_diff
                print('Save the best model.')
                torch.save(net.state_dict(), os.path.join(model_path, 'model_fold{}.pth'.format(ifold)))


print('Finished Training')