In [1]:
#Import library
import os
import random
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


import torch
from torch.utils.data import DataLoader, TensorDataset




In [2]:
# Load the training table from disk.

# Pieces of the CSV location, relative to the notebook's working directory.
DATA_PATH = os.path.join('..', 'data')  # '..' climbs one directory level
SOURCE = ''  # extra path component between DATA_PATH and NAME (empty here)
NAME = 'augmented_beos.csv'

# Read the file, then discard every column that contains a missing value.
csv_path = os.path.join(DATA_PATH, SOURCE, NAME)
training_data = pd.read_csv(csv_path).dropna(axis=1)


In [3]:
# Number of rows for each distinct value of the 'diseases' column.
class_count = training_data.loc[:, 'diseases'].value_counts()

In [None]:
# # Encode the label
# label_encoder = LabelEncoder()
# training_data['diseases'] = label_encoder.fit_transform(training_data['diseases'])

In [None]:
# Oversample every under-represented disease up to the size of the largest
# class by appending rows drawn (with replacement) from that disease's own rows.
# An earlier experiment also flipped random 1s to 0 in the copies; that step is
# disabled, so the appended rows are exact duplicates of existing rows.
#
# NOTE(review): the duplicates are created before the train/test split further
# down, so copies of one original row can land on both sides of the split and
# inflate test accuracy. Consider splitting first and oversampling only the
# training part.
BALANCE_SEED = 42  # fixed so the resampled rows are identical on every run

# Recount from the current frame instead of reusing `class_count`, so that
# re-running this cell on already balanced data appends nothing.
current_counts = training_data['diseases'].value_counts()
max_class = current_counts.max()
balance_size = max_class

extra_rows = []
for disease, count in current_counts.items():
    if count < balance_size:
        rows_of_disease = training_data[training_data['diseases'] == disease]
        extra_rows.append(
            rows_of_disease.sample(
                n=balance_size - count,
                replace=True,
                random_state=BALANCE_SEED,
            )
        )

# A single concat at the end avoids re-copying the growing frame once per class.
if extra_rows:
    training_data = pd.concat([training_data, *extra_rows])

In [None]:
# Write the current training table to CSV; the row index is not written out.
AUGMENTED_CSV = "../data/augmented_biggest_real.csv"
training_data.to_csv(AUGMENTED_CSV, index=False)

In [4]:
# Stratified 80/20 split on the 'diseases' label, with a fixed seed.
train_df, test_df = train_test_split(
    training_data,
    test_size=0.2,
    stratify=training_data['diseases'],
    random_state=42,
)

# Separate the feature columns from the label column in each partition.
X_train, y_train = train_df.drop(columns=['diseases']), train_df['diseases']
X_test, y_test = test_df.drop(columns=['diseases']), test_df['diseases']

# Report the size of every piece.
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + " shape: ", split_part.shape)

X_train shape:  (753829, 377)
X_test shape:  (188458, 377)
y_train shape:  (753829,)
y_test shape:  (188458,)


In [5]:
# One-hot encode the labels of both splits against ONE shared, sorted list of
# classes. Encoding each split on its own would silently produce different
# column sets (and therefore different class indices) if any disease were
# missing from one split; a categorical dtype makes get_dummies emit a column
# for every category, observed or not.
disease_dtype = pd.CategoricalDtype(
    categories=sorted(pd.concat([y_train, y_test]).unique())
)
y_train_ohe = pd.get_dummies(y_train.astype(disease_dtype))
y_test_ohe = pd.get_dummies(y_test.astype(disease_dtype))

In [6]:
# Turn the feature frames and the one-hot label frames into float32 tensors.
X_train_tensor, X_test_tensor, y_train_ohe_tensor, y_test_ohe_tensor = (
    torch.tensor(frame.to_numpy(), dtype=torch.float32)
    for frame in (X_train, X_test, y_train_ohe, y_test_ohe)
)

In [7]:
# Pair every feature row with its one-hot label row, for train and test alike.
training_tensor, testing_tensor = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train_tensor, y_train_ohe_tensor),
        (X_test_tensor, y_test_ohe_tensor),
    )
)

In [15]:
batch_size = 16

# Training batches are reshuffled at the start of every epoch.
training_loader = DataLoader(training_tensor, batch_size=batch_size, shuffle=True)

# Evaluation reads the test set in a fixed order. Shuffling buys nothing here
# and makes the per-batch averaged validation loss vary from run to run,
# because a different set of samples ends up in the final, smaller batch.
testing_loader = DataLoader(testing_tensor, batch_size=batch_size, shuffle=False)

In [16]:
# Define the model
class ANN(torch.nn.Module):
    """Fully connected classifier: four ReLU hidden layers and a linear output.

    The forward pass returns raw, un-normalised class scores (no softmax is
    applied inside the network).

    Args:
        in_features: Width of one input row. ``None`` (the default) reads
            ``X_train_tensor.shape[1]`` from the notebook namespace when the
            model is constructed, so a bare ``ANN()`` behaves as before.
        hidden_sizes: Widths of the four hidden layers, in order.
        num_classes: Number of output scores per sample.

    Raises:
        ValueError: If ``hidden_sizes`` does not contain exactly four widths.
    """

    def __init__(self, in_features=None, hidden_sizes=(2048, 4096, 2048, 1024), num_classes=773):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: take the width from the training tensor.
            in_features = X_train_tensor.shape[1]
        if len(hidden_sizes) != 4:
            raise ValueError(
                "hidden_sizes must contain exactly 4 widths, got {}".format(len(hidden_sizes))
            )
        h1, h2, h3, h4 = hidden_sizes

        # The attribute names fc1..fc5 are kept unchanged so that state_dict
        # checkpoints saved from the original definition still load.
        self.fc1 = torch.nn.Linear(in_features, h1)
        self.fc2 = torch.nn.Linear(h1, h2)
        self.fc3 = torch.nn.Linear(h2, h3)
        self.fc4 = torch.nn.Linear(h3, h4)
        self.fc5 = torch.nn.Linear(h4, num_classes)

    def forward(self, x):
        """Return the raw class scores for a batch ``x`` of input rows."""
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = torch.relu(hidden(x))
        # No activation on the last layer: the result is the raw scores.
        return self.fc5(x)

In [24]:
# A fresh network, plus the objective and the optimiser used to train it.
model = ANN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [17]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [58]:
def train_one_epoch(epoch_index, tb_writer):
    """Run one optimisation pass over ``training_loader``.

    Works on the notebook-level ``model``, ``optimizer``, ``loss_fn``,
    ``training_loader`` and ``device``.

    Args:
        epoch_index: Zero-based epoch number; only used to compute the global
            step under which the loss is logged.
        tb_writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        A ``(last_loss, top5_accuracy)`` tuple. ``last_loss`` is the mean
        per-batch loss of the most recent 1000-batch reporting window, or the
        mean over all batches when the loader yields fewer than 1000 batches.
        ``top5_accuracy`` is the fraction of samples whose true class is among
        the highest-scoring classes (five, or all of them if there are fewer).
    """
    report_every = 1000
    running_loss = 0.
    last_loss = 0.
    top5_hits = 0
    batches_in_window = 0
    reported = False

    # Move model to the selected device
    model.to(device)

    for i, (inputs, labels) in enumerate(training_loader):
        # Move the batch to the same device as the model
        inputs, labels = inputs.to(device), labels.to(device)

        # Gradients accumulate by default, so clear them for every batch
        optimizer.zero_grad()

        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Top-k hits for the whole batch at once. k is capped by the number of
        # classes so topk cannot fail on a narrow output layer, and only one
        # device-to-host transfer happens per batch instead of one per sample.
        k = min(5, outputs.shape[1])
        top_k = outputs.topk(k, dim=1).indices
        true_class = labels.argmax(dim=1, keepdim=True)
        top5_hits += (top_k == true_class).any(dim=1).sum().item()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        batches_in_window += 1
        if i % report_every == report_every - 1:
            last_loss = running_loss / report_every  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            batches_in_window = 0
            reported = True
            print(top5_hits)

    # With fewer batches than one reporting window, no window ever closed;
    # return the mean of what was seen rather than the 0.0 placeholder.
    if not reported and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss, top5_hits / len(training_loader.dataset)


In [59]:
# Run-level state: a timestamped TensorBoard writer and the epoch counter.
# `epoch_number` lives outside the loop and is advanced at the end of each
# pass, so logged steps and checkpoint names follow it rather than `epoch`.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 10
best_vloss = 1_000_000.
best_vacc = 0

# Create the checkpoint directory up front so a missing folder fails here,
# not at the first torch.save after a full epoch of training.
os.makedirs('../model', exist_ok=True)

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss, accuracy = train_one_epoch(epoch_number, writer)

    running_vloss = 0.0
    val_hits = 0
    # Switch to evaluation mode for the validation pass.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vinputs, vlabels in testing_loader:
            vinputs, vlabels = vinputs.to(device), vlabels.to(device)
            voutputs = model(vinputs)

            running_vloss += loss_fn(voutputs, vlabels).item()

            # Top-k hits for the whole batch at once (k capped by class count).
            k = min(5, voutputs.shape[1])
            top_k = voutputs.topk(k, dim=1).indices
            true_class = vlabels.argmax(dim=1, keepdim=True)
            val_hits += (top_k == true_class).any(dim=1).sum().item()

    # Mean of the per-batch losses, and hits as a fraction of all test samples.
    avg_vloss = running_vloss / len(testing_loader)
    val_accuracy = val_hits / len(testing_loader.dataset)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss), 'ACC train {} valid {}'.format(accuracy, val_accuracy))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                       {'Training': avg_loss, 'Validation': avg_vloss},
                       epoch_number + 1)
    writer.add_scalars('Training vs. Validation Accuracy',
                       {'Training': accuracy, 'Validation': val_accuracy},
                       epoch_number + 1)
    writer.flush()

    # Save a checkpoint only when BOTH metrics beat the best values so far:
    # lower validation loss and higher validation top-5 accuracy.
    if avg_vloss < best_vloss and val_accuracy > best_vacc:
        best_vloss = avg_vloss
        best_vacc = val_accuracy
        model_path = '../model/model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1


EPOCH 1:
  batch 1000 loss: 0.27830568918405335
31674
  batch 2000 loss: 0.28556937208864835
63312
  batch 3000 loss: 0.24481931295990944
95010
  batch 4000 loss: 0.27645748547883703
126707
  batch 5000 loss: 0.2645503236782097
158386
  batch 6000 loss: 0.3222263813610189
190062
  batch 7000 loss: 0.2521832288055448
221726
  batch 8000 loss: 0.25412584209558553
253393
  batch 9000 loss: 0.25189784551458433
285050
  batch 10000 loss: 0.27936381357896606
316685
  batch 11000 loss: 0.26498294360283764
348317
  batch 12000 loss: 0.26045931453770027
379958
  batch 13000 loss: 0.26671574972057716
411588
  batch 14000 loss: 0.37020779536274495
443216
  batch 15000 loss: 0.2621216626699606
474875
  batch 16000 loss: 0.30343709855806084
506517
  batch 17000 loss: 0.25872717176307924
538165
  batch 18000 loss: 0.2568693648930639
569839
  batch 19000 loss: 0.272896213992266
601517
  batch 20000 loss: 0.2617223817824852
633198
  batch 21000 loss: 0.25837669902457855
664882
  batch 22000 loss: 0.25

In [18]:
# Rebuild the architecture and restore trained weights from disk.
# NOTE(review): the training cell saves checkpoints as
# '../model/model_<timestamp>_<epoch>'; nothing visible in this notebook writes
# 'model_latest.pth', so it is presumably copied or renamed by hand — confirm.
model_load = ANN()
PATH = '../model/model_latest.pth'
# map_location='cpu' lets a checkpoint saved from a GPU model load on a
# CPU-only machine; the inference cells below feed CPU tensors to this model.
model_load.load_state_dict(torch.load(PATH, map_location='cpu'))

<All keys matched successfully>

In [19]:
# Put the restored network into evaluation mode. eval() returns the module
# itself, so the notebook displays its layer summary as the cell output.
model_load.eval()

ANN(
  (fc1): Linear(in_features=377, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=4096, bias=True)
  (fc3): Linear(in_features=4096, out_features=2048, bias=True)
  (fc4): Linear(in_features=2048, out_features=1024, bias=True)
  (fc5): Linear(in_features=1024, out_features=773, bias=True)
)

In [35]:
# Score the first NUM test rows and take the highest-scoring class for each.
NUM = 20000
# Inference only: without no_grad an autograd graph would be recorded for the
# whole batch through every layer, costing memory for nothing.
with torch.no_grad():
    # The slice is already (rows, features); the former .squeeze(0) was a no-op
    # for NUM > 1 and dropped the batch dimension when NUM == 1.
    y_pred = model_load(X_test_tensor[:NUM])
predicted = y_pred.argmax(dim=1)
print(predicted[0])

tensor(56)


In [36]:
# Recover the integer class index of each of the first NUM one-hot test labels.
_, y_tests = y_test_ohe_tensor[:NUM].max(dim=1)

In [37]:
# Top-1 accuracy on the scored subset: exact matches over rows evaluated.
num_correct = (y_tests == predicted).sum().item()
num_evaluated = y_test_ohe_tensor[:NUM].shape[0]
accuracy = num_correct / num_evaluated
print(f"Accuracy: {accuracy}")

Accuracy: 0.9023


In [38]:
# Show the dimensions of the training feature tensor.
print(X_train_tensor.size())

torch.Size([753829, 377])
