In [1]:
#Import library
import os
import random
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


import torch
from torch.utils.data import DataLoader, TensorDataset




In [3]:
# Load the labelled table from disk.

# Pieces of the CSV location, relative to the notebook's working directory.
DATA_PATH = os.path.join('..', 'data')  # '..' climbs one directory level
SOURCE = ''  # joined between DATA_PATH and NAME; empty adds no sub-folder
NAME = 'augmented_beos.csv'
csv_path = os.path.join(DATA_PATH, SOURCE, NAME)

# dropna(axis=1) discards every column that contains at least one missing value.
training_data = pd.read_csv(csv_path).dropna(axis=1)


In [4]:
# Inspect the class distribution: number of rows per disease label.
disease_labels = training_data['diseases']
class_count = disease_labels.value_counts()

In [4]:
# # Encode the label
# label_encoder = LabelEncoder()
# training_data['diseases'] = label_encoder.fit_transform(training_data['diseases'])

In [4]:
# Oversample every minority class up to the size of the largest class.
# Each batch of resampled rows is perturbed by switching a random handful of
# feature cells from 1 to 0, so the added rows are not all exact duplicates.
# NOTE(review): `random` and `np.random` are not seeded here, so the augmented
# table differs from run to run.
max_class = class_count.max()
balance_size = max_class

# Positional index of the label column; it is excluded from flipping so an
# integer-encoded label equal to 1 can never be overwritten.
label_position = training_data.columns.get_loc('diseases')

# Collect the pieces and concatenate once at the end instead of re-copying the
# growing frame for every class.
augmented_parts = [training_data]
for disease in class_count.index:
    shortfall = balance_size - class_count[disease]
    if shortfall <= 0:
        continue

    # Draw the missing rows (with replacement) from the existing rows of this class.
    new_samples = training_data[training_data['diseases'] == disease].sample(n=shortfall, replace=True)

    # Number of (row, column) draws; drawing with replacement means fewer
    # distinct cells may actually be flipped.
    num_to_flip = random.randint(1, class_count[disease])

    ones_mask = new_samples.values == 1
    ones_mask[:, label_position] = False
    ones_indices = np.argwhere(ones_mask)

    # Guard: np.random.choice raises ValueError on an empty population.
    if ones_indices.shape[0] > 0:
        chosen = np.random.choice(ones_indices.shape[0], size=num_to_flip, replace=True)
        for row_pos, col_pos in ones_indices[chosen]:
            new_samples.iat[row_pos, col_pos] = 0

    augmented_parts.append(new_samples)

training_data = pd.concat(augmented_parts)

In [5]:
# Persist the balanced table without the index column.
# NOTE(review): this is the same file the load cell reads
# ('..'/'data'/'augmented_beos.csv'), so the notebook overwrites its own input.
training_data.to_csv(path_or_buf="../data/augmented_beos.csv", index=False)

In [9]:
# Hold out 20% of the rows for evaluation, stratified on the disease label.
train_df, test_df = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
    stratify=training_data['diseases'],
)

# Separate the label column from the feature columns in each partition.
y_train = train_df['diseases']
X_train = train_df.drop(columns=['diseases'])
y_test = test_df['diseases']
X_test = test_df.drop(columns=['diseases'])

# Print the resulting shapes as a quick sanity check.
for caption, shape in (
    ("X_train shape: ", X_train.shape),
    ("X_test shape: ", X_test.shape),
    ("y_train shape: ", y_train.shape),
    ("y_test shape: ", y_test.shape),
):
    print(caption, shape)

X_train shape:  (280572, 377)
X_test shape:  (70143, 377)
y_train shape:  (280572,)
y_test shape:  (70143,)


In [10]:
# One-hot encode the labels.
# Both splits are encoded against one shared, sorted list of classes so that
# column k means the same disease in y_train_ohe and y_test_ohe even if a class
# happens to be absent from one split (independent get_dummies calls would
# silently produce fewer / shifted columns in that case).
disease_dtype = pd.CategoricalDtype(categories=sorted(set(y_train) | set(y_test)))
y_train_ohe = pd.get_dummies(y_train.astype(disease_dtype))
y_test_ohe = pd.get_dummies(y_test.astype(disease_dtype))

In [11]:
# Turn the feature frames and one-hot label frames into float32 tensors.
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_ohe_tensor = torch.tensor(y_train_ohe.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test_ohe_tensor = torch.tensor(y_test_ohe.to_numpy(), dtype=torch.float32)

In [None]:
# Pair each feature row with its one-hot label so a DataLoader can batch them together.
testing_tensor = TensorDataset(X_test_tensor, y_test_ohe_tensor)
training_tensor = TensorDataset(X_train_tensor, y_train_ohe_tensor)

In [None]:
# Mini-batch iterators: the training set is reshuffled every epoch, the test
# set is read in a fixed order.
batch_size = 32
training_loader = DataLoader(
    training_tensor,
    batch_size=batch_size,
    shuffle=True,
)
testing_loader = DataLoader(
    testing_tensor,
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# Define the model
class ANN(torch.nn.Module):
    """Fully connected classifier: in_features -> 1024 -> 2048 -> 1024 -> num_classes.

    ReLU follows each of the first three layers; the last layer returns raw
    logits (no softmax).

    Args:
        in_features: Width of one input row. When omitted, it is read from the
            notebook-level ``X_train_tensor.shape[1]`` at construction time,
            which is what the original zero-argument constructor did.
        num_classes: Number of output logits. Defaults to 773, the value that
            was previously hard-coded.
    """

    def __init__(self, in_features=None, num_classes=773):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the global training tensor.
            in_features = X_train_tensor.shape[1]
        # Attribute names are kept as fc1..fc4 so existing state_dict files still load.
        self.fc1 = torch.nn.Linear(in_features, 1024)
        self.fc2 = torch.nn.Linear(1024, 2048)
        self.fc3 = torch.nn.Linear(2048, 1024)
        self.fc4 = torch.nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the class logits for a batch ``x`` whose last dimension is ``in_features``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        # No activation on the output layer: the result is unnormalised logits.
        return self.fc4(x)

In [None]:
# Instantiate the network, the loss and the optimizer.
model = ANN()
loss_fn = torch.nn.CrossEntropyLoss()
# Adam has no `momentum` keyword (passing it raises TypeError); its
# momentum-like first-moment decay is betas[0], which already defaults to 0.9.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_one_epoch(epoch_index, tb_writer, report_every=1000):
    """Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``,
    ``training_loader`` and ``X_train_tensor``.

    Args:
        epoch_index: Zero-based epoch number, used to compute the TensorBoard step.
        tb_writer: Writer whose ``add_scalar`` receives the 'Loss/train' series.
        report_every: Number of batches averaged into each reported loss value.

    Returns:
        Tuple ``(last_loss, accuracy)``. ``last_loss`` is the mean batch loss of
        the most recent full reporting window; if the loader is too short to
        complete a window, it is the mean over all batches seen instead of 0.
        ``accuracy`` is the number of correct predictions divided by
        ``len(X_train_tensor)``.
    """
    running_loss = 0.
    last_loss = 0.
    accuracy = 0
    batches_in_window = 0
    reported = False

    # enumerate() gives the batch index needed for intra-epoch reporting.
    for i, data in enumerate(training_loader):
        # Every data instance is an (inputs, one-hot labels) pair.
        inputs, labels = data

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()

        # Forward pass, loss, and backward pass.
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Count correct predictions; labels are one-hot, so argmax recovers the class index.
        predicted = torch.argmax(outputs, 1)
        accuracy += (predicted == torch.argmax(labels, 1)).sum().item()

        # Adjust learning weights.
        optimizer.step()

        # Gather data and report once per full window.
        running_loss += loss.item()
        batches_in_window += 1
        if i % report_every == report_every - 1:
            last_loss = running_loss / report_every  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(training_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            batches_in_window = 0
            reported = True

    # Short loaders never complete a window; fall back to the mean over what was seen.
    if not reported and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss, accuracy / len(X_train_tensor)

In [11]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# NOTE(review): the run directory is still named 'fashion_trainer'; rename if that is a leftover.
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 400

# Lowest validation loss seen so far; starts high so the first epoch always saves.
best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss, accuracy = train_one_epoch(epoch_number, writer)

    running_vloss = 0.0
    val_accuracy = 0
    # Switch to evaluation mode (the ANN defined in this notebook has no
    # dropout or batch-norm layers, so this does not change its outputs).
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, vdata in enumerate(testing_loader):
            vinputs, vlabels = vdata
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            # .item() keeps the running total a plain float, matching the training loss.
            running_vloss += vloss.item()
            val_accuracy += (torch.argmax(voutputs, 1) == torch.argmax(vlabels, 1)).sum().item()
    avg_vloss = running_vloss / (i + 1)
    val_accuracy /= len(X_test_tensor)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss), 'ACC train {} valid {}'.format(accuracy, val_accuracy))

    # Log the per-epoch loss and accuracy for both training and validation.
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    # The step was previously omitted here, so accuracy points carried no epoch index.
    writer.add_scalars('Training vs. Validation Accuracy',
                    { 'Training' : accuracy, 'Validation' : val_accuracy },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

loss: 10 to 6.6218791007995605
accuracy: 0 to 0.01986128625472888
Epoch: 0, Loss: 6.649628162384033, Acc: 0.0014372145854099538, Val_Loss: 6.6218791007995605, Val_Acc: 0.01986128625472888
Epoch: 1, Loss: 6.621870994567871, Acc: 0.02020909154129674, Val_Loss: 6.551183223724365, Val_Acc: 0.01197982345523329
Epoch: 2, Loss: 6.551131725311279, Acc: 0.011590440204918983, Val_Loss: 6.390519142150879, Val_Acc: 0.006546250278169275
Epoch: 3, Loss: 6.390387058258057, Acc: 0.006119752428197223, Val_Loss: 6.195268154144287, Val_Acc: 0.005081225428380684
Epoch: 4, Loss: 6.194941520690918, Acc: 0.004960708407705325, Val_Loss: 6.272689342498779, Val_Acc: 0.008827238335435058
loss: 6.6218791007995605 to 6.205638408660889
accuracy: 0.01986128625472888 to 0.08298716712410059
Epoch: 5, Loss: 6.271790981292725, Acc: 0.008873641020885973, Val_Loss: 6.205638408660889, Val_Acc: 0.08298716712410059
Epoch: 6, Loss: 6.204654693603516, Acc: 0.08298291569113794, Val_Loss: 6.098793029785156, Val_Acc: 0.0478822045

In [11]:
# Predict a class index for every training row.
# NOTE(review): `model_ann` is not defined in any visible cell (the trained
# network above is called `model`) — confirm which object is intended.
model_ann.eval()
# Inference only: no_grad keeps autograd from retaining activations for the
# whole training set.
with torch.no_grad():
    y_pred = torch.argmax(model_ann(X_train_tensor), 1)
print(y_pred.shape)

torch.Size([197977])


In [None]:
# Evaluate the model
# NOTE(review): `model_ann` is not defined in any visible cell, and no visible
# cell writes '../model/model_ann.pth' — confirm both before relying on this cell.
model = model_ann
model.load_state_dict(torch.load('../model/model_ann.pth'))
model.eval()
# Inference only, so skip gradient tracking.
with torch.no_grad():
    y_pred = model(X_test_tensor)
_, predicted = torch.max(y_pred, 1)
# The targets are one-hot rows of shape (N, C); decode them to class indices
# before comparing with the (N,) predictions.
true_classes = torch.argmax(y_test_ohe_tensor, 1)
accuracy = (true_classes == predicted).sum().item() / y_test_ohe_tensor.size(0)
print(f"Accuracy: {accuracy}")

In [None]:
# Look at the prediction stored at index 1 and the position of its largest value.
second_prediction = y_pred[1]
print(second_prediction)
print(torch.argmax(second_prediction))

In [12]:
# Ask PyTorch to release its cached CUDA memory.
# NOTE(review): no visible cell moves the model or any tensor to a CUDA device,
# so this is presumably left over from a GPU run — confirm it is still needed.
torch.cuda.empty_cache()

In [16]:
# Force a garbage-collection pass to free unreachable Python objects.
# NOTE(review): this import sits outside the import cell at the top of the notebook.
import gc

# As the last expression of the cell, the return value of collect() is displayed as the cell output.
gc.collect()

0