In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive 
drive.mount('/content/gdrive')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('gdrive/My Drive/pred_all_models_train.csv').drop(columns = 'Unnamed: 0')

X = df.iloc[:, 1:].values

textRep = True
if textRep:
  embedding = 'skipgram' # or cbow
  X1_0 = np.load('gdrive/My Drive/'+embedding + '_train_neg_full_u.npy')
  X1_1 = np.load('gdrive/My Drive/'+embedding + '_train_pos_full_u.npy')
  X1 = np.concatenate((X1_0, X1_1))
  X = np.concatenate((X, X1), axis = 1)

y = df.iloc[:, 0].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)

Mounted at /content/gdrive


In [3]:
class trainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

xdim = X.shape[1]
del df, X, y
train_data = trainData(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
val_data = trainData(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
del X_train, X_val, y_train, y_val
BATCH_SIZE = 64
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, drop_last = True)  

In [12]:
class stage2clf(nn.Module):
    def __init__(self):
        super(stage2clf, self).__init__()
        self.hidden_layer_1 = nn.Linear(xdim, 64) 
        self.hidden_layer_2 = nn.Linear(64, 64)
        self.output_layer = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.1)
        self.batch_norm = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = inputs
        x = self.relu(self.hidden_layer_1(x))
        x = self.batch_norm(x)
        x = self.relu(self.hidden_layer_2(x))
        x = self.batch_norm(x)
        x = self.drop(x)
        x = self.output_layer(x)
        
        return x

class stage2clf2(nn.Module):
    def __init__(self):
        super(stage2clf2, self).__init__()
        self.hidden_layer_1 = nn.Linear(xdim, 128) 
        self.hidden_layer_2 = nn.Linear(128, 64)
        self.output_layer = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.1)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.batch_norm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = inputs
        x = self.relu(self.hidden_layer_1(x))
        x = self.batch_norm1(x)
        x = self.relu(self.hidden_layer_2(x))
        x = self.batch_norm2(x)
        x = self.drop(x)
        x = self.output_layer(x)
        
        return x

In [13]:
model = stage2clf2()
model.to(device)

criterion = nn.BCEWithLogitsLoss()
LEARNING_RATE = 0.001
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [14]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc

In [15]:
def validation_stats(network, loader):

    acc = []
    with torch.no_grad():
        for x, y in loader:
          x, y = x.to(device), y.to(device)
          y_pred = network(x)
          acc.append(binary_acc(y_pred, y.unsqueeze(1)))

    acc = torch.Tensor(acc)
    return acc.mean()

In [16]:
EPOCHS = 10


model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    val_acc = validation_stats(model, val_loader)
    print('Epoch {:d}.\tLoss: {:.5f}\tAccuracy: {:.3f}% (train) / {:.3f}% (val)'.format(e, 100*epoch_loss/len(train_loader), epoch_acc/len(train_loader), val_acc))
    scheduler.step()


Epoch 1.	Loss: 21.64399	Accuracy: 91.372% (train) / 91.531% (val)
Epoch 2.	Loss: 21.01288	Accuracy: 91.588% (train) / 91.573% (val)
Epoch 3.	Loss: 20.91626	Accuracy: 91.612% (train) / 91.627% (val)
Epoch 4.	Loss: 20.85550	Accuracy: 91.626% (train) / 91.637% (val)
Epoch 5.	Loss: 20.82002	Accuracy: 91.644% (train) / 91.621% (val)
Epoch 6.	Loss: 20.77931	Accuracy: 91.663% (train) / 91.596% (val)
Epoch 7.	Loss: 20.76094	Accuracy: 91.668% (train) / 91.637% (val)
Epoch 8.	Loss: 20.72778	Accuracy: 91.667% (train) / 91.661% (val)
Epoch 9.	Loss: 20.70417	Accuracy: 91.696% (train) / 91.656% (val)
Epoch 10.	Loss: 20.68176	Accuracy: 91.691% (train) / 91.622% (val)


In [19]:
X_t = pd.read_csv('gdrive/My Drive/pred_all_models_test.csv').drop(columns = 'Unnamed: 0')

if textRep:
  X1 = np.load('gdrive/My Drive/'+ embedding + '_test.npy')
  X_t = np.concatenate((X_t, X1), axis = 1)
X_t = scaler.transform(X_t)

In [20]:
class testData(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_t))
test_loader = DataLoader(dataset=test_data, batch_size=10, shuffle=False, drop_last=False)

In [23]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        pred = torch.round(torch.sigmoid(model(X_batch)))
        y_pred_list.append(pred.cpu().numpy())

y_pred = [a.squeeze().tolist() for a in y_pred_list]
predictions = []
for row in y_pred:
  predictions += row
predictions = np.array(predictions)
preds = pd.DataFrame((2*predictions-1).astype(int), columns = ['Prediction'], index = np.arange(1, len(predictions)+1))
preds.index.names = ['Id']
preds.to_csv('stage2_nn.csv')