In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [48]:
from google.colab import drive 
drive.mount('/content/gdrive')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('gdrive/My Drive/pred_all_models_train.csv').drop(columns = 'Unnamed: 0')

X = df.iloc[:, 1:].values

textRep = False
if textRep:
  model = 'skipgram' # or cbow
  X1_0 = np.load('gdrive/My Drive/'+model + '_train_neg_full_u.npy')
  X1_1 = np.load('gdrive/My Drive/'+model + '_train_pos_full_u.npy')
  X1 = np.concatenate((X1_0, X1_1))
  X = np.concatenate((X, X1), axis = 1)

y = df.iloc[:, 0].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [61]:
class trainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
val_data = trainData(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
BATCH_SIZE = 64
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loader = DataLoader(dataset=val_data, batch_size=64)  

In [141]:
class stage2clf(nn.Module):
    def __init__(self):
        super(stage2clf, self).__init__()
        self.hidden_layer_1 = nn.Linear(X.shape[1], 64) 
        self.hidden_layer_2 = nn.Linear(64, 64)
        self.output_layer = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=0.1)
        self.batch_norm = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = inputs
        x = self.relu(self.hidden_layer_1(x))
        x = self.batch_norm(x)
        x = self.relu(self.hidden_layer_2(x))
        x = self.batch_norm(x)
        x = self.drop(x)
        x = self.output_layer(x)
        
        return x

In [142]:
model = stage2clf()
model.to(device)

criterion = nn.BCEWithLogitsLoss()
LEARNING_RATE = 0.001
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [143]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    return acc

In [144]:
def validation_stats(network, loader):

    acc = []
    with torch.no_grad():
        for x, y in loader:
          x, y = x.to(device), y.to(device)
          y_pred = network(x)
          acc.append(binary_acc(y_pred, y.unsqueeze(1)))

    acc = torch.Tensor(acc)
    return acc.mean()

In [145]:
EPOCHS = 20


model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    val_acc = validation_stats(model, val_loader)
    print('Epoch {:d}.\tLoss: {:.5f}\tAccuracy: {:.3f}% (train) / {:.3f}% (val)'.format(e, 100*epoch_loss/len(train_loader), epoch_acc/len(train_loader), val_acc))
    scheduler.step()


Epoch 1.	Loss: 21.55328	Accuracy: 91.394% (train) / 91.385% (val)
Epoch 2.	Loss: 20.95680	Accuracy: 91.601% (train) / 91.490% (val)
Epoch 3.	Loss: 20.88812	Accuracy: 91.611% (train) / 91.488% (val)
Epoch 4.	Loss: 20.88006	Accuracy: 91.611% (train) / 91.492% (val)
Epoch 5.	Loss: 20.88742	Accuracy: 91.605% (train) / 91.479% (val)
Epoch 6.	Loss: 20.88358	Accuracy: 91.615% (train) / 91.480% (val)
Epoch 7.	Loss: 20.87089	Accuracy: 91.619% (train) / 91.487% (val)
Epoch 8.	Loss: 20.88478	Accuracy: 91.610% (train) / 91.480% (val)
Epoch 9.	Loss: 20.88127	Accuracy: 91.614% (train) / 91.493% (val)
Epoch 10.	Loss: 20.88281	Accuracy: 91.613% (train) / 91.494% (val)
Epoch 11.	Loss: 20.88625	Accuracy: 91.614% (train) / 91.481% (val)
Epoch 12.	Loss: 20.87293	Accuracy: 91.618% (train) / 91.485% (val)
Epoch 13.	Loss: 20.88314	Accuracy: 91.612% (train) / 91.476% (val)
Epoch 14.	Loss: 20.88719	Accuracy: 91.613% (train) / 91.477% (val)
Epoch 15.	Loss: 20.88171	Accuracy: 91.617% (train) / 91.476% (val)
Epoc

In [149]:
X_t = pd.read_csv('gdrive/My Drive/pred_all_models_test.csv').drop(columns = 'Unnamed: 0')

textRep = False
if textRep:
  model = 'skipgram' # or cbow
  X1_0 = np.load('gdrive/My Drive/'+model + '_test_neg_full_u.npy')
  X1_1 = np.load('gdrive/My Drive/'+model + '_test_pos_full_u.npy')
  X1 = np.concatenate((X1_0, X1_1))
  X_t = np.concatenate((X, X1), axis = 1)
X_t = scaler.fit_transform(X_t)

In [172]:
class testData(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_t))
test_loader = DataLoader(dataset=test_data, batch_size=10000, shuffle=False, drop_last=False)

In [176]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        pred = torch.round(torch.sigmoid(model(X_batch)))
        y_pred_list.append(pred.cpu().numpy())

predictions = (np.array([a.squeeze().tolist() for a in y_pred_list][0])*2 - 1).astype(int)
predictions

array([-1, -1,  1, ..., -1,  1, -1])