In [1]:
import os
# Debugging aid: ask CUDA to run kernel launches synchronously so that a
# device-side error is reported at the call that caused it.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [2]:
# Load the pre-tokenized corpus and its labels, one sample per line.
# The context managers guarantee both handles are closed, even when a label
# line fails to parse as an integer.
with open('data/SpamHam/tokenized_corpus.csv') as tokens_f:
    # Each line (trailing newline included) is one document for the vectorizer.
    features = list(tokens_f)

with open('data/SpamHam/labels.csv') as labels_f:
    labels = [int(label) for label in labels_f]

# Features and labels are paired by position, so a length mismatch would
# silently mislabel samples further down the notebook.
if len(features) != len(labels):
    raise ValueError(
        f"corpus has {len(features)} rows but labels has {len(labels)} rows"
    )

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer()
X = count_vec.fit_transform(features)
X = X.toarray()

In [4]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

class SpamHamDataset(Dataset):
    """Train or test view over the vectorized corpus.

    The last 20% of the rows (rounded down) form the test split and the
    remaining leading rows form the training split. Rows are not shuffled
    before splitting.

    Args:
        test_set: If True, expose the test split; otherwise the training split.
        features: Optional 2-D array-like with one feature row per sample.
            Defaults to the notebook-level ``X``.
        targets: Optional 1-D array-like of labels aligned with ``features``.
            Defaults to the notebook-level ``labels``.
    """

    def __init__(self, test_set=False, features=None, targets=None):
        # np.asarray reuses an existing ndarray instead of duplicating the
        # dense matrix for every dataset instance.
        self.features = np.asarray(X if features is None else features)
        self.labels = np.asarray(labels if targets is None else targets)

        self.test_set = test_set
        n_samples = self.features.shape[0]
        self.test_size = int(n_samples * 0.2)
        self.train_size = int(n_samples - self.test_size)

    def __len__(self):
        """Return the number of samples in the selected split."""
        return self.test_size if self.test_set else self.train_size

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx`` within the selected split.

        Args:
            idx: An integer, a sequence of integers, or an integer tensor.
                Negative values count from the end of the split.

        Raises:
            TypeError: If ``idx`` is not integer-valued.
            IndexError: If ``idx`` falls outside the selected split.
        """
        # Convert first so the caller's tensor is never modified in place.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        positions = np.asarray(idx)
        if positions.size == 0:
            # An empty selection has no meaningful dtype; treat it as integer.
            positions = positions.astype(np.intp)
        if positions.dtype.kind not in "iu":
            raise TypeError("dataset indices must be integers")
        positions = positions.astype(np.int64)

        # Bounds are checked against the split, not the full array, so the
        # training view can never hand out rows that belong to the test view.
        size = len(self)
        if np.any((positions < -size) | (positions >= size)):
            raise IndexError(f"index out of range for split of size {size}")
        positions = np.where(positions < 0, positions + size, positions)

        if self.test_set:
            # The test rows are stored after the training rows.
            positions = positions + self.train_size

        if positions.ndim == 0:
            positions = int(positions)
        return (self.features[positions], self.labels[positions])
    
# Build the two dataset views (train is the default split) and wrap each in a
# DataLoader that yields shuffled mini-batches of 64 samples.
trainSet = SpamHamDataset()
testSet = SpamHamDataset(test_set=True)

trainset = DataLoader(trainSet, shuffle=True, batch_size=64)
testset = DataLoader(testSet, shuffle=True, batch_size=64)

In [5]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Net(nn.Module):
    """Two-branch classifier whose branches are fused by a bilinear layer.

    Both branches read the same input vector: ``fc1`` projects it to 128
    units and ``fc2`` projects it to 2 units. After a ReLU on each branch,
    ``fc3`` (``nn.Bilinear``) combines the two activations into 2 scores.

    Args:
        vocab_size: Number of input features per sample.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Keep the creation order fc1, fc2, fc3 so seeded initialisation
        # draws the same random numbers for each layer.
        self.fc1 = nn.Linear(in_features=vocab_size, out_features=128)
        self.fc2 = nn.Linear(in_features=vocab_size, out_features=2)

        self.fc3 = nn.Bilinear(in1_features=128, in2_features=2, out_features=2)

    def forward(self, x):
        """Return the 2 output scores for each row of ``x``."""
        wide_branch = F.relu(self.fc1(x))
        narrow_branch = F.relu(self.fc2(x))
        return self.fc3(wide_branch, narrow_branch)

# The input width is the length of one training feature row.
net = Net(vocab_size=trainSet[0][0].shape[0])
print(net)

Net(
  (fc1): Linear(in_features=9045, out_features=128, bias=True)
  (fc2): Linear(in_features=9045, out_features=2, bias=True)
  (fc3): Bilinear(in1_features=128, in2_features=2, out_features=2, bias=True)
)


In [6]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [7]:
# Start from a freshly initialised model placed on the selected device, and
# pair it with a cross-entropy loss and an Adam optimizer.
net = Net(vocab_size=trainSet[0][0].shape[0]).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)

In [8]:
from tqdm.notebook import tqdm

def test():
    """Evaluate ``net`` on ``testset`` and print the classification accuracy.

    The model is switched to evaluation mode for the duration of the pass and
    then returned to whichever mode it was in, so calling this in the middle
    of training does not leave the model in eval mode.

    Returns:
        The fraction of correctly classified samples, or NaN when the loader
        yields no samples.
    """
    was_training = net.training
    net.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for batch_features, batch_labels in testset:
                output = net(batch_features.float().to(device))
                # Compare the whole batch at once instead of row by row.
                predictions = output.argmax(dim=1)
                correct += (predictions == batch_labels.to(device)).sum().item()
                total += output.shape[0]
    finally:
        net.train(was_training)

    # Avoid ZeroDivisionError when the loader is empty.
    accuracy = correct / total if total else float("nan")
    print("Accuracy: ", round(accuracy, 5))
    return accuracy

# Train for a fixed number of epochs, reporting the loss of the final
# mini-batch and the test accuracy after each one.
epochs = 10
for epoch in tqdm(range(epochs)):
    # test() puts the model in eval mode, so re-enter training mode at the
    # start of every epoch rather than once before the loop.
    net.train()
    # Batch variables deliberately avoid the name ``X``: this code runs at
    # notebook top level, and rebinding ``X`` would replace the full feature
    # matrix that the dataset cell reads.
    for batch_features, batch_labels in tqdm(trainset):
        optimizer.zero_grad()
        output = net(batch_features.float().to(device))
        loss = criterion(output, batch_labels.long().to(device))

        loss.backward()
        optimizer.step()
    print(f"Epochs: {epoch} - loss: {loss}")
    test()

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 0 - loss: 0.06796116381883621
Accuracy:  0.97487


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 1 - loss: 0.02038918435573578
Accuracy:  0.98833


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 2 - loss: 0.0013898923061788082
Accuracy:  0.98564


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 3 - loss: 6.9487898144871e-05
Accuracy:  0.98743


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 4 - loss: 8.956913006841205e-06
Accuracy:  0.98743


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 5 - loss: 0.00010171638859901577
Accuracy:  0.98743


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 6 - loss: 4.294208792998688e-06
Accuracy:  0.98743


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 7 - loss: 3.405979143167315e-08
Accuracy:  0.98743


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 8 - loss: 9.990783382818336e-07
Accuracy:  0.98564


HBox(children=(FloatProgress(value=0.0, max=70.0), HTML(value='')))


Epochs: 9 - loss: 3.122146452483321e-08
Accuracy:  0.98474

