In [3]:
import torch
import pandas as pd
import re
import csv
import numpy as np
import torchtext
from collections import defaultdict
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from nltk import sent_tokenize,word_tokenize
from torchtext import data,vocab
from tqdm.notebook import tqdm, tqdm_notebook,tnrange
from sklearn.metrics import accuracy_score

In [1]:
def tokenize(s):
    return word_tokenize(s)

In [4]:
txt_field = data.Field(sequential = True,tokenize = tokenize,include_lengths = False, use_vocab = True)
label_field = data.Field(sequential = False,use_vocab=False,pad_token=None,unk_token=None)
train_val_fields = [('EssayText',txt_field),('Personality',label_field)]

In [5]:
train,val = data.TabularDataset.splits(path = '.',format = 'csv',train='train2.csv',validation = 'test2.csv',fields = train_val_fields)

In [8]:
vec = vocab.Vectors('glove.6B.100d.txt','/home/sanchit/Desktop/DetectSarcasmUsingCNN/GloVe')

In [9]:
txt_field.build_vocab(train,val,max_size = 100000,vectors = vec)

In [10]:
label_field.build_vocab(train)
print(txt_field.vocab.vectors.shape)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.Size([32563, 100])


In [11]:
traindl,valdl = data.BucketIterator.splits(datasets=(train,val),batch_size = 16,device = device)

In [12]:
class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X,y)

In [13]:
train_batch_it = BatchGenerator(traindl, 'EssayText', 'Personality')

In [15]:
class CNN1d(nn.Module):
    def __init__(self,vocab_size,embedding_dm,pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx = pad_idx)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1,out_channels = 50,kernel_size = (1,fs)) for fs in (3,4,5)])
        self.conv2 = nn.Conv2d(in_channels = 50,out_channels = 100,kernel_size = (1,2))
        self.fc1 = nn.Linear(100000,150) #Change this 
        self.fc2 = nn.Linear(150,1)
        self.dropout = nn.Dropout(0.5)
    def forward(self,text):
        print("text",text.size())
        embedded = self.embedding(text.T)
        embedded = embedded.permute(0, 2,1)
        print("embedded",embedded.size())
        x=embedded.size(2)
        y=3000-x
        print(y)
        batch_size=embedded.size(0)
        z=np.zeros((batch_size,100,y))
        z1=torch.from_numpy(z).float()
        lz=[embedded,z1]
        #print(type(lz))
        zcat = torch.cat(lz, dim = 2)
        print("zcat",zcat.size())
        zcat2=zcat.reshape([batch_size, 1, 100, 3000])
        print("zcat2",zcat2.size())
#         embedded = embedded.reshape([embedded.shape[0],1,])
        print(embedded.size(),"embedding")
        conved = [F.relu(conv(zcat2)) for conv in self.convs]
        pooled = [F.max_pool2d(conv,(1,2)) for conv in conved]
        print("Pool")
        for pl in pooled:
            print(pl.size())
        cat = torch.cat(pooled,dim = 2)
        print("cat",cat.size())
        conved2 = F.relu(self.conv2(cat))
        print("conved2",conved2.size())
        pooled2 = F.max_pool2d(conved2,(1,2))
        print(pooled2.size(),"pooled2")
        return 0
#         return pooled2

In [16]:
input_dim = len(txt_field.vocab)
embedding_dim = 100
pad_idx = txt_field.vocab.stoi[txt_field.pad_token]
model = CNN1d(input_dim,embedding_dim,pad_idx)

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')



The model has 18,267,451 trainable parameters


In [18]:
pretrained_embeddings = txt_field.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.2837, -0.6263, -0.4435,  ...,  0.4368, -0.8261, -0.1570],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0465,  0.6197,  0.5665,  ..., -0.3762, -0.0325,  0.8062],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [19]:
unk_idx = txt_field.vocab.stoi[txt_field.unk_token]
model.embedding.weight.data[unk_idx] = torch.zeros(100)
model.embedding.weight.data[pad_idx] = torch.zeros(100)

In [20]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [21]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [22]:
def train(model,iterator,optimizer,criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.EssayText).squeeze(1)
        loss = criterion(predictions,batch.Personality)
        acc = binary_accuracy(predictions,batch.label)
        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss/len(iterator),epoch_acc/len(iterator)

In [23]:
def evaluate(model,iterator,criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.EssayText).squeeze(1)
            loss = criterion(predictions,batch.Personality)
            acc = binary_accuracy(predictions,batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss/len(iterator),epoch_acc/len(iterator)        

In [24]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, traindl, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valdl, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



text torch.Size([1102, 16])
embedded torch.Size([16, 100, 1102])
1898
zcat torch.Size([16, 100, 3000])
zcat2 torch.Size([16, 1, 100, 3000])
torch.Size([16, 100, 1102]) embedding


In [None]:
# def fit(model, train_dl, val_dl, loss_fn, opt, epochs=3):
#     num_batch = len(train_dl)
#     for epoch in tnrange(epochs):      
#         y_true_train = list()
#         y_pred_train = list()
#         total_loss_train = 0          
        
#         t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
#         for (X,lengths),y in t:
#             t.set_description(f'Epoch {epoch}')
#             lengths = lengths.cpu().numpy()
            
#             opt.zero_grad()
#             pred = model(X, lengths)
#             loss = loss_fn(pred, y)
#             loss.backward()
#             opt.step()
            
#             t.set_postfix(loss=loss.item())
#             pred_idx = torch.max(pred, dim=1)[1]
            
#             y_true_train += list(y.cpu().data.numpy())
#             y_pred_train += list(pred_idx.cpu().data.numpy())
#             total_loss_train += loss.item()
            
#         train_acc = accuracy_score(y_true_train, y_pred_train)
#         train_loss = total_loss_train/len(train_dl)
        
#         if val_dl:
#             y_true_val = list()
#             y_pred_val = list()
#             total_loss_val = 0
#             for (X,lengths),y in tqdm_notebook(val_dl, leave=False):
#                 pred = model(X, lengths.cpu().numpy())
#                 loss = loss_fn(pred, y)
#                 pred_idx = torch.max(pred, 1)[1]
#                 y_true_val += list(y.cpu().data.numpy())
#                 y_pred_val += list(pred_idx.cpu().data.numpy())
#                 total_loss_val += loss.item()
#             valacc = accuracy_score(y_true_val, y_pred_val)
#             valloss = total_loss_val/len(valdl)
#             print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {valloss:.4f} val_acc: {valacc:.4f}')
#         else:
#             print(f'Epoch {epoch}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f}')


In [None]:
val_batch_it = BatchGenerator(valdl, 'EssayText', 'Personality')

In [None]:
fit(model,train_batch_it,val_batch_it,F.nll_loss,optimizer,1)