In [9]:
#deal with tensors
import torch   
import random
import gensim.downloader as api
from tqdm import tqdm
#handling text data
import pickle
from torchtext import data 

In [2]:

#Reproducing same results
SEED = 2019

#Python RNG: seeded here as well, so reproducibility does not rely on a later cell doing it
random.seed(SEED)

#Torch (kept as the last expression so the cell still displays the generator)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fbb66c62590>

In [3]:
#text field: spaCy tokenisation, batch-first tensors, and lengths returned alongside the token ids
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)

#label field: labels are stored as float tensors
LABEL = data.LabelField(
    dtype=torch.float,
    batch_first=True,
)



In [4]:
#(name, field) pairs handed to TabularDataset: label is listed first, text second
#NOTE(review): presumably this must mirror the CSV column order - confirm against the file
fields = [
    ('label', LABEL),
    ('text', TEXT),
]

In [5]:
#loading custom dataset (the header row is skipped; columns are parsed through `fields`)
training_data = data.TabularDataset(
    path='training_dataset_1.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

#print preprocessed text
# print(vars(training_data.examples[0]))



In [6]:
#random.seed() returns None, so seed the global RNG first and hand its state to split() explicitly
random.seed(SEED)
train_data, valid_data = training_data.split(split_ratio=0.7, random_state=random.getstate())

In [7]:
#initialize glove embeddings
#(vocabulary is built from the training split only, with a minimum token frequency of 3)
TEXT.build_vocab(
    train_data,
    min_freq=3,
    vectors="glove.6B.100d",
)
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(5))

# #Word dictionary
# print(TEXT.vocab.stoi)

Size of TEXT vocabulary: 46060
Size of LABEL vocabulary: 2
[('?', 208526), ('the', 186909), (',', 95037), ('of', 93937), ('What', 79355)]


In [10]:
#persist the fitted TEXT field to disk;
#the context manager closes the file even if pickling raises
with open('TEXT.pkl', 'wb') as dbfile:
    pickle.dump(TEXT, dbfile)

In [11]:
#check whether cuda is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#set batch size
BATCH_SIZE = 64

#Load an iterator
#examples are bucketed by token count and each batch is sorted internally by the same key
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)



In [12]:
import torch.nn as nn

class classifier(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier over padded token-id batches.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output units of the dense layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability between stacked LSTM layers.
    """
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        num_directions = 2 if bidirectional else 1
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        #inter-layer dropout only exists with more than one layer; passing it for a
        #single layer has no effect and only triggers a warning, so it is zeroed there
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bool(bidirectional), 
                           dropout=dropout if n_layers > 1 else 0.0,
                           batch_first=True)
        
        #dense layer: input width follows the number of directions actually used
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape [batch size, output_dim].

        Args:
            text: token ids, [batch size, sent_length].
            text_lengths: unpadded length of each row of ``text``.
        """
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        #pack_padded_sequence needs the lengths on the CPU even when the batch is on the GPU
        lengths = torch.as_tensor(text_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        #(batch_first only affects the input/output tensors, not the final states)
        
        if self.lstm.bidirectional:
            #concat the final forward and backward hidden state of the top layer
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #single direction: the top layer's final state is the last entry
            hidden = hidden[-1,:,:]
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs


In [13]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
#embedding width; the pretrained vectors copied into the model later must have this width
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
#(pass the `bidirection` setting through instead of repeating a hard-coded True)
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

In [14]:
#architecture: print the module tree of the instantiated model
print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total element count over the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
#copy_ broadcasts its source, so check the shapes explicitly rather than risk a silent mismatch
if pretrained_embeddings.shape != model.embedding.weight.shape:
    raise ValueError(
        f'pretrained vectors {tuple(pretrained_embeddings.shape)} do not match '
        f'embedding table {tuple(model.embedding.weight.shape)}'
    )
#in-place copy under no_grad instead of going through the legacy `.data` attribute
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(46060, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 4,665,457 trainable parameters
torch.Size([46060, 100])


In [15]:
import torch.optim as optim

#define optimizer and loss
#Adam is constructed with no arguments beyond the model parameters
optimizer = optim.Adam(model.parameters())
#binary cross-entropy on probabilities; the model applies a Sigmoid to its outputs
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    """Fraction of predictions that equal the targets after rounding.

    Args:
        preds: tensor of predicted probabilities.
        y: tensor of targets, same shape as ``preds``.

    Returns:
        A 0-dim float tensor holding the share of matching elements.
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    #numel() rather than len(): len() raises on 0-dim tensors (e.g. a squeezed batch of one)
    acc = correct.sum() / correct.numel()
    return acc
    
#push to cuda if available
#(`device` was set to 'cpu' when CUDA is not available)
model = model.to(device)
criterion = criterion.to(device)

In [16]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: yields batches exposing ``.text`` (a ``(ids, lengths)`` pair) and ``.label``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(loss, accuracy)``, each the mean of the per-batch values.
    """
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in tqdm(iterator):
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        #squeeze only dim 1: a bare squeeze() would also drop the batch dim of a
        #one-example batch, leaving a 0-dim tensor that no longer matches the labels
        predictions = model(text, text_lengths).squeeze(1)  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropagate the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: yields batches exposing ``.text`` (a ``(ids, lengths)`` pair) and ``.label``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(loss, accuracy)``, each the mean of the per-batch values.
    """
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            #squeeze only dim 1: a bare squeeze() would also drop the batch dim of a
            #one-example batch, leaving a 0-dim tensor that no longer matches the labels
            predictions = model(text, text_lengths).squeeze(1)
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [18]:
#number of full passes over the training iterator
N_EPOCHS = 2
#lowest validation loss seen so far; starts at +inf so the first epoch always checkpoints
best_valid_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS)):
     
    #train the model (one pass over train_iterator, weights are updated)
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model on the validation iterator
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model: the checkpoint is overwritten only on a strict improvement in validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    #per-epoch summary; accuracies are fractions, shown here as percentages
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 1/5434 [00:00<52:56,  1.71it/s][A
  0%|          | 4/5434 [00:00<13:20,  6.78it/s][A
  0%|          | 6/5434 [00:00<10:40,  8.47it/s][A
  0%|          | 8/5434 [00:01<09:19,  9.70it/s][A
  0%|          | 10/5434 [00:01<08:02, 11.24it/s][A
  0%|          | 12/5434 [00:01<07:08, 12.65it/s][A
  0%|          | 14/5434 [00:01<07:43, 11.70it/s][A
  0%|          | 16/5434 [00:01<07:41, 11.75it/s][A
  0%|          | 18/5434 [00:01<07:20, 12.31it/s][A
  0%|          | 21/5434 [00:01<06:17, 14.33it/s][A
  0%|          | 23/5434 [00:02<06:26, 14.00it/s][A
  0%|          | 25/5434 [00:02<06:37, 13.61it/s][A
  0%|          | 27/5434 [00:02<06:12, 14.53it/s][A
  1%|          | 29/5434 [00:02<06:04, 14.84it/s][A
  1%|          | 31/5434 [00:02<06:13, 14.46it/s][A
  1%|          | 33/5434 [00:02<06:39, 13.50it/s][A
  1%|          | 35/5434 [00:02<06:16, 14.36it/s][A
  1%|          | 37/5434 [00:03<05:50, 15.38it/s][A
  1%|      

	Train Loss: 0.024 | Train Acc: 99.18%
	 Val. Loss: 0.008 |  Val. Acc: 99.78%



  0%|          | 1/5434 [00:00<52:06,  1.74it/s][A
  0%|          | 3/5434 [00:00<18:00,  5.02it/s][A
  0%|          | 4/5434 [00:00<16:16,  5.56it/s][A
  0%|          | 6/5434 [00:01<11:35,  7.80it/s][A
  0%|          | 8/5434 [00:01<08:57, 10.09it/s][A
  0%|          | 10/5434 [00:01<07:31, 12.00it/s][A
  0%|          | 12/5434 [00:01<08:18, 10.88it/s][A
  0%|          | 14/5434 [00:01<07:52, 11.47it/s][A
  0%|          | 16/5434 [00:01<07:31, 12.00it/s][A
  0%|          | 18/5434 [00:01<06:52, 13.13it/s][A
  0%|          | 20/5434 [00:02<06:58, 12.92it/s][A
  0%|          | 22/5434 [00:02<08:53, 10.14it/s][A
  0%|          | 24/5434 [00:02<09:19,  9.67it/s][A
  0%|          | 26/5434 [00:02<09:19,  9.67it/s][A
  1%|          | 28/5434 [00:02<08:14, 10.94it/s][A
  1%|          | 30/5434 [00:03<08:55, 10.08it/s][A
  1%|          | 32/5434 [00:03<08:41, 10.35it/s][A
  1%|          | 34/5434 [00:03<08:48, 10.22it/s][A
  1%|          | 36/5434 [00:03<08:39, 10.39it/s]

	Train Loss: 0.006 | Train Acc: 99.84%
	 Val. Loss: 0.007 |  Val. Acc: 99.81%





In [20]:
#load weights
path='saved_weights.pt'
#map_location lets a checkpoint written on a GPU be restored on a CPU-only machine
model.load_state_dict(torch.load(path, map_location=device));
#switch to inference mode (disables dropout)
model.eval();

#inference 
import spacy
#NOTE(review): the 'en' shortcut name may not resolve on newer spaCy releases - confirm against the installed version
nlp = spacy.load('en')

def predict(model, sentence):
    """Return the model's output for one raw sentence as a Python float.

    Args:
        model: module called as ``model(ids, lengths)`` and returning a single-element tensor.
        sentence: raw text; tokenized with the module-level ``nlp`` tokenizer and
            mapped to ids through ``TEXT.vocab.stoi``.

    Raises:
        ValueError: if the sentence produces no tokens (nothing to feed the model).
    """
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    if not tokenized:
        #a zero-length sequence cannot be packed; fail with a clear message instead
        raise ValueError("cannot classify a sentence with no tokens")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(0)                               #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #lengths stay on the CPU
    with torch.no_grad():                                      #inference only: no autograd graph needed
        prediction = model(tensor, length_tensor)              #prediction 
    return prediction.item()                                   

In [21]:
import pandas as pd
#annotated evaluation set; later cells read its 'Text' and 'Category' columns
lines_df = pd.read_csv('output_1_annotated.csv')  

In [22]:
#probability cut-off above which a line is labelled ANSWER
ANSWER_THRESHOLD = 0.4

preds = []
corrects = []
texts = []
#only two columns are needed, so iterate over them directly instead of building a Series per row
for text, category in zip(lines_df['Text'], lines_df['Category']):
    #the annotation's STATEMENT class is folded into ANSWER
    if category == "STATEMENT":
        category = "ANSWER"
    #NOTE(review): assumes outputs near 1 correspond to ANSWER - confirm against LABEL.vocab.stoi
    pred = "ANSWER" if predict(model, text) > ANSWER_THRESHOLD else "QUESTION"
    preds.append(pred)
    corrects.append(category)
    texts.append(text)

In [23]:
#(index, gold label, predicted label) for every QUESTION that was predicted as QUESTION
correct_question = [
    (idx, gold, guess)
    for idx, (gold, guess) in enumerate(zip(corrects, preds))
    if gold == 'QUESTION' and guess == 'QUESTION'
]
#the matching source texts, displayed as the cell output
correct_question_text = [texts[idx] for idx, _gold, _guess in correct_question]
correct_question_text

['Tony - how would you rewrite?',
 'What you have built with it so far? We are in the process of using it for a flask NLP app',
 "Officially nominated by my Manager to be a DevOps Champion! Being with SAP for over a year now, I'm excited to try on a new hat apart from the developer hat. Having very little experience with it, this is going to be exciting and challenging. Any suggestions on where I can start?",
 'That’s a lot of sales. Does that include the phased sales that the companies did through the month of Nov?',
 'With this release, we hope you enjoy seeing content preview of links shared in our app! Do you spot an issue or have an idea? Come collaborate with us to improve! ',
 "Whats 'phased sales'? Sales prior to thanksgiving friday?",
 'Did you have to write code for the take home or just paper design?',
 'Time to learn more infra concepts. Does anyone want to join me to read https://www.oreilly.com/library/view/terraform-up/9781492046899/ and share learnings as we read? Put a

In [24]:
#(index, gold label, predicted label) for every QUESTION that was predicted as ANSWER
incorrect_question = [
    (idx, gold, guess)
    for idx, (gold, guess) in enumerate(zip(corrects, preds))
    if gold == 'QUESTION' and guess == 'ANSWER'
]
#the matching source texts, displayed as the cell output
incorrect_question_text = [texts[idx] for idx, _gold, _guess in incorrect_question]
incorrect_question_text

['I noticed that your news letters go to promotions/updates section of my Gmail. Mostly in promotions. Most of the people do not open the promotions of the Gmail. If there is a way to remove them from promotions then that might be helpful.',
 "It's in my backlog :). I bought some courses on Unreal Engine and Unity, but then a friend who works in the game industry introduced me to Godot and got me interested. Hopefully this year I can set some time to do something simple in 2d",
 'Oh and quite important....we have a brand new login screen. Do let us know if you align to our messaging there.',
 'Thank you so much! If you have any other framework suggestions to learn on top of React, please let me know about them!',
 'Setting Padding on each side individually fixed the issue. Ready to move on but curious if someone else faced it too and can tell me I am doing something wrong fundamentally. \n',
 'React Native Question: We have many use cases where a Screen -> Component -> Sub Component. A

In [25]:
from sklearn import metrics
#one label order shared by both reports
label_order = ["QUESTION", "ANSWER"]
print(metrics.confusion_matrix(corrects, preds, labels=label_order))
print(metrics.classification_report(corrects, preds, labels=label_order))

[[ 79  27]
 [ 25 719]]
              precision    recall  f1-score   support

    QUESTION       0.76      0.75      0.75       106
      ANSWER       0.96      0.97      0.97       744

    accuracy                           0.94       850
   macro avg       0.86      0.86      0.86       850
weighted avg       0.94      0.94      0.94       850



In [None]:
import numpy as np
#use one label order for both the matrix and its axes: np.unique() sorts alphabetically
#(ANSWER, QUESTION), which would mislabel a matrix computed in (QUESTION, ANSWER) order
label_order = ["QUESTION", "ANSWER"]
#named `confusion` rather than `data`, so the `data` module imported from torchtext is not shadowed
confusion = metrics.confusion_matrix(corrects, preds, labels=label_order)
#rows are the annotated labels, columns the predicted ones (same argument order as the call above)
pd.DataFrame(confusion, columns=label_order, index=label_order)