In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
#Reproducing same results
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

#handling text data
from torchtext import data 

In [43]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [44]:
from torchtext.data.utils import ngrams_iterator
from nltk.corpus import stopwords

In [45]:
def text_preprocessing(text_list):
    clean_text=" ".join(text_list)
    clean_text = clean_text.replace("[^a-zA-Z0-9#]", " ")
    clean_text = clean_text.replace('#', 'hashtag ')
    clean_text = tknzr.tokenize(clean_text)
    return clean_text
    

In [29]:
TEXT = data.Field(tokenize="spacy",
                  batch_first=True,
                  include_lengths=True,
                  lower=True,
                  stop_words=set(stopwords.words('english')),
                  preprocessing=text_preprocessing)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

In [30]:
fields = [(None, None),('label', LABEL) ,('text',TEXT)]

In [31]:
# loading custom train dataset
training_data = data.TabularDataset(path='./data/train_E6oV3lV.csv',
                                    format='csv',
                                    fields=fields,
                                    skip_header=True)

# print preprocessed text
print(vars(training_data.examples[0]))

{'label': '0', 'text': ['@user', 'father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', '.', 'hashtag', 'run']}


In [32]:
import random
train_data, valid_data = training_data.split(split_ratio=0.9,stratified=True,random_state = random.seed(SEED))

In [33]:
# initialize glove embeddings
TEXT.build_vocab(train_data, min_freq=1, vectors="glove.6B.300d")
LABEL.build_vocab(train_data)

# No. of unique tokens in text
print("Size of TEXT vocabulary:", len(TEXT.vocab))

# No. of unique tokens in label
print("Size of LABEL vocabulary:", len(LABEL.vocab))

# Commonly used words
print(TEXT.vocab.freqs.most_common(10))


Size of TEXT vocabulary: 39862
Size of LABEL vocabulary: 2
[('hashtag', 67699), ('@user', 15835), ('\x9f', 15472), ('ð', 13644), ('!', 13187), ('.', 11048), ('\x98', 6569), ('\x80', 5959), (',', 5672), ("'", 5531)]


In [34]:
# check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# set batch size
BATCH_SIZE = 16

# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

In [35]:
import torch.nn as nn


class classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout):

        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        # text = [batch size,sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,
                                                            text_lengths,
                                                            batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [batch size, num layers * num directions,hid dim]
        # cell = [batch size, num layers * num directions,hid dim]
        # concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)
        # Final activation function
        outputs = self.act(dense_outputs)
        return outputs

In [36]:
# define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.5

# instantiate the model
model = classifier(size_of_vocab,
                   embedding_dim,
                   num_hidden_nodes,
                   num_output_nodes,
                   num_layers,
                   bidirectional=True,
                   dropout=dropout)

In [37]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)


classifier(
  (embedding): Embedding(39862, 300)
  (lstm): LSTM(300, 32, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 12,069,257 trainable parameters
torch.Size([39862, 300])


In [38]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)


# define metric
def binary_accuracy(preds, y):
    # round predictions to the closest integer
    rounded_preds = torch.round(preds)

    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [39]:
def train(model, iterator, optimizer, criterion, scheduler):

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # set the model in training phase
    model.train()

    for batch in iterator:

        # resets the gradients after every batch
        optimizer.zero_grad()

        # retrieve text and no. of words
        text, text_lengths = batch.text

        # convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()

        # compute the loss
        loss = criterion(predictions, batch.label)

        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)

        # backpropage the loss and compute the gradients
        loss.backward()

        # update the weights
        optimizer.step()

        #loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    scheduler.step()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [40]:
def evaluate(model, iterator, criterion):

    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()

    # deactivates autograd
    with torch.no_grad():

        for batch in iterator:

            # retrieve text and no. of words
            text, text_lengths = batch.text

            # convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()

            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [41]:
N_EPOCHS = 50
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion,
                                  scheduler)

    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    print('Epoch: {}\n'.format(epoch))
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [105]:
# inference
import spacy
nlp = spacy.load('en')


def predict(model, sentence):
    tokenized = [tok.text
                 for tok in nlp.tokenizer(sentence)]  # tokenize the sentence
    indexed = [TEXT.vocab.stoi[t]
               for t in tokenized]  # convert to integer sequence
    length = [len(indexed)]  # compute no. of words
    tensor = torch.LongTensor(indexed).to(device)  # convert to tensor
    tensor = tensor.unsqueeze(1).T  # reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)  # convert to tensor
    prediction = model(tensor, length_tensor)  # prediction
    return prediction.item()

In [106]:
import pandas as pd
test = pd.read_csv('./data/test_tweets_anuFYb8.csv')

In [107]:
test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [108]:
prediction = []
ids = []
for row in tqdm(test.iterrows()):
    ids.append(row[1]['id'])
    prediction.append(predict(model, row[1]['tweet']))

17197it [02:04, 137.79it/s]


In [109]:
labels = []
for pred in prediction:
    if pred > 0.5:
        labels.append(1)
    else:
        labels.append(0)

In [110]:
submission = pd.DataFrame(data={'id':ids,'label':labels})

In [111]:
submission.to_csv('sub.csv',index=False)