<a href="https://colab.research.google.com/github/graviraja/100-Days-of-NLP/blob/applications%2Fclassification/applications/classification/sentiment_classification/Sentimix%20with%20XLM-Roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive inside the Colab VM; the dataset paths used by this notebook
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Locations of the two data splits on the mounted Drive.
# `test_file` points at the dev split, which this notebook uses as its held-out test set.
train_file = '/content/drive/My Drive/train_14k_split_conll.txt'
test_file = '/content/drive/My Drive/dev_3k_split_conll.txt'

In [5]:
# Third-party packages imported by the next cell (installed quietly with -q).
# NOTE(review): versions are not pinned, so a fresh run may pull newer, incompatible releases.
!pip install indic_transliteration -q
!pip install contractions -q
!pip install transformers -q

In [6]:
import re
import time
import string
import contractions
import numpy as np
import pandas as pd

from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Read both splits as raw lines. The encoding is given explicitly because the tweets
# contain non-ASCII text (accented letters, emoji), so relying on the platform default
# encoding is not safe outside a UTF-8 locale.
with open(train_file, 'r', encoding='utf-8') as f:
    data = f.readlines()

with open(test_file, 'r', encoding='utf-8') as f:
    test_data = f.readlines()

In [9]:
def parse_data(data):
    """Group tab-separated SentiMix lines into one record per tweet.

    Each line is stripped and split on TAB, then handled by field count:

    * 2 fields  -> a token line ``word<TAB>lang``; appended to the current tweet.
    * 1 field   -> blank/separator line; ignored.
    * otherwise -> a header line. The last field is taken as the sentiment and the
      second field as the integer uid (the first field is ignored). A 3-field
      header additionally closes the tweet collected so far.

    Unlike a check on the line index, the "close the previous tweet" step is keyed
    on whether a header has already been seen (or tokens are pending). A file that
    starts with blank lines therefore no longer yields a bogus empty record with
    sentiment ``""`` and uid ``0``.

    Args:
        data: iterable of raw text lines (for example the result of ``readlines()``).

    Returns:
        Tuple ``(sentences, sentences_info, sentiment, uids)`` of equal-length lists:
        token lists, per-token language tags, sentiment strings and integer uids.

    Raises:
        ValueError: if the uid field of a header line is not an integer.
    """
    sentences, sentences_info, sentiment, uids = [], [], [], []

    single_sentence, single_sentence_info = [], []
    sent = ""
    uid = 0
    header_seen = False

    for each_line in data:
        tokens = each_line.strip().split('\t')
        num_tokens = len(tokens)

        if num_tokens == 2:
            # token line: the word and its language tag
            single_sentence.append(tokens[0])
            single_sentence_info.append(tokens[1])
        elif num_tokens == 1:
            # blank separator line between tweets
            continue
        else:
            # Header line. Close the previous tweet when there is one: either a header
            # was already seen, or tokens appeared before any header (those keep the
            # placeholder sentiment "" / uid 0, as before).
            if num_tokens == 3 and (header_seen or single_sentence):
                sentences.append(single_sentence)
                sentences_info.append(single_sentence_info)
                sentiment.append(sent)
                uids.append(uid)
                single_sentence = []
                single_sentence_info = []
            sent = tokens[-1]
            uid = int(tokens[1])
            header_seen = True

    # The final tweet has no following header to close it.
    if len(single_sentence) > 0:
        sentences.append(single_sentence)
        sentences_info.append(single_sentence_info)
        sentiment.append(sent)
        uids.append(uid)

    assert len(sentences) == len(sentences_info) == len(sentiment) == len(uids)
    return sentences, sentences_info, sentiment, uids

In [10]:
# Parse the raw training lines into tokens, per-token language tags, sentiment strings and uids.
sentences, sentences_info, sentiment, uids = parse_data(data)

In [11]:
# Same parsing for the held-out split.
test_sentences, test_sentences_info, test_sentiment, test_uids = parse_data(test_data)

In [12]:
list(zip(sentences[0], sentences_info[0]))

[('nen', 'Eng'),
 ('á', 'O'),
 ('vist', 'Eng'),
 ('bolest', 'Eng'),
 ('vztek', 'Eng'),
 ('smutek', 'Eng'),
 ('zmatek', 'Hin'),
 ('osam', 'Hin'),
 ('ě', 'O'),
 ('lost', 'Eng'),
 ('beznad', 'Eng'),
 ('ě', 'O'),
 ('j', 'Hin'),
 ('a', 'Eng'),
 ('nakonec', 'Eng'),
 ('jen', 'Hin'),
 ('klid', 'Hin'),
 ('Asi', 'Hin'),
 ('takhle', 'Hin'),
 ('vypad', 'Hin'),
 ('á', 'O'),
 ('m', 'Hin'),
 ('ů', 'O'),
 ('j', 'Eng'),
 ('life', 'Eng'),
 ('...', 'O')]

In [13]:
# Sanity check of ITRANS -> Devanagari transliteration on a few romanised tokens.
# A dedicated variable is used so the raw training lines held in `data` are not
# overwritten, which keeps the earlier parsing cell re-runnable.
sample_text = "jen klid takhle vypad"
transliterate(sample_text, sanscript.ITRANS, sanscript.DEVANAGARI)

'जेन् क्लिद् तख्ले व्य्पद्'

In [14]:
def translate(sentences, sentences_info):
    """Transliterate the Hindi-tagged tokens of every sentence to Devanagari.

    Args:
        sentences: list of token lists.
        sentences_info: list of tag lists, parallel to ``sentences``.

    Returns:
        A new list of token lists. Tokens tagged ``"Hin"`` are converted from
        ITRANS to Devanagari; all other tokens are copied unchanged.
    """
    return [
        [
            transliterate(token, sanscript.ITRANS, sanscript.DEVANAGARI)
            if tag == "Hin"
            else token
            for token, tag in zip(tokens, tags)
        ]
        for tokens, tags in zip(sentences, sentences_info)
    ]

In [15]:
# Transliterate the Hindi-tagged tokens of both splits to Devanagari.
translated_sentences = translate(sentences, sentences_info)
test_translated_sentences = translate(test_sentences, test_sentences_info)

In [16]:
# Raw patterns (kept as plain strings under their original names).
url_pattern = r'https(.*)/\s[\w\u0900-\u097F]+'
# NOTE(review): `special_chars` is not used by preprocess_data in the visible cells.
special_chars = r'[_…\*\[\]\(\)&“]'
names_with_numbers = r'([A-Za-z\u0900-\u097F]+)\d{3,}'
apostee = r"([\w]+)\s'\s([\w]+)"
names = r"@[\s]*[\w\u0900-\u097F]+[\s]*[_]+[\s]*[\w\u0900-\u097F]+|@[\s]*[\w\u0900-\u097F]+"

# Compiled once at definition time instead of on every preprocess_data call.
_URL_RE = re.compile(url_pattern)
_NAMES_WITH_NUMBERS_RE = re.compile(names_with_numbers)
_APOSTEE_RE = re.compile(apostee)
_NAMES_RE = re.compile(names)


def preprocess_data(sentence_tokens):
    """Join a tokenised tweet and clean it for the tokenizer.

    Steps, in order: drop standalone ``RT`` markers and ellipsis characters,
    normalise curly apostrophes, remove ``@`` mentions, remove tokenised URLs,
    re-attach split contractions (``it ' s`` -> ``it's``) and expand them, strip
    3+ trailing digits from names, and collapse whitespace.

    Args:
        sentence_tokens: list of string tokens for one tweet.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    # Leading space lets an "RT" at the very start match the " RT " pattern.
    sentence = " " + " ".join(sentence_tokens)
    # Replace with a single space (not ""), otherwise the words on either side
    # of a mid-sentence "RT" would be glued together.
    sentence = sentence.replace(" RT ", " ")
    sentence = sentence.replace("…", "")
    # normalise the curly apostrophe
    sentence = sentence.replace("’", "'")
    # remove @ mentions
    sentence = _NAMES_RE.sub(" ", sentence)
    # remove urls (they arrive tokenised, e.g. "https // t . co / xyz")
    sentence = _URL_RE.sub("", sentence)
    # combine only ' related words => ... it ' s ... -> ... it's ...
    sentence = _APOSTEE_RE.sub(r"\1'\2", sentence)
    # expand contractions
    sentence = contractions.fix(sentence)
    # Keep the name and drop only the trailing digits (group 1 is the name part);
    # substituting a blank here would throw the whole word away.
    sentence = _NAMES_WITH_NUMBERS_RE.sub(r"\1", sentence)
    # collapse runs of whitespace and trim both ends
    return " ".join(sentence.split())


In [17]:
# Pretrained checkpoint name; the same name is used for the tokenizer here and for the encoder in XLMModel.
MODEL_NAME = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

In [18]:
print(tokenizer.sep_token, tokenizer.sep_token_id)
print(tokenizer.cls_token, tokenizer.cls_token_id)
print(tokenizer.pad_token, tokenizer.pad_token_id)
print(tokenizer.unk_token, tokenizer.unk_token_id)

</s> 2
<s> 0
<pad> 1
<unk> 3


In [33]:
" ".join(sentences[32]), sentiment[32]

('@ IndiaToday Teri kimat dokodi ki ho gayi ... amit shah will capture telegana soon ... kcr will resign ...',
 'negative')

In [34]:
" ".join(translated_sentences[32])

'@ IndiaToday टेरि किमत् दोकोदि कि हो गयि ... अमित् शह् will capture telegana soon ... kcr will resign ...'

In [30]:
preprocess_data(translated_sentences[32])

'टेरि किमत् दोकोदि कि हो गयि ... अमित् शह् will capture telegana soon ... kcr will resign ...'

In [35]:
# Encode one preprocessed example to inspect the tokenizer output.
encoding = tokenizer.encode_plus(
  preprocess_data(translated_sentences[32]),
  max_length=60,
  add_special_tokens=True, # Wrap the sequence in '<s>' ... '</s>' (this tokenizer's CLS/SEP tokens)
  return_token_type_ids=False,
  truncation=True,
  padding='max_length', # replaces the deprecated pad_to_max_length=True
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [36]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

60


tensor([     0,  46005,  18992,   1682, 154156,  10850,    356,  13551,   1682,
          1253,   5167,  67625,    153, 129069,   4377,   8933,   3849,   4377,
          1221, 141621,   5501,  24869,  33662,    153,    472,  23150,   1221,
        199747,    153,      2,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1])

In [22]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

60


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [23]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['<s>',
 '▁',
 'ः',
 'आ',
 'न्',
 '▁या',
 'र्',
 '▁ने',
 'ह',
 '▁',
 '😔',
 '😔',
 '▁क',
 'ब्',
 '▁करे',
 'ग',
 '▁वो',
 'ह',
 '्',
 '▁पो',
 'स्त',
 '्',
 '▁',
 '😭',
 '▁ऊ',
 'स्',
 'ने',
 '▁न',
 '▁स',
 'च्',
 '▁मे',
 'इन्',
 '▁photo',
 'shoot',
 '▁कर',
 '्न',
 '▁च',
 'ह',
 'िये',
 '▁फिर',
 '्',
 '▁वो',
 'ह',
 '्',
 '▁पो',
 'स्त',
 '्',
 '▁करे',
 'ग',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [24]:
" ".join(sentences[29]), sentiment[29]

('Madam @ SushmaSwaraj ji we always miss you as a # videsh _ mantri',
 'positive')

In [19]:
" ".join(translated_sentences[29])

'ंअदम् @ Sउश्मSवरज् जि we always miss you as a # विदेश् _ मन्त्रि'

In [20]:
preprocess_data(translated_sentences[29])

'ंअदम् जि we always miss you as a # विदेश् _ मन्त्रि'

In [21]:
" ".join(sentences[10]), sentiment[10]

('@ ECISVEEP Can you answer miscalculated votes on each seat ? One vote matters ! # deshkamahatyohar hai aur apne dhji … https // t . co / SuHS4mx6Dm',
 'neutral')

In [22]:
" ".join(translated_sentences[10])

'@ ECISVEEP Can you answer miscalculated votes on each seat ? One vote मत्तेर्स् ! # देश्कमहत्योहर् है और् अप्ने dhji … https // t . cओ / SउःS४म्क्ष्६ड्म्'

In [23]:
preprocess_data(translated_sentences[10])

'Can you answer miscalculated votes on each seat ? One vote मत्तेर्स् ! # देश्कमहत्योहर् है और् अप्ने dhji'

In [25]:
%%time
processed_sentences = []

for sent in translated_sentences:
    processed_sentences.append(preprocess_data(sent))

test_data = []

for sent in test_translated_sentences:
    test_data.append(preprocess_data(sent))

CPU times: user 660 ms, sys: 4.4 ms, total: 664 ms
Wall time: 665 ms


In [26]:
# Integer class id for each sentiment string found in the data files.
sentiment_mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

In [27]:
# Encode the sentiment strings as class ids; an unknown sentiment string raises KeyError.
labels = [sentiment_mapping[sent] for sent in sentiment]
test_label = [sentiment_mapping[sent] for sent in test_sentiment]

In [28]:
# Hold out 20% of the training data for validation. A fixed random_state makes the
# split (and therefore every validation number below) reproducible across runs.
train_uids, val_uids, train_data, val_data, train_label, val_label = train_test_split(
    uids, processed_sentences, labels, test_size=0.2, random_state=42
)

In [29]:
len(train_data), len(val_data), len(test_data)

(11200, 2800, 3000)

In [30]:
# Whitespace-separated word count per training sentence (not subword-token counts).
# NOTE(review): not referenced again in the visible cells; presumably used to choose MAX_LEN.
train_token_lengths = [len(sent.split()) for sent in train_data]

In [31]:
# Length, in tokenizer tokens, that every example is padded or truncated to by SentiMixDataSet.
MAX_LEN = 150

In [32]:
class SentiMixDataSet(Dataset):
    """Map-style dataset that tokenizes one sentence per item on the fly.

    Args:
        inputs: sequence of preprocessed sentence strings.
        labels: sequence of integer class ids, parallel to ``inputs``.
        tokenizer: object exposing ``encode_plus`` (here the XLM-R tokenizer).
        max_len: length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: if ``inputs`` and ``labels`` differ in length.
    """

    def __init__(self, inputs, labels, tokenizer, max_len):
        if len(inputs) != len(labels):
            raise ValueError(
                f"inputs and labels must have the same length, got {len(inputs)} and {len(labels)}"
            )
        self.sentences = inputs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return the text, encoded ids, attention mask and label of example ``item``."""
        sentence = self.sentences[item]
        sentiment = int(self.labels[item])

        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
            # which otherwise warns on every single item fetched.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            "text": sentence,
            # encode_plus returns tensors of shape (1, max_len); flatten drops the batch dim
            # so the DataLoader can stack items into (batch, max_len).
            "input_ids": encoding['input_ids'].flatten(),
            "attention_mask": encoding['attention_mask'].flatten(),
            "label": torch.tensor(sentiment, dtype=torch.long)
        }

In [33]:
# One dataset per split, all sharing the same tokenizer and maximum length.
train_dataset = SentiMixDataSet(train_data, train_label, tokenizer, MAX_LEN)
val_dataset = SentiMixDataSet(val_data, val_label, tokenizer, MAX_LEN)
test_dataset = SentiMixDataSet(test_data, test_label, tokenizer, MAX_LEN)

In [34]:
# Number of examples per batch for all three data loaders.
BATCH_SIZE = 64

In [35]:
# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [36]:
# Pull a single batch from the training loader to sanity-check the tensor shapes.
sample = next(iter(train_data_loader))

In [37]:
sample["input_ids"].shape, sample["attention_mask"].shape, sample["label"].shape

(torch.Size([64, 150]), torch.Size([64, 150]), torch.Size([64]))

In [38]:
class XLMModel(nn.Module):
    """XLM-RoBERTa encoder followed by dropout and a linear classification head.

    Args:
        output_dim: number of classes (size of the logit vector).
        dropout: dropout probability applied to the pooled representation.
    """

    def __init__(self, output_dim, dropout=0.3):
        super().__init__()

        self.bert = XLMRobertaModel.from_pretrained(MODEL_NAME)
        hidden_size = self.bert.config.hidden_size

        self.out = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch.

        Args:
            input_ids: token-id tensor, passed to the encoder as ``input_ids``.
            attention_mask: mask tensor, passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of logits produced by the linear head from the pooled output.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index instead of tuple-unpacking: the encoder returns a plain tuple in older
        # transformers releases and a dict-like ModelOutput in newer ones. Unpacking a
        # ModelOutput yields its *keys*, whereas [1] is the pooled output in both cases.
        pooled_output = outputs[1]
        logits = self.out(self.dropout(pooled_output))
        return logits

In [39]:
# One logit per sentiment class (negative / neutral / positive).
output_dim = 3
model = XLMModel(output_dim)

In [40]:
# Move the model's parameters to the selected device before training.
model = model.to(device)

In [41]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad == False) are excluded
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 278,045,955 trainable parameters


In [42]:
# Training hyper-parameters.
EPOCHS = 10
lr = 5e-5          # initial learning rate
min_lr = 1e-7      # floor for the scheduler
lr_decay = 0.5     # multiplicative factor applied when the monitored loss plateaus
lr_patience = 2    # epochs without improvement before the learning rate is reduced

optimizer = optim.Adam(model.parameters(), lr=lr)
# The scheduler is stepped with the validation loss, hence mode='min'.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=lr_decay,
    patience=lr_patience,
    verbose=True,
    min_lr=min_lr,
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [43]:
def train(model, iterator, clip=2.0):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Relies on the notebook-level ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        model: module called with ``input_ids`` and ``attention_mask`` keyword arguments.
        iterator: iterable of batches holding "input_ids", "attention_mask" and "label".
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []

    for batch in iterator:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["label"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)

        optimizer.zero_grad()
        batch_loss = loss_fn(logits, gold)
        batch_loss.backward()
        # keep the gradient norm bounded before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(iterator)

In [44]:
def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` and ``labels`` agree.

    Both arguments are equal-length sequences of labels; the result is a float.
    """
    matches = np.equal(preds, labels)
    return matches.mean()

In [45]:
def evaluate(model, iterator):
    """Score ``model`` on ``iterator`` without updating it.

    Relies on the notebook-level ``loss_fn``, ``device`` and ``simple_accuracy``.

    Args:
        model: module called with ``input_ids`` and ``attention_mask`` keyword arguments.
        iterator: iterable of batches holding "input_ids", "attention_mask" and "label".

    Returns:
        Tuple ``(mean batch loss, accuracy)``.
    """
    model.eval()
    total_loss = 0
    predicted_labels, gold_labels = [], []

    with torch.no_grad():
        for batch in iterator:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            gold = batch["label"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            total_loss += loss_fn(logits, gold).item()

            gold_labels.extend(gold.cpu().tolist())
            # index of the largest logit along the class dimension
            best_class = logits.max(1)[1]
            predicted_labels.extend(best_class.cpu().tolist())

    mean_loss = total_loss / len(iterator)
    return mean_loss, simple_accuracy(predicted_labels, gold_labels)

In [46]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; fractional seconds are truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [47]:
# Train for EPOCHS epochs, keeping the checkpoint with the lowest validation loss.
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_data_loader)
    val_loss, val_acc = evaluate(model, valid_data_loader)
    end_time = time.time()
    # the plateau scheduler monitors the validation loss
    scheduler.step(val_loss)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # epoch_secs is an int, so it is printed as-is; a float format would always show ".00"
    print(f"Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.3f}")

    # checkpoint only when the validation loss improves
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), 'xlm_roberta.pt')


Epoch: 01 | Time: 5m 17.00s
	Train Loss: 1.007 | Val Loss: 0.932 | Val Acc: 0.558
Epoch: 02 | Time: 5m 25.00s
	Train Loss: 0.896 | Val Loss: 0.954 | Val Acc: 0.551
Epoch: 03 | Time: 5m 25.00s
	Train Loss: 0.832 | Val Loss: 0.882 | Val Acc: 0.594
Epoch: 04 | Time: 5m 25.00s
	Train Loss: 0.758 | Val Loss: 0.916 | Val Acc: 0.587
Epoch: 05 | Time: 5m 25.00s
	Train Loss: 0.681 | Val Loss: 0.943 | Val Acc: 0.596
Epoch: 06 | Time: 5m 25.00s
	Train Loss: 0.596 | Val Loss: 1.002 | Val Acc: 0.599
Epoch: 07 | Time: 5m 25.00s
	Train Loss: 0.517 | Val Loss: 1.116 | Val Acc: 0.585


KeyboardInterrupt: ignored

In [19]:
# Restore the best checkpoint written by the training loop. `map_location` remaps the
# saved tensors onto the current `device`, so a checkpoint saved on a GPU runtime also
# loads on a CPU-only runtime.
model.load_state_dict(torch.load('xlm_roberta.pt', map_location=device))

NameError: ignored

In [1]:
# Collect predictions and gold labels over the test loader, then print per-class metrics.
model.eval()
preds = []
targets = []

with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # get the predicted labels
        _, predicted = torch.max(outputs, 1)
        # Add data to lists. The gold labels are read straight from the batch dict, so the
        # `targets` list is never rebound to a tensor, and the batch is indexed by key
        # because it is a dict, not an object with a `.label` attribute.
        preds.extend(predicted.detach().cpu().numpy().tolist())
        targets.extend(batch["label"].detach().cpu().numpy().tolist())

print(metrics.classification_report(targets, preds))

NameError: ignored