In [1]:
!pip install transformers
!pip install regex requests



In [2]:
import pandas as pd
import transformers
import numpy as np
from transformers import RobertaTokenizer
from transformers import *
import tokenizers
from keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from transformers import RobertaModel,RobertaConfig
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
import random
import time
import datetime
import sys

Using TensorFlow backend.


In [3]:
# Data Loader
from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/tweet-sentiment-extraction/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:


def read_test():
  paths = root_path+"test.csv"
  df = pd.read_csv(paths)
  df['text'] = df['text'].astype(str)
  return shuffle(df,random_state=np.random.randint(0,10))
def reaf_submission():
    test=pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
    return shuffle(test,random_state=np.random.randint(0,10))
def read_train():
  paths = root_path+"train.csv"
  df = pd.read_csv(paths)
  df['text'] = df['text'].astype(str)
  df['selected_text']=df['selected_text'].astype(str)
  return shuffle(df,random_state=np.random.randint(0,10))

print(read_train().sample(frac=1).head(20).index.tolist)
train_csv, test_org = read_train().sample(frac=1).reset_index(drop=True), read_test().sample(frac=1).reset_index(drop=True)
train, val = train_test_split(train_csv, test_size=0.12)
train, test_csv = train_test_split(train, test_size=0.03)

<bound method IndexOpsMixin.tolist of Int64Index([ 1069,  7890, 13745, 26428,  9965, 15382, 17022,   572, 26498,
             6739, 21819, 11569, 26332, 24904, 23416, 21725,  9548, 19582,
            19464, 15337],
           dtype='int64')>


In [0]:
# Metric tested 
def jaccard(str1,str2):
  a = set(str1.lower().split()) 
  b = set(str2.lower().split())
  c = a.intersection(b)
  return float(len(c)) / (len(a) + len(b) - len(c))

In [6]:
currModel = 'roberta-base'
# using both to simplfy various methods
# tokenzier = RobertaTokenizer.from_pretrained(root_path,do_lower_case=True)
tokenzier = RobertaTokenizer.from_pretrained(currModel,do_lower_case=True)
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=root_path+'vocab.json', 
    merges_file=root_path+'merges.txt', 
    lowercase=True,
    add_prefix_space=True
)
# these are the ids used by our tokenizer
sentiment = {
    'neutral': int(tokenizer.encode('neutral').ids[0]),
    'positive': int(tokenizer.encode('positive').ids[0]),
    'negative':int(tokenizer.encode('negative').ids[0])
}
print(sentiment)

{'neutral': 7974, 'positive': 1313, 'negative': 2430}


In [0]:
# The structure of our input would be of the following form [Text={Context},special_token,sentiment={Question}]
# which is then predicted with selected_text.
def get_input_ids(idf):
    input_ids = []
    for i in range(idf.shape[0]):
        context,label = idf['text'].iloc[i].split(),idf['sentiment'].iloc[i].split()
        encoded_input = tokenzier.encode_plus(
                            " ".join(context)," ".join(label),                   # Sentence to encode.
                            # Add '[CLS]' and '[SEP]' 
                    )
        # encode_input returns input_ids and attention masks. 
        # Masks would be helpful latter to instruct model to avoid 
        # special tokens such as padding. 
        input_ids.append(encoded_input)
    return input_ids

In [0]:

# train_set_ids = get_input_ids(train)
# val_set_ids = get_input_ids(val)
test_ids = get_input_ids(test_csv)


The next part of data pre-processing is to come up with start and end tokens which gives start index and end index of the selected word in our encoded input. 

For instance, Conside the sample with

text = "Sooo SAD have to leave boston"

selected_text = "Sooo SAD" 

In this case start token is S and end token is D 
input_ids = [0, 3, 3, 11990, 560, 38457, 3, 2, 2, 2430, 2]

start_token = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

end_token = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

Note: There are two ways this can be done character level and token level. I'm using character level tokenization in order to prevent loss of training samples.



In [0]:
# print(input_ids[3])
def create_labels(train2,t_ids):
    st = []
    et = []
    for i in range(train2.shape[0]):
        text,label = " ".join(train2['text'].iloc[i].split()), " ".join(train2['selected_text'].iloc[i].split())
        char = np.zeros((len(text)))
        idx = text.find(label)
        char[idx:idx+len(label)] = 1
        if text[idx-1]==' ': char[idx-1] = 1
        enc = tokenizer.encode(text)
        offsets = enc.offsets
        chard_to_token_index = []
        start = list(char).index(1)+1
        end_idx = start+len(label)
        # print(start,end_idx)
        for k in range(len(offsets)):
            my_set = range(offsets[k][0],offsets[k][1])
            # print(my_set)
            if(start <= offsets[k][1] and  offsets[k][1] <= end_idx or (start in my_set and end_idx in my_set)):
                chard_to_token_index.append(k)
        start_tokens,end_tokens = [0]*len(t_ids[i].input_ids),[0]*len(t_ids[i].input_ids)
        if(len(chard_to_token_index) > 1):
            # print(i)
            start_tokens[chard_to_token_index[0]+1] = 1
            end_tokens[chard_to_token_index[1]+1] = 1
            st.append(start_tokens)
            et.append(end_tokens)
        else:
            start_tokens[chard_to_token_index[0]+1],end_tokens[chard_to_token_index[0]+1] = 1,1
            st.append(start_tokens)
            et.append(end_tokens)
    return st,et
# train_st,train_et = create_labels(train,train_set_ids)
# val_st,val_et = create_labels(val,val_set_ids) 
# test_st,test_et = create_labels(test_csv,train_set_ids)

In [10]:
# Given the sequence length lets set MAX_LEN to 100
MAX_LEN = 100
def padding_input(ids,max_len,mask=False):
    print('\nPadding token: "{:}", ID: {:}'.format(tokenzier.pad_token, tokenzier.pad_token_id))
# seperate attention_masks and input_ids.
    print(ids[0])
    train_ids = [seq.input_ids for seq in ids]
    attention_masks = [seq.attention_mask for seq in ids]
    # input_ids = None
    train_ids = pad_sequences(train_ids, maxlen=MAX_LEN, dtype="long",value=1, truncating="post", padding="post")
    if mask:
        attention_masks = pad_sequences(attention_masks, maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")
        
        return (train_ids,attention_masks)
    return train_ids
# train_ids, train_masks = padding_input(train_set_ids,MAX_LEN,True)
# val_ids,val_masks = padding_input(val_set_ids,MAX_LEN,True)
test_ids,test_masks = padding_input(test_ids,MAX_LEN,True) 
# Converting to torch tensors in order to use GPU.


Padding token: "<pad>", ID: 1
{'input_ids': [0, 1368, 5471, 734, 24, 12905, 29, 10, 16829, 47, 12905, 241, 634, 7409, 6, 8, 24, 12905, 29, 1531, 734, 98, 734, 3545, 328, 2, 2, 1313, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [0]:
# print(train_ids.shape)
# train_data = {
#     "ids": torch.tensor(train_ids,dtype=torch.long),
#     "att_mask": torch.tensor(train_masks,dtype=torch.long),
#     "token_type_ids": torch.zeros((train.shape[0],MAX_LEN),dtype=torch.long)
# }
# val_data = {
#     "ids": torch.tensor(val_ids,dtype=torch.long),
#     "att_mask": torch.tensor(val_masks,dtype=torch.long),
#     "token_type_ids": torch.zeros((val.shape[0],MAX_LEN),dtype=torch.long)

# }
test_data = {
    "ids": torch.tensor(test_ids,dtype=torch.long),
    "att_mask": torch.tensor(test_masks,dtype=torch.long),
    "token_type_ids": torch.zeros((test_csv.shape[0],MAX_LEN),dtype=torch.long)
}
# train_label = {
#     "start_tokens":torch.tensor(pad_sequences(train_st,maxlen=MAX_LEN, dtype="long", 
#                             value=0, truncating="post", padding="post"),dtype=torch.long),
#     "end_tokens":torch.tensor(pad_sequences(train_et,maxlen=MAX_LEN, dtype="long", 
#                             value=0, truncating="post", padding="post"),dtype=torch.long),
# }
# val_label = {
#     "start_tokens":torch.tensor(pad_sequences(val_st,maxlen=MAX_LEN, dtype="long", 
#                             value=0, truncating="post", padding="post"),dtype=torch.long),
#     "end_tokens": torch.tensor(pad_sequences(val_et,maxlen=MAX_LEN, dtype="long", 
#                             value=0, truncating="post", padding="post"),dtype=torch.long),
# }


In [0]:
# Create the DataLoader for our training set.
# print(val_data["ids"].shape,val_data["att_mask"].shape,val_data["token_type_ids"].shape)
# print(val_label["start_tokens"].shape,val_label["end_tokens"].shape)

In [0]:
# Uncomment only for retraining 
# batch_size = 32
# train_data = TensorDataset(train_data["ids"], train_data["att_mask"], train_data["token_type_ids"],train_label["start_tokens"],train_label["end_tokens"])
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# # Create the DataLoader for our validation set.
# validation_data = TensorDataset(val_data["ids"], val_data["att_mask"], val_data["token_type_ids"],val_label["start_tokens"],val_label["end_tokens"])
# validation_sampler = SequentialSampler(validation_data)
# validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)



In [0]:
# Data pre-processing is one of the most important tasks when it comes to NLP. Next includes model 
# selection and training in order to make prediction on this new task.
# Model

class SentimentAnalysis(nn.Module):
    def __init__(self,configs):
        super(SentimentAnalysis,self).__init__()
        self.roberta = RobertaModel.from_pretrained(root_path+'roberta-base-weights.bin',config=configs)
        self.avgpool = nn.AvgPool2d(kernel_size=1,stride=(1,4))
        self.container = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Conv1d(768*2,768,kernel_size=1),
            nn.Softmax(dim=-1),
            
        )
        self.linear = nn.Linear(768,2)
    def forward(self,token_ids,att_masks,type_ids):
        _,_,y = self.roberta(
            input_ids=token_ids,
            attention_mask=att_masks,
            token_type_ids=type_ids,
            )
        avg_pool = torch.squeeze(self.avgpool(torch.stack((y[-2],y[-3],y[-4],y[-5]),dim=-1)),dim=-1)
        stacked = torch.transpose(torch.cat((y[-1],avg_pool),dim=-1),dim0=1,dim1=2)
        contained = self.container(stacked)
        start_logits,end_logits = self.linear(torch.transpose(contained,dim0=1,dim1=2)).split(1, dim=-1) 
        start_logits = start_logits.squeeze(-1) # (bs x SL)
        end_logits = end_logits.squeeze(-1) # (bs x SL)
        return start_logits,end_logits

In [0]:
model = SentimentAnalysis(configs=root_path+'config.json')

In [0]:
# device = torch.device("cuda")
# model.to(device)
# # print(model)

In [0]:
def calculate_loss(prediction,target):
    some_loss = nn.CrossEntropyLoss()
    diff_in_start = some_loss(prediction[0],target[0])
    diff_in_end  = some_loss(prediction[1],target[1])
    return (diff_in_start+diff_in_end)

In [0]:
# def format_time(elapsed):
#     elapsed_rounded = int(round((elapsed)))
#     return str(datetime.timedelta(seconds=elapsed_rounded))

# seed_val = 40
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)
# def train_model(model,TEST_SENTENCE):
#     optimizer = AdamW(model.parameters(),
#                   lr = 3e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
#                   weight_decay = 0
#                 )
#     epochs = 5
#     total_steps = len(train_dataloader) * epochs
#     scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                         num_warmup_steps = 0, 
#                                         num_training_steps = total_steps)
#     loss_values = []
#     for epoch_i in range(0, epochs):
#         print("")
#         print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
#         print('Training...')
#         t0 = time.time()
#         total_loss = 0
#         model.train()
        
#         for step, batch in enumerate(train_dataloader):
#             if step % 32 == 0 and not step == 0:
#                 elapsed = format_time(time.time() - t0)
#                 print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
#             tok_ids = batch[0].to(device)
#             atten_masks = batch[1].to(device)
#             tok_type_ids = batch[2].to(device)
#             st_tok = torch.max(batch[3],1)[1].to(device)
#             # .to(device)
#             et_tok = torch.max(batch[4],1)[1].to(device)

#             model.zero_grad()

#             start_logits, end_logits = model(token_ids=tok_ids,
#                         att_masks=atten_masks,type_ids=tok_type_ids)
#             loss = calculate_loss((start_logits,end_logits),(st_tok,et_tok))
#             total_loss += torch.sum(loss).item()
#             loss.backward()
#         # clip grad to prevent exploding gradient problem
#         # Update parameters
#             optimizer.step()
#         # update learning rate
#             scheduler.step()
#     # Calculate the average loss over the training data.
#         avg_train_loss = total_loss / len(train_dataloader)  
#         loss_values.append(avg_train_loss)
#         print("")
#         print("  Average training loss: {0:.2f}".format(avg_train_loss))
#         print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
#         print("Running Validation...")
#         t0 = time.time()
#         model.eval()
#         val_loss, eval_accuracy = 0, 0
#         nb_eval_steps, nb_eval_examples = 0, 0

#         for batch in validation_dataloader:
#             v_input_ids = batch[0].to(device)
#             v_input_mask =  batch[1].to(device)
#             v_tok_ids = batch[2].to(device)
#             v_st  = torch.max(batch[3],1)[1].to(device)
#             v_et = torch.max(batch[4],1)[1].to(device)
#             with torch.no_grad():        
#                 start_logits, end_logits = model(token_ids=v_input_ids,
#                         att_masks=v_input_mask,type_ids=v_tok_ids)

# #             start_logits = start_logits.detach().cpu().numpy()
# #             end_logits = end_logits.detach().cpu().numpy()
# #             val_st = v_st.to('cpu').numpy()
# #             val_et = v_et.to('cpu').numpy()
#             # Calculate the accuracy for this batch of test sentences.
#             tmp_eval_accuracy = calculate_loss((start_logits,end_logits),(v_st,v_et))
#             # Accumulate the total accuracy.
#             eval_accuracy += tmp_eval_accuracy
#             # Track the number of batches
#             nb_eval_steps += 1
#         print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
#         print("  Validation took: {:}".format(format_time(time.time() - t0)))
#     print(" ")
#     print("Training complete!")
#     return loss_values

In [0]:
# freeze = train_model(model,"leggo")
# torch.save(model.state_dict(),root_path+"baseline/baseline")

In [0]:
model.load_state_dict(torch.load((root_path+"baseline/baseline")))
device = torch.device("cuda")
# model.to(device)
model.eval()

In [0]:

raw_data = test_csv
jaccard_output = []
text_output = []
print(test_data["ids"].shape)
test_data["ids"].cuda(),
test_data["att_mask"].cuda()
test_data["token_type_ids"].cuda()
y= model(test_data["ids"],test_data["att_mask"],test_data["token_type_ids"])
print(y[0].shape,y[1].shape)
for i in range(raw_data.shape[0]):
    start = torch.argmax(y[0][i])
    end = torch.argmax(y[1][i])
    text1 = " "+" ".join(raw_data['text'].iloc[k].split())
    enc = tokenizer.encode(text1)
    st = tokenizer.decode(enc.ids[a-1:b+1])
    text_output.append(st)
    jaccard_output.append(jaccard(raw_data['selected_text'].iloc[k],st))
print(raw_data['text'].iloc[k],text_output)
print(np.mean(jaccard_output))

torch.Size([726, 100])
