In [63]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# NOTE(review): the version is not pinned, so a fresh run may pick up a different release.
!pip install transformers



In [0]:
import pandas as pd
import transformers
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import Conv2D
import numpy as np

In [65]:
# Data loader: mount Google Drive (this import is only available on Colab) and
# point root_path at the dataset folder.
from google.colab import drive
drive.mount('/content/gdrive')
# Folder prefix that the loaders below concatenate file names onto
# (e.g. root_path + "train.csv"), hence the trailing slash.
root_path = 'gdrive/My Drive/tweet-sentiment-extraction/'
# NOTE(review): this looks like a pasted authorization code and is not referenced
# anywhere else in the visible notebook. Secrets should not be committed —
# confirm it is unused and remove it.
code = "4/zgHVgt5EYIxQpCKdQTmi61ujDfdn_p673NBszogLYWsCZMJpSbY_WIQ"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [66]:
def read_test():
    """Load ``test.csv`` from ``root_path`` and return it as a DataFrame.

    The ``text`` column is cast to ``str``; all other columns are returned
    as parsed by ``pd.read_csv``.
    """
    frame = pd.read_csv(root_path + "test.csv")
    return frame.assign(text=frame['text'].astype(str))
def reaf_submission():
    """Load the sample submission CSV and return it as a DataFrame.

    NOTE(review): unlike the other loaders in this notebook, the path is a
    hard-coded relative path that does not go through ``root_path`` — confirm
    it resolves in the environment this notebook runs in.
    """
    return pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')


# The original name contains a typo ("reaf"). It is kept so existing callers
# keep working; the correctly spelled name is exposed as an alias.
read_submission = reaf_submission

def read_train():
    """Load ``train.csv`` from ``root_path`` and return it as a DataFrame.

    Both ``text`` and ``selected_text`` are cast to ``str``; all other
    columns are returned as parsed by ``pd.read_csv``.
    """
    frame = pd.read_csv(root_path + "train.csv")
    as_text = {column: frame[column].astype(str) for column in ('text', 'selected_text')}
    return frame.assign(**as_text)

# Load both splits (train first, then test) and print one randomly drawn
# training row as a quick sanity check.
train = read_train()
test = read_test()
samples = train.sample(n=1)
print(samples)

          textID  ... sentiment
5612  85c482fcbf  ...   neutral

[1 rows x 4 columns]


In [0]:
# Word-level Jaccard similarity — the score specifically requested for this task.

def jaccard(str1,str2):
    """Return the Jaccard similarity between the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting sets of words.

    Parameters
    ----------
    str1, str2 : str
        The two texts to compare.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. When neither string contains any word the
        union is empty; the two (empty) word sets are identical, so 1.0 is
        returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: avoid ZeroDivisionError.
        return 1.0
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))

In [68]:
from transformers import RobertaTokenizer
from transformers import *
import tokenizers
 
# Two tokenizer objects are kept side by side to simplify the steps below:
#   * `tokenzier` – the transformers RobertaTokenizer ('roberta-base');
#   * `tokenizer` – a byte-level BPE tokenizer from the `tokenizers` package,
#     built from vocab.json / merges.txt under root_path (presumably the files
#     written by save_vocabulary on the line before it).
tokenzier = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
tokenzier.save_vocabulary(root_path)
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=root_path + 'vocab.json',
    merges_file=root_path + 'merges.txt',
    lowercase=True,
    add_prefix_space=True,
)
# First token id that `tokenizer` produces for each sentiment word.
sentiment = {
    word: int(tokenizer.encode(word).ids[0])
    for word in ('neutral', 'positive', 'negative')
}
print(sentiment)

{'neutral': 7974, 'positive': 1313, 'negative': 2430}


In [0]:
# Each sample is encoded as a sentence pair: the tweet text is the first
# segment (the "context") and the sentiment word is the second segment (the
# "question"). The model is then trained to predict selected_text.
def get_input_ids(df):
    """Encode every row of ``df`` as a (text, sentiment) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide string columns ``text`` and ``sentiment``.

    Returns
    -------
    list
        One entry per row, in row order. Each entry is whatever
        ``tokenzier.encode_plus(text, sentiment)`` returns; later cells read
        ``.input_ids`` and ``.attention_mask`` from these entries.
    """
    input_ids = []
    # Iterate over the column values directly instead of df.loc[i, ...] so the
    # function also works on frames whose index is not 0..n-1 (e.g. after
    # filtering or shuffling).
    for raw_text, raw_label in zip(df['text'], df['sentiment']):
        # Collapse runs of whitespace to single spaces before encoding.
        context = " ".join(raw_text.split())
        label = " ".join(raw_label.split())
        # The encoding carries the attention mask as well as the ids; the mask
        # is needed later to tell the model to ignore padding positions.
        input_ids.append(tokenzier.encode_plus(context, label))
    return input_ids

In [0]:
# Encode both splits; each result is the list returned by get_input_ids
# (one entry per row of the frame).
input_ids = get_input_ids(train)
test_ids = get_input_ids(test)

The next part of data pre-processing is to build the start and end token vectors, which mark the start index and the end index of the selected text within our encoded input.

For instance, consider the sample with

text = "Sooo SAD have to leave boston"

selected_text = "Sooo SAD" 

In this case the selection starts at the character S and ends at the character D, which gives the following vectors:
input_ids = [0, 3, 3, 11990, 560, 38457, 3, 2, 2, 2430, 2]

start_token = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

end_token = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

Note: There are two ways this can be done: at the character level or at the token level. I'm using the character-level approach in order to prevent loss of training samples.



In [0]:
# Build the start/end label vectors for every training sample.
#
# start_tokens[i] / end_tokens[i] are one-hot rows over the positions of
# input_ids[i].input_ids: a 1 marks the first / last token of the selected
# text. One row is kept per sample, zero-padded on the right to the longest
# encoded training sequence so the rows stack into rectangular arrays.
# NOTE(review): confirm this width agrees with the length the ids are padded
# to before these labels are used for training.
label_width = max((len(encoded.input_ids) for encoded in input_ids), default=0)
start_tokens = np.zeros((train.shape[0], label_width), dtype=np.int64)
end_tokens = np.zeros((train.shape[0], label_width), dtype=np.int64)
# Iterate over the column values so the loop does not depend on index labels.
for i, (raw_text, raw_selected) in enumerate(zip(train['text'], train['selected_text'])):
    # Collapse whitespace runs to single spaces in both strings.
    text, label = " ".join(raw_text.split()), " ".join(raw_selected.split())
    idx = text.find(label)
    if idx < 0 or not label:
        raise ValueError(
            "row {}: selected_text {!r} cannot be located in text {!r}".format(i, label, text))
    # Character window used to pick tokens: it begins at the selection when a
    # space precedes it and one character later otherwise, and spans
    # len(label) characters. The idx > 0 guard stops text[idx-1] from wrapping
    # around to the last character when the selection starts the text.
    # NOTE(review): the one-character shift presumably compensates for how the
    # tokenizer reports offsets around leading spaces — confirm.
    preceded_by_space = idx > 0 and text[idx-1] == ' '
    start = idx if preceded_by_space else idx + 1
    end_idx = start + len(label)
    enc = tokenizer.encode(text)
    offsets = enc.offsets
    chard_to_token_index = []
    for k, (tok_begin, tok_end) in enumerate(offsets):
        # Keep a token when its end offset falls inside the window, or when the
        # token's own span contains both window bounds.
        ends_in_window = start <= tok_end <= end_idx
        contains_window = tok_begin <= start < tok_end and tok_begin <= end_idx < tok_end
        if ends_in_window or contains_window:
            chard_to_token_index.append(k)
    if not chard_to_token_index:
        raise ValueError(
            "row {}: no token overlaps selected_text {!r} in text {!r}".format(i, label, text))
    # +1 shifts from positions in `tokenizer`'s output to positions in
    # input_ids[i].input_ids (presumably skipping the leading special token
    # added by encode_plus). The end marker goes on the LAST matched token so
    # selections spanning more than two tokens are labelled correctly.
    start_tokens[i, chard_to_token_index[0]+1] = 1
    end_tokens[i, chard_to_token_index[-1]+1] = 1

In [72]:
# Report the longest encoded training sequence; the padding length chosen in
# the next step is based on this number.
print('Max sentence length: ', max(len(encoded.input_ids) for encoded in input_ids))


Max sentence length:  91


In [74]:
from keras.preprocessing.sequence import pad_sequences
import torch
# Given the sequence lengths observed above, let's set MAX_LEN to 100.
MAX_LEN = 100
def padding_input(ids,max_len,mask=False):
    """Pad (or truncate) encoded sequences to a fixed length.

    Parameters
    ----------
    ids : sequence
        Encodings that each expose ``input_ids`` (and ``attention_mask`` when
        ``mask`` is true).
    max_len : int
        Target length. Longer sequences are truncated at the end, shorter ones
        are padded at the end.
    mask : bool, default False
        When true, the attention masks are padded (with 0) as well.

    Returns
    -------
    The padded id array, or the tuple ``(padded_ids, padded_masks)`` when
    ``mask`` is true.

    Also prints the tokenizer's padding token and its id.
    """
    # Pad with the tokenizer's own padding id rather than a hard-coded value.
    pad_id = tokenzier.pad_token_id
    print('\nPadding token: "{:}", ID: {:}'.format(tokenzier.pad_token, pad_id))
    # Honour the max_len argument instead of the module-level MAX_LEN.
    train_ids = pad_sequences([seq.input_ids for seq in ids], maxlen=max_len, dtype="long",
                              value=pad_id, truncating="post", padding="post")
    if not mask:
        return train_ids
    attention_masks = pad_sequences([seq.attention_mask for seq in ids], maxlen=max_len, dtype="long",
                                    value=0, truncating="post", padding="post")
    return (train_ids,attention_masks)
# Pad both splits to MAX_LEN (ids and attention masks).
# NOTE(review): test_ids is rebound here from the list of encodings to the
# padded array, so this cell cannot be re-run without re-creating test_ids first.
train_ids,masks = padding_input(input_ids,MAX_LEN,True)
test_ids,test_masks = padding_input(test_ids,MAX_LEN,True)
# Converting to torch tensors in order to use GPU.
train_processed = {
    "ids": torch.tensor(train_ids),
    "mask": torch.tensor(masks),
    # NOTE(review): converted as-is — confirm the labels hold one row per
    # training sample with the same width as "ids".
    "start_tokens": torch.tensor(start_tokens),
    # NOTE(review): this key is singular ("end_token") while its sibling is
    # plural ("start_tokens"); left unchanged because consumers read it by name.
    "end_token": torch.tensor(end_tokens),
    # Token type ids are integer indices, so they are created as a long tensor;
    # a float tensor is rejected by embedding lookups.
    "token_type_ids": torch.zeros((train.shape[0],MAX_LEN),dtype=torch.long)

}
test_processed = {
    "ids": torch.tensor(test_ids),
    "mask": torch.tensor(test_masks),
    # Same as above: integer indices, hence torch.long.
    "token_type_ids": torch.zeros((test.shape[0],MAX_LEN),dtype=torch.long)

}


Padding token: "<pad>", ID: 1

Padding token: "<pad>", ID: 1


In [0]:
# Data pre-processing is one of the most important tasks when it comes to NLP. The next steps are model
# selection and training in order to make predictions on this new task.
