In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 32.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 15.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm

In [None]:
# Colab-only setup: mount Google Drive, then make the shared project folder the
# working directory so the relative paths used in later cells resolve against it.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/Shareddrives/lign167')

Mounted at /content/drive


In [None]:
# Notebook-wide configuration. All paths are relative to the working directory.

# Mini-batch size for the training DataLoader (read by run_model()).
batch_size=64
# Model identifier handed to the transformers from_pretrained() calls.
BERT_PATH = "bert-base-uncased"
# Location of the training CSV.
TRAINING_FILE ="tweet-sentiment-extraction/train.csv"
# Batch size for the test-set inference DataLoader.
VALID_BATCH_SIZE = 16
# File name of the saved model weights.
MODEL_PATH = "model.bin"
# WordPiece tokenizer built from a local vocabulary file; input text is lowercased.
TOKENIZER = tokenizers.BertWordPieceTokenizer(
    "bert-base-uncased/vocab.txt",
    lowercase=True
)


In [None]:
class Value_DataBase:
    """Running tracker for a scalar metric (loss, Jaccard score) during one pass over the data.

    Attributes:
        value: the most recent value passed to ``update``.
        sum: weighted sum of every value seen so far.
        count: total weight seen so far (e.g. number of samples).
        average: ``sum / count``, or 0 while ``count`` is still 0.
    """
    def __init__(self):
        self.value = 0
        self.average = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (typically the batch size) and refresh the average."""
        self.value = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch reported with n=0) must not
        # raise ZeroDivisionError; the average simply stays at 0.
        self.average = self.sum / self.count if self.count else 0

class EarlyStopping:
    """Checkpoint the model on every improvement and flag when training should stop.

    Args:
        patience: number of consecutive non-improving calls tolerated before
            ``early_stop`` is set to True.
        mode: ``"max"`` when a higher score is better, ``"min"`` when lower is better.
        delta: margin by which a score must exceed the best one so far to count
            as an improvement.

    Attributes:
        best_score: best score so far in "higher is better" orientation
            (negated when ``mode == "min"``); None until a finite score is seen.
        val_score: last score passed to ``save_checkpoint``, in the caller's orientation.
        counter: consecutive calls without improvement.
        early_stop: True once ``counter`` has reached ``patience``.
    """
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # Start from the worst possible value for the chosen direction.
        self.val_score = float("inf") if self.mode == "min" else float("-inf")

    def __call__(self, epoch_score, model, model_path):
        """Register the score of one epoch; save ``model`` to ``model_path`` if it improved."""
        # Work in "higher is better" orientation internally.
        score = -epoch_score if self.mode == "min" else epoch_score

        # NaN/inf scores never count as an improvement. Every comparison with
        # NaN is False, so without this check a NaN would fall through to the
        # "improved" branch, overwrite best_score and reset the patience counter.
        improved = bool(np.isfinite(score)) and (
            self.best_score is None or score >= self.best_score + self.delta
        )

        if improved:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0
        else:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write ``model.state_dict()`` to ``model_path`` unless the score is NaN or infinite."""
        # np.isfinite also catches computed NaNs; a membership test against
        # np.nan only matches that exact object because NaN != NaN.
        if np.isfinite(epoch_score):
            print('Validation score improved from ' + str(self.val_score) + ' to ' + str(epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score


def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lowercased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting word sets.

    Returns:
        float in [0, 1]. When neither string contains a word the two (empty)
        word sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

## Processing

In [None]:
def processing(whole_tweet, extracted_tweet, sentiment, tokenizer, max_len):
    """Turn one (tweet, selected text, sentiment) example into fixed-length BERT inputs.

    The sequence is laid out as ``[CLS] <sentiment id> [SEP] <tweet tokens> [SEP]``
    and zero-padded to ``max_len``. Tweets with too many tokens are truncated so
    that every example has exactly ``max_len`` entries (for ``max_len >= 4``);
    otherwise examples of different lengths could not be stacked into one batch.

    Args:
        whole_tweet: full tweet text.
        extracted_tweet: the selected span, expected to be a substring of
            ``whole_tweet``. If it is empty or not found, the first character
            of the tweet is used as the span.
        sentiment: ``'positive'``, ``'negative'`` or ``'neutral'``.
        tokenizer: object whose ``encode(text)`` result exposes ``.ids`` and
            ``.offsets``, each with one leading and one trailing special-token
            entry (both are stripped here).
        max_len: length of every returned sequence.

    Returns:
        dict with keys ``ids``, ``mask``, ``token_type_ids``, ``offsets`` (lists of
        length ``max_len``), ``targets_start`` / ``targets_end`` (positions of the
        first / last token overlapping the selection, within ``ids``), and the
        untouched ``orig_tweet``, ``orig_selected`` and ``sentiment``.

    Raises:
        KeyError: if ``sentiment`` is not one of the three known labels.
    """
    # Character span [ind_start, ind_end] of the first occurrence of the selection.
    ind_start = whole_tweet.find(extracted_tweet) if extracted_tweet else -1
    if ind_start < 0:
        ind_start, ind_end = 0, 0
    else:
        ind_end = ind_start + len(extracted_tweet) - 1

    # Character-level mask: 1 for characters inside the selection.
    binary_targets = [0] * len(whole_tweet)
    binary_targets[ind_start:ind_end + 1] = [1] * (ind_end + 1 - ind_start)

    # Token ids and (start, end) character offsets, without the special tokens
    # the tokenizer adds at both ends.
    tokenized_tweet = tokenizer.encode(whole_tweet)
    tweet_ids = list(tokenized_tweet.ids[1:-1])
    tweet_offset = list(tokenized_tweet.offsets[1:-1])

    # Reserve four positions for [CLS], the sentiment id and the two [SEP]s.
    max_tweet_tokens = max(max_len - 4, 0)
    tweet_ids = tweet_ids[:max_tweet_tokens]
    tweet_offset = tweet_offset[:max_tweet_tokens]

    # Indices of the tweet tokens that overlap the selected characters.
    pos_target_ind = [
        i for i, (offstart, offend) in enumerate(tweet_offset)
        if any(binary_targets[offstart:offend])
    ]
    if not pos_target_ind:
        # No kept token overlaps the selection (e.g. it was truncated away):
        # fall back to the whole kept tweet rather than failing on [0] below.
        pos_target_ind = [0, max(len(tweet_ids) - 1, 0)]

    # Ids used to encode the sentiment as a single input token.
    # NOTE(review): the original carried a commented-out alternative
    # {'positive': 3893, 'negative': 4997, 'neutral': 8699}, presumably the
    # vocabulary ids of the words themselves; confirm before switching, since
    # it changes what the model sees.
    sentiment_id = {'positive': 5000, 'negative': 6000, 'neutral': 7000}

    # 101 / 102 are used as the [CLS] / [SEP] ids.
    bert_input_ids = [101, sentiment_id[sentiment], 102] + tweet_ids + [102]
    token_type = [0] * 3 + [1] * (len(tweet_ids) + 1)
    masks = [1] * len(token_type)
    tweet_offset = [(0, 0)] * 3 + tweet_offset + [(0, 0)]

    # Shift by the three prefix tokens so the targets index into bert_input_ids.
    target_pos_ind_start = pos_target_ind[0] + 3
    target_pos_ind_end = pos_target_ind[-1] + 3

    # Pad every sequence up to max_len.
    pad_quantity = max_len - len(bert_input_ids)
    if pad_quantity > 0:
        bert_input_ids += [0] * pad_quantity
        masks += [0] * pad_quantity
        token_type += [0] * pad_quantity
        tweet_offset += [(0, 0)] * pad_quantity

    return {
        'ids': bert_input_ids,
        'mask': masks,
        'token_type_ids': token_type,
        'targets_start': target_pos_ind_start,
        'targets_end': target_pos_ind_end,
        'orig_tweet': whole_tweet,
        'orig_selected': extracted_tweet,
        'sentiment': sentiment,
        'offsets': tweet_offset
    }

## Data Loader

In [None]:
class TweetDataset:
    """Map-style dataset that turns tweets into model-ready tensors on access.

    Args:
        tweet: indexable collection of tweet texts.
        sentiment: indexable collection of sentiment labels, aligned with ``tweet``.
        selected_text: indexable collection of selected spans, aligned with ``tweet``.
        tokenizer: tokenizer passed on to ``processing``; defaults to the
            notebook-wide ``TOKENIZER``.
        max_len: sequence length passed on to ``processing`` (default 128).
    """
    def __init__(self, tweet, sentiment, selected_text, tokenizer=None, max_len=128):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        # The global is only looked up when no tokenizer is supplied.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Return example ``item`` as a dict of tensors plus the raw strings."""
        data = processing(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        # Numeric fields become long tensors; the strings are passed through
        # unchanged so predictions can be decoded back to text later.
        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }


## MODEL

In [None]:
class ExtractSentiModel(transformers.BertPreTrainedModel):
    """Pretrained BERT backbone plus a linear layer scoring each token as span start / span end.

    ``conf`` must have ``output_hidden_states=True``: ``forward`` reads the
    per-layer hidden states of the backbone.
    """
    def __init__(self, conf):
        super(ExtractSentiModel, self).__init__(conf)

        # Pretrained backbone.
        self.bertmodel = transformers.BertModel.from_pretrained(BERT_PATH, config=conf)
        # Dropout in front of the head to reduce overfitting.
        self.dropout = nn.Dropout(0.1)
        # Input: the last two hidden layers concatenated (2 * hidden_size).
        # Output: two values per token, the start logit and the end logit.
        self.lay0 = nn.Linear(conf.hidden_size * 2, 2)
        torch.nn.init.normal_(self.lay0.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each of shape (batch, sequence length)."""
        outputs = self.bertmodel(ids, attention_mask=mask, token_type_ids=token_type_ids)

        # Concatenate the last two hidden layers: bs x SL x (2 * hidden_size)
        output = torch.cat((outputs.hidden_states[-1], outputs.hidden_states[-2]), dim=-1)

        output = self.dropout(output)  # bs x SL x (2 * hidden_size)

        logits = self.lay0(output)  # bs x SL x 2
        start_logits, end_logits = logits.split(1, dim=-1)

        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also remove the batch dimension for a batch of one and break the
        # per-example loss and softmax(dim=1) downstream.
        start_logits = start_logits.squeeze(-1)  # bs x SL
        end_logits = end_logits.squeeze(-1)  # bs x SL

        return start_logits, end_logits

## Training

In [None]:
def training(data_dic, test_loader, es, model, optimizer, device, scheduler=None,
             epochs=5, model_path="model.bin"):
    """Train ``model``, evaluating and checkpointing after every epoch.

    Args:
        data_dic: iterable of training batches (e.g. a DataLoader). Each batch is
            a dict with keys ``ids``, ``token_type_ids``, ``mask``,
            ``targets_start``, ``targets_end``, ``offsets`` (tensors) and
            ``orig_tweet``, ``orig_selected``, ``sentiment`` (sequences of strings).
        test_loader: batches passed to ``eval_fn`` after each epoch.
        es: early-stopping callable, invoked as ``es(score, model, model_path=...)``
            and exposing ``early_stop``.
        model: called as ``model(ids=..., mask=..., token_type_ids=...)``; returns
            start and end logits.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: optional learning-rate scheduler stepped once per batch.
        epochs: maximum number of epochs (default 5).
        model_path: checkpoint path handed to ``es`` (default ``"model.bin"``).

    Returns:
        list with the mean training loss of every epoch that was run.
    """
    loss_vals = []
    # The loss module holds no state, so one instance serves every step.
    loss_func = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()

        losss = Value_DataBase()
        jacs = Value_DataBase()
        epoch_loss = []

        progress = tqdm(data_dic, total=len(data_dic))
        for dic in progress:
            # Move the batch to the target device.
            input_ids = dic["ids"].to(device, dtype=torch.long)
            tok_type = dic['token_type_ids'].to(device, dtype=torch.long)
            mask = dic["mask"].to(device, dtype=torch.long)
            tar_start = dic["targets_start"].to(device, dtype=torch.long)
            tar_end = dic["targets_end"].to(device, dtype=torch.long)

            # Reset gradients, run the forward pass, backpropagate.
            model.zero_grad()
            start_logit, end_logit = model(ids=input_ids, mask=mask, token_type_ids=tok_type)
            loss = loss_func(start_logit, tar_start) + loss_func(end_logit, tar_end)
            loss.backward()
            epoch_loss.append(loss.item())

            optimizer.step()
            # The scheduler is optional (its default is None).
            if scheduler is not None:
                scheduler.step()

            # Probabilities on the CPU for decoding the predicted spans.
            start_prob = torch.softmax(start_logit, dim=1).cpu().detach().numpy()
            end_prob = torch.softmax(end_logit, dim=1).cpu().detach().numpy()
            offsets = dic["offsets"].numpy()

            # Jaccard score of every example in the batch.
            jac_score = [
                calculate_jaccard_score(
                    original_tweet=dic['orig_tweet'][i],
                    target_string=dic["orig_selected"][i],
                    sentiment_val=dic["sentiment"][i],
                    idx_start=np.argmax(start_prob[i, :]),
                    idx_end=np.argmax(end_prob[i, :]),
                    offsets=offsets[i],
                )[0]
                for i in range(len(dic['orig_tweet']))
            ]

            jacs.update(np.mean(jac_score), input_ids.size(0))
            losss.update(loss.item(), input_ids.size(0))

            # Show the running averages on the progress bar (as eval_fn does)
            # instead of printing one line per batch.
            progress.set_postfix(loss=losss.average, jaccard=jacs.average)

        # An empty loader yields NaN rather than a ZeroDivisionError.
        loss_vals.append(sum(epoch_loss) / len(epoch_loss) if epoch_loss else float("nan"))

        # Named val_jaccard so it does not shadow the jaccard() function.
        val_jaccard = eval_fn(test_loader, model, device)
        print(f"Jaccard Score = {val_jaccard}")
        es(val_jaccard, model, model_path=model_path)
        if es.early_stop:
            print("Early stopping")
            break
    return loss_vals

In [None]:
def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets, verbose=False):
    """Decode a predicted token span to text and score it against the target.

    Args:
        original_tweet: full tweet text.
        target_string: reference selected text (stripped before scoring).
        sentiment_val: sentiment label of the tweet.
        idx_start: index of the predicted first token within ``offsets``.
        idx_end: index of the predicted last token; raised to ``idx_start`` if smaller.
        offsets: per-token (start, end) character offsets into ``original_tweet``.
        verbose: not used.

    Returns:
        ``(jaccard score, predicted string)``. For neutral tweets and tweets with
        fewer than two words the whole tweet is returned as the prediction.
    """
    reference = target_string.strip()

    # Neutral or very short tweets: predict the entire tweet.
    if len(original_tweet.split()) < 2 or sentiment_val == "neutral":
        return jaccard(reference, original_tweet.strip()), original_tweet

    # An end index before the start index is clamped to the start (edge case).
    last_idx = max(idx_end, idx_start)
    n_tokens = len(offsets)

    pieces = []
    for tok in range(idx_start, last_idx + 1):
        tok_begin = offsets[tok][0]
        tok_stop = offsets[tok][1]
        pieces.append(original_tweet[tok_begin:tok_stop])
        # If this token does not end where the next one begins, they were
        # separated in the tweet, so put a space between them.
        if tok + 1 < n_tokens and tok_stop != offsets[tok + 1][0]:
            pieces.append(" ")
    predict_out = "".join(pieces)

    return jaccard(reference, predict_out.strip()), predict_out


def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` and return the sample-weighted mean Jaccard score.

    The model is switched to evaluation mode and gradients are disabled. The
    running loss and Jaccard averages are shown on the progress bar, and the
    final Jaccard average is printed before being returned.
    """
    model.eval()
    loss_meter = Value_DataBase()
    jaccard_meter = Value_DataBase()
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        progress = tqdm(data_loader, total=len(data_loader))
        for batch in progress:
            tweets = batch["orig_tweet"]
            selections = batch["orig_selected"]
            sentiments = batch["sentiment"]
            offsets = batch["offsets"].numpy()

            # Tensors go to the target device for the forward pass.
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets_start = batch["targets_start"].to(device, dtype=torch.long)
            targets_end = batch["targets_end"].to(device, dtype=torch.long)

            # Start / end logits and the summed cross-entropy loss.
            start_logits, end_logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = criterion(start_logits, targets_start) + criterion(end_logits, targets_end)

            # Softmax over the sequence dimension, moved to the CPU as arrays.
            start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

            # One Jaccard score per tweet in the batch.
            batch_scores = []
            for row, tweet in enumerate(tweets):
                score, _ = calculate_jaccard_score(
                    original_tweet=tweet,
                    target_string=selections[row],
                    sentiment_val=sentiments[row],
                    idx_start=np.argmax(start_probs[row, :]),
                    idx_end=np.argmax(end_probs[row, :]),
                    offsets=offsets[row]
                )
                batch_scores.append(score)

            # Weight both running averages by the batch size.
            batch_len = ids.size(0)
            jaccard_meter.update(np.mean(batch_scores), batch_len)
            loss_meter.update(loss.item(), batch_len)
            progress.set_postfix(loss=loss_meter.average, jaccard=jaccard_meter.average)

    print("Jaccard = "+ str(jaccard_meter.average))
    return jaccard_meter.average

In [None]:
loss_vals = []
def run_model():
    """Build the data loaders, model, optimizer and scheduler, then train.

    The first 22000 rows (after dropping rows with a missing ``text`` or
    ``selected_text``) are used for training and the remainder for validation,
    roughly an 80/20 split.

    Returns:
        the module-level ``loss_vals`` list, to which the list of per-epoch mean
        training losses returned by ``training`` has been appended.
    """
    # Number of epochs training() runs; the scheduler length depends on it.
    n_epochs = 5

    data = pd.read_csv(TRAINING_FILE)
    data = data.dropna(subset=['text', 'selected_text'])
    data_train = data.iloc[:22000]
    data_test = data.iloc[22000:]

    train_instance = TweetDataset(
        tweet=data_train['text'].values,
        sentiment=data_train['sentiment'].values,
        selected_text=data_train['selected_text'].values,
    )
    test_instance = TweetDataset(
        tweet=data_test['text'].values,
        sentiment=data_test['sentiment'].values,
        selected_text=data_test['selected_text'].values,
    )

    # Shuffle the training data every epoch so batches are not always made of
    # the same rows in file order; the validation loader keeps a fixed order.
    train_loader = torch.utils.data.DataLoader(
        train_instance, batch_size=batch_size, shuffle=True, num_workers=4
    )
    test_loader = torch.utils.data.DataLoader(
        test_instance, batch_size=VALID_BATCH_SIZE, num_workers=2
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The model concatenates the last two hidden layers, so the backbone must
    # return all hidden states.
    model_config = transformers.BertConfig.from_pretrained(BERT_PATH)
    model_config.output_hidden_states = True

    model = ExtractSentiModel(conf=model_config)
    model.to(device)

    # One scheduler step is taken per batch. len(train_loader) counts the
    # final partial batch, so the schedule reaches zero exactly on the last
    # step instead of a couple of steps early.
    n_training_steps = len(train_loader) * n_epochs

    # Learning rate 0.0001: anything larger was found to overshoot.
    optimizer = AdamW(model.parameters(), lr=0.0001)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=n_training_steps
    )

    # Stop once the validation score has not improved for 2 epochs.
    es = EarlyStopping(patience=2, mode="max")

    loss_vals.append(training(train_loader, test_loader, es, model, optimizer, device, scheduler=scheduler))
    return loss_vals


In [None]:
# Run the whole training pipeline; rebinds the global `loss_vals` to the list
# returned by run_model().
loss_vals = run_model()


  cpuset_checked))
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/344 [00:00<?, ?it/s]

loss_avg: 9.548194885253906  Jaccard_avg: 0.45000171929519756
loss_avg: 8.412696838378906  Jaccard_avg: 0.5174686826232342
loss_avg: 7.629729588826497  Jaccard_avg: 0.5261553863954821
loss_avg: 6.994774580001831  Jaccard_avg: 0.5547013004981551
loss_avg: 6.573116874694824  Jaccard_avg: 0.5563684617600574
loss_avg: 6.161499977111816  Jaccard_avg: 0.5507763147813726
loss_avg: 5.837836469922747  Jaccard_avg: 0.5666614924511081
loss_avg: 5.611604988574982  Jaccard_avg: 0.5572538258379042
loss_avg: 5.3591081566280785  Jaccard_avg: 0.5637904358699694
loss_avg: 5.168366646766662  Jaccard_avg: 0.559773387536923
loss_avg: 5.034110220995816  Jaccard_avg: 0.559664949348252
loss_avg: 4.932930827140808  Jaccard_avg: 0.5584189962943347
loss_avg: 4.806537224696233  Jaccard_avg: 0.5581632154968816
loss_avg: 4.685731836727688  Jaccard_avg: 0.5615850321740798
loss_avg: 4.592257738113403  Jaccard_avg: 0.5611489752594713
loss_avg: 4.489658698439598  Jaccard_avg: 0.5604940985684945
loss_avg: 4.439952443627

  0%|          | 0/343 [00:00<?, ?it/s]

Jaccard = 0.6936444555573447
Jaccard Score = 0.6936444555573447
Validation score improved from -infto 0.6936444555573447


  0%|          | 0/344 [00:00<?, ?it/s]

loss_avg: 1.9385682344436646  Jaccard_avg: 0.6971111787518038
loss_avg: 1.7143351435661316  Jaccard_avg: 0.7512617525703463
loss_avg: 1.6218727032343547  Jaccard_avg: 0.7132085752788878
loss_avg: 1.5986284911632538  Jaccard_avg: 0.7216016358811026
loss_avg: 1.5670389652252197  Jaccard_avg: 0.7240953757142616
loss_avg: 1.584448556105296  Jaccard_avg: 0.7233546623339423
loss_avg: 1.5580445187432426  Jaccard_avg: 0.7250431489231955
loss_avg: 1.5677815824747086  Jaccard_avg: 0.7189706670141452
loss_avg: 1.600290338198344  Jaccard_avg: 0.7072069371474942
loss_avg: 1.6006627798080444  Jaccard_avg: 0.7039335400597289
loss_avg: 1.608488299629905  Jaccard_avg: 0.702573377212242
loss_avg: 1.6566847562789917  Jaccard_avg: 0.6945475167099656
loss_avg: 1.6754049888023963  Jaccard_avg: 0.6880325100827034
loss_avg: 1.6853666816438948  Jaccard_avg: 0.6856507840417001
loss_avg: 1.6886051177978516  Jaccard_avg: 0.6864812366803571
loss_avg: 1.6779517829418182  Jaccard_avg: 0.6841945890809715
loss_avg: 1.

  0%|          | 0/343 [00:00<?, ?it/s]

Jaccard = 0.6955029170358776
Jaccard Score = 0.6955029170358776
Validation score improved from 0.6936444555573447to 0.6955029170358776


  0%|          | 0/344 [00:00<?, ?it/s]

loss_avg: 1.470123291015625  Jaccard_avg: 0.7551403891247641
loss_avg: 1.3796153664588928  Jaccard_avg: 0.7860024242250805
loss_avg: 1.280098597208659  Jaccard_avg: 0.7666654338138713
loss_avg: 1.2307603359222412  Jaccard_avg: 0.7710947155011354
loss_avg: 1.2163466453552245  Jaccard_avg: 0.7622316751786861
loss_avg: 1.1973779400189717  Jaccard_avg: 0.7681867472050895
loss_avg: 1.1796379259654455  Jaccard_avg: 0.7770863500177357
loss_avg: 1.1704024821519852  Jaccard_avg: 0.7729100056702807
loss_avg: 1.226081954108344  Jaccard_avg: 0.7612587471237833
loss_avg: 1.2250982165336608  Jaccard_avg: 0.7617877583241033
loss_avg: 1.212345307523554  Jaccard_avg: 0.7635182922524528
loss_avg: 1.268597036600113  Jaccard_avg: 0.7548835679094129
loss_avg: 1.2879172746951764  Jaccard_avg: 0.7501276026668725
loss_avg: 1.2866841980389185  Jaccard_avg: 0.7518249058429555
loss_avg: 1.289505918820699  Jaccard_avg: 0.755185388310568
loss_avg: 1.2822233363986015  Jaccard_avg: 0.7545492207725337
loss_avg: 1.297

  0%|          | 0/343 [00:00<?, ?it/s]

Jaccard = 0.6954575971759697
Jaccard Score = 0.6954575971759697
EarlyStopping counter: 1 out of 2


  0%|          | 0/344 [00:00<?, ?it/s]

loss_avg: 0.7838442921638489  Jaccard_avg: 0.8630896013708513
loss_avg: 0.8007495105266571  Jaccard_avg: 0.8887323006854257
loss_avg: 0.8082037170728048  Jaccard_avg: 0.8540788089225589
loss_avg: 0.7748519033193588  Jaccard_avg: 0.8461963120382787
loss_avg: 0.7990685701370239  Jaccard_avg: 0.8351643458066691
loss_avg: 0.8150940239429474  Jaccard_avg: 0.8338026670202282
loss_avg: 0.8061433434486389  Jaccard_avg: 0.8418398411942182
loss_avg: 0.7944857254624367  Jaccard_avg: 0.8418406003577115
loss_avg: 0.8175704545444913  Jaccard_avg: 0.8377310041298945
loss_avg: 0.8290010154247284  Jaccard_avg: 0.8344577806892686
loss_avg: 0.8163488507270813  Jaccard_avg: 0.8348889249581586
loss_avg: 0.8468711922566096  Jaccard_avg: 0.828044230041288
loss_avg: 0.855420456482814  Jaccard_avg: 0.8245487109827884
loss_avg: 0.8652968278953007  Jaccard_avg: 0.8252490789328427
loss_avg: 0.8558443347613017  Jaccard_avg: 0.8287216436441981
loss_avg: 0.8491580076515675  Jaccard_avg: 0.826641111285303
loss_avg: 0

  0%|          | 0/343 [00:00<?, ?it/s]

Jaccard = 0.6870560589962181
Jaccard Score = 0.6870560589962181
EarlyStopping counter: 2 out of 2
Early stopping


In [None]:
# Display the per-epoch mean training losses collected by run_model().
loss_vals

[[2.0865504339683887,
  1.4816334727545117,
  1.0446251282511756,
  0.6632374324510957]]

In [None]:
# Load the test tweets. They carry no selected text, so the full tweet is
# copied into a `selected_text` column for the dataset pipeline to read.
df_test = pd.read_csv("tweet-sentiment-extraction/test.csv")
df_test["selected_text"] = df_test["text"].to_numpy()

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise (the
# same rule run_model() applies), so inference also runs on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_config = transformers.BertConfig.from_pretrained(BERT_PATH)
# The model reads the per-layer hidden states of the backbone.
model_config.output_hidden_states = True

In [None]:
model1 = ExtractSentiModel(conf=model_config)
model1.to(device)
# Load the checkpoint written during training (MODEL_PATH, "model.bin").
# NOTE(review): this cell originally loaded "model_0.bin", a file nothing in
# this notebook writes; confirm no external checkpoint was intended.
# map_location lets weights saved on a GPU load onto whatever `device` is.
model1.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Trailing semicolon keeps the notebook from printing the full module tree.
model1.eval();

In [None]:
# Predict the selected text for every test tweet, in file order.
final_output = []

test_dataset = TweetDataset(
    tweet=df_test.text.values,
    sentiment=df_test.sentiment.values,
    selected_text=df_test.selected_text.values,
)

# shuffle=False keeps predictions aligned with the rows of the test file.
data_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=VALID_BATCH_SIZE, num_workers=1)

with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"].numpy()

        # Only the model inputs are moved to the device. The targets in the
        # batch come from the placeholder selected_text and are not used.
        ids = d["ids"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)

        outputs_start, outputs_end = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
            # Element [1] of the result is the decoded text; the score in [0]
            # is computed against the placeholder target and is ignored.
            output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=orig_selected[px],
                sentiment_val=sentiment[px],
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )[1]
            final_output.append(output_sentence)

In [None]:
# Attach the predictions to the test rows and write the submission file.
sample = pd.read_csv("tweet-sentiment-extraction/test.csv")
# Fail with a clear message if the prediction cell was not run on this file.
assert len(final_output) == len(sample), "expected exactly one prediction per test row"
sample.loc[:, 'selected_text'] = final_output
sample.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
sample.head()

In [None]:
# Reload the raw training CSV and print its column summary.
data1 = pd.read_csv("tweet-sentiment-extraction/train.csv")
data1.info()

In [None]:
# Print the column summary of the submission frame.
sample.info()