In [26]:
!pip install -q transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [27]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from time import time
from torch.utils.data import DataLoader
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
import torch.nn as nn

In [28]:

# Load the train and test CSV files of the tweet-sentiment-extraction dataset.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')


In [29]:

# Encode the string "sentiment" column as integer class ids and store them in a new "label" column.
# `le` is kept as a global because it is used later to map predicted ids back to names.
le = LabelEncoder()
train_y = le.fit_transform(train["sentiment"].values)
train["label"] = train_y

In [30]:
# Inspect column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
 4   label          27481 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.0+ MB


In [31]:
# Show the fitted classes; format_inference indexes this array with the predicted label id.
le.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

In [32]:
# Drop rows with missing values and rebuild a contiguous 0..n-1 index, so that the
# positional indices produced by KFold.split further down match the frame's labels.
train = train.dropna().reset_index(drop=True)

In [33]:
class CFG:
    """Global experiment configuration, read as class attributes throughout the notebook."""

    # Hugging Face checkpoint used for both the tokenizer and the encoder.
    model_name = "microsoft/deberta-base"
    batch_size = 2
    # Mixed-precision switch (previously assigned twice in this class).
    apex = True
    debug = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): not read by the visible code; the tokenizer calls hard-code max_length=100.
    max_length = 512
    dropout = 0.1
    folds = 5
    # Number of batches per optimizer update.
    gradient_accumulation = 5
    # Scheduler name understood by get_scheduler: "cosine" or "linear".
    scheduler = "cosine"
    num_warmup_steps = 0
    epochs = 10

In [34]:
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [35]:
def get_scheduler(CFG, optimizer, num_training_steps):
    """Build the learning-rate scheduler selected by ``CFG.scheduler``.

    Args:
        CFG: configuration object exposing ``scheduler`` and ``num_warmup_steps``.
        optimizer: optimizer whose learning rate is scheduled.
        num_training_steps: total number of scheduler steps for the whole run.

    Returns:
        The scheduler, or ``None`` when ``CFG.scheduler`` is empty/``None``.

    Raises:
        ValueError: if ``CFG.scheduler`` names an unsupported schedule.
    """
    name = CFG.scheduler
    if not name:
        # No scheduling requested; callers must skip scheduler.step().
        return None
    if name == "cosine":
        return get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_training_steps)
    if name == "linear":
        return get_linear_schedule_with_warmup(optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=num_training_steps)
    # Previously an unknown name fell through to `return scheduler` and raised UnboundLocalError.
    raise ValueError(f"Unsupported scheduler {name!r}; expected 'cosine' or 'linear'")
        

In [36]:
# Load the tokenizer matching CFG.model_name and attach it to CFG so that
# prepare_input / prepare_input_inference can reach it through their CFG argument.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
CFG.tokenizer = tokenizer

In [37]:
def prepare_input(tweet, sentiment, selected_text, CFG):
  """Tokenize a tweet and locate the tokens that cover ``selected_text``.

  Both strings are whitespace-normalised and given one leading space. The tweet is
  tokenized to a fixed length of 100 and wrapped as
  ``[0, sentiment_id, 2, 2] + tokens + [2]``.

  Args:
    tweet: raw tweet text.
    sentiment: one of ``'positive'``, ``'negative'``, ``'neutral'``.
    selected_text: the span of ``tweet`` to be extracted.
    CFG: configuration object; ``CFG.tokenizer`` is called on the tweet.

  Returns:
    dict with ``input_ids``, ``token_type_ids``, ``attention_mask``,
    ``offset_mapping`` (all of equal length), the normalised ``tweet`` and
    ``selected_text``, ``start_token_indx`` / ``end_token_indx`` expressed as
    positions in ``input_ids``, and ``sentiment``.

  Raises:
    ValueError: if ``selected_text`` is empty, is not contained in ``tweet``, or
      is not covered by any token (e.g. it lies beyond the truncation point).
    KeyError: if ``sentiment`` is not one of the three known values.
  """
  selected_text = " " + " ".join(str(selected_text).split())
  tweet = " " + " ".join(str(tweet).split())

  tokenized_text = CFG.tokenizer(tweet, add_special_tokens=False, padding='max_length', max_length=100, truncation=True, return_offsets_mapping=True)

  # Character span [idx0, idx1] of the first occurrence of the selected text.
  needle = selected_text[1:]
  if not needle:
    raise ValueError("selected_text is empty after whitespace normalisation")
  idx0 = tweet.find(needle)
  if idx0 < 0:
    raise ValueError(f"selected_text {needle!r} was not found in tweet {tweet!r}")
  idx1 = idx0 + len(needle) - 1

  # A token belongs to the target when its character range overlaps the span.
  # Padding tokens have an empty (0, 0) range and never match.
  target_tokens_idx = [
      i for i, (start_char, end_char) in enumerate(tokenized_text["offset_mapping"])
      if start_char < end_char and start_char <= idx1 and end_char > idx0
  ]
  if not target_tokens_idx:
    raise ValueError(f"no token covers selected_text {needle!r} in tweet {tweet!r}")

  # Four tokens are prepended below, so positions must be shifted to stay aligned
  # with input_ids / offset_mapping (the unshifted values pointed 4 tokens too early).
  prefix_len = 4
  start_indx = target_tokens_idx[0] + prefix_len
  end_indx = target_tokens_idx[-1] + prefix_len

  # Hard-coded vocabulary ids for the sentiment words.
  # NOTE(review): confirm these ids (and the 0 / 2 special ids) match CFG.tokenizer's vocabulary.
  sentiments_id = {
    'positive': 1313,
    'negative': 2430,
    'neutral': 7974
  }

  input_ids = [0] + [sentiments_id[sentiment]] + [2] + [2] + tokenized_text["input_ids"] + [2]
  token_type_ids = [0] * prefix_len + tokenized_text["token_type_ids"] + [0]
  # The trailing token follows the padding and is masked out, as in the original code.
  attention_mask = [1] * prefix_len + tokenized_text["attention_mask"] + [0]
  offset_mapping = [(0,0)] * prefix_len + tokenized_text["offset_mapping"] + [(0,0)]

  data =  {
      "input_ids" : input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_mask,
      "offset_mapping": offset_mapping,
      "tweet" : tweet,
      "selected_text": selected_text,
      "start_token_indx": start_indx,
      "end_token_indx": end_indx,
      "sentiment" : sentiment
  }

  return data


In [38]:
from pandas.io.xml import preprocess_data
class TextDataset(torch.utils.data.Dataset):
  """Map-style dataset that turns rows of the training frame into model-ready tensors."""

  def __init__(self, train_df, CFG):
    """Keep the needed columns of ``train_df`` as arrays, plus the config passed to prepare_input."""
    self.CFG = CFG
    self.texts, self.sentiments, self.selected_texts, self.labels = (
        train_df[column].values
        for column in ("text", "sentiment", "selected_text", "label")
    )

  def __len__(self):
    """Number of rows in the dataset."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the prepared sample at position ``idx`` as a dict of tensors and strings."""
    sample = prepare_input(self.texts[idx], self.sentiments[idx], self.selected_texts[idx], self.CFG)

    def as_long(values):
      # All numeric fields are stored as int64 tensors.
      return torch.tensor(values, dtype=torch.long)

    return {
        "input_ids": as_long(sample["input_ids"]),
        "token_type_ids": as_long(sample["token_type_ids"]),
        "attention_mask": as_long(sample["attention_mask"]),
        "offset_mapping": as_long(sample["offset_mapping"]),
        "origin_tweet": sample["tweet"],
        "origin_selected_text": sample["selected_text"],
        "start_token_indx": as_long(sample["start_token_indx"]),
        "end_token_indx": as_long(sample["end_token_indx"]),
        "sentiment": sample["sentiment"],
        "label": torch.tensor(self.labels[idx], dtype=torch.int64),
    }

In [39]:
class TweetModel(torch.nn.Module):
  """Pretrained encoder with a span head (start/end logits per token) and a 3-way classification head."""

  def __init__(self, CFG):
    """Load the encoder named by ``CFG.model_name`` and create the dropout layer and both linear heads."""
    super().__init__()
    self.config = AutoConfig.from_pretrained(CFG.model_name)
    self.model = AutoModel.from_pretrained(CFG.model_name)
    self.dropout = nn.Dropout(CFG.dropout)
    hidden_size = self.config.hidden_size
    self.fc = nn.Linear(hidden_size, 2)
    self.cls = nn.Linear(hidden_size, 3)

  def forward(self, data):
    """Run the encoder on ``data`` (needs ``input_ids``, ``attention_mask``, ``token_type_ids``).

    Returns:
      ``(start_logits, end_logits, class_logit)``; the first two have one value per
      token position, the last comes from the hidden state at position 0.
    """
    encoder_out = self.model(
        input_ids=data["input_ids"],
        attention_mask=data["attention_mask"],
        token_type_ids=data["token_type_ids"],
    )
    hidden = self.dropout(encoder_out[0])

    # The span head emits two channels per token: channel 0 = start, channel 1 = end.
    start_logits, end_logits = self.fc(hidden).unbind(dim=-1)

    class_logit = self.cls(hidden[:, 0, :])
    return start_logits, end_logits, class_logit


In [40]:
class AverageMeter:
  """Running weighted average: remembers the latest value, the weighted sum, the count and the mean."""

  def __init__(self):
    self.reset()

  def reset(self):
    """Zero every statistic."""
    self.val = self.sum = self.avg = self.count = 0

  def update(self, val, n=1):
    """Record ``val`` as observed ``n`` times and refresh the running mean."""
    self.val = val
    self.count += n
    self.sum += val * n
    self.avg = self.sum / self.count


In [41]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings (case-insensitive).

    Args:
        str1: first string.
        str2: second string.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` over the sets of whitespace-separated lowercase words.
        Two strings without any words are treated as identical and score 1.0
        (this case previously raised ZeroDivisionError).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [42]:
def format_prediction(original_tweet, target_string, sentiment_val, offset, predicted_start_token__, predicted_end_token__):
  """Turn predicted start/end token positions into text and score it against the target.

  Args:
    original_tweet: the normalised tweet the offsets refer to.
    target_string: ground-truth selected text.
    sentiment_val: sentiment of the tweet; ``"neutral"`` tweets are returned whole.
    offset: per-token ``(start_char, end_char)`` pairs, indexable by token position.
    predicted_start_token__: predicted index of the first token.
    predicted_end_token__: predicted index of the last token (inclusive).

  Returns:
    ``(jaccard_score, predicted_text, predicted_text)`` — the text is returned
    twice to keep the three-value shape existing callers unpack.
  """
  # An inverted span collapses to the single start token.
  if predicted_start_token__ > predicted_end_token__:
    predicted_end_token__ = predicted_start_token__

  predicted_selected_text = " "
  for ix in range(predicted_start_token__, predicted_end_token__+1):
    start_char = offset[ix][0]
    end_char = offset[ix][1]
    predicted_selected_text += original_tweet[start_char:end_char]
    # Re-insert the separator when there is a character gap before the NEXT token.
    # The old check compared a token's end with its own start and so never fired,
    # which glued consecutive words together.
    if ix < predicted_end_token__ and (ix+1) < len(offset) and offset[ix][1] < offset[ix+1][0]:
      predicted_selected_text += " "

  # Neutral or single-word tweets are answered with the whole tweet. split() is used
  # instead of split(" ") because the tweet carries a leading space, which made the
  # old word count at least 2 for every tweet.
  if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
    predicted_selected_text = original_tweet

  jac_score = jaccard(target_string.strip(), predicted_selected_text.strip())
  return jac_score, predicted_selected_text, predicted_selected_text

In [51]:
from itertools import starmap
def training_epoch(epoch, model, optimizer, criterion, train_dl, scheduler):
    """Train ``model`` for one pass over ``train_dl`` with mixed precision and gradient accumulation.

    Args:
        epoch: index of the current epoch (not used inside the function).
        model: model returning ``(start_logits, end_logits, class_logit)``.
        optimizer: optimizer updated once per accumulation window.
        criterion: loss applied to the start, end and class outputs.
        train_dl: iterable of batches shaped like ``TextDataset`` items.
        scheduler: learning-rate scheduler stepped after each optimizer update, or ``None``.

    Returns:
        The sample-weighted average of the (unscaled) total loss over the epoch.
    """
    use_amp = bool(getattr(CFG, "apex", True))
    # Guard against 0/negative values, which previously caused a modulo-by-zero.
    accumulation_steps = max(1, int(CFG.gradient_accumulation))

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    model.train()
    loss_val = AverageMeter()

    try:
        num_batches = len(train_dl)
    except TypeError:
        # Loader without a length: the trailing partial window cannot be detected.
        num_batches = None

    optimizer.zero_grad()
    for step, data in enumerate(train_dl):
        input_ids = data["input_ids"].to(CFG.device)
        token_type_ids = data["token_type_ids"].to(CFG.device)
        attention_mask = data["attention_mask"].to(CFG.device)
        start_token_indx = data["start_token_indx"].to(CFG.device)
        end_token_indx = data["end_token_indx"].to(CFG.device)
        label = data["label"].to(CFG.device)
        batch_size = end_token_indx.shape[0]

        with torch.cuda.amp.autocast(enabled=use_amp):
            predicted_start_token, predicted_end_token, class_logit = model({"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": attention_mask})

            start_loss = criterion(predicted_start_token, start_token_indx)
            end_loss = criterion(predicted_end_token, end_token_indx)
            cls_loss = criterion(class_logit, label)
            total_loss = start_loss + end_loss + cls_loss

        loss_val.update(total_loss.item(), batch_size)

        # Accumulate gradients on EVERY batch. The old code called backward() only on
        # every Nth step, so the other batches never contributed to training.
        scaler.scale(total_loss / accumulation_steps).backward()

        window_full = (step + 1) % accumulation_steps == 0
        last_batch = num_batches is not None and (step + 1) == num_batches
        if window_full or last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()

    return loss_val.avg




def valid_epoch(epoch, model, criterion, valid_dl):
    """Evaluate ``model`` on ``valid_dl``.

    Returns:
        ``(average loss, average Jaccard score, array of predicted texts)``; both
        averages are weighted by batch size. ``epoch`` is accepted but not used.
    """
    model.eval()
    loss_meter, jaccard_meter = AverageMeter(), AverageMeter()
    collected_texts = []

    for batch in valid_dl:
        model_inputs = {
            "input_ids": batch["input_ids"].to(CFG.device),
            "token_type_ids": batch["token_type_ids"].to(CFG.device),
            "attention_mask": batch["attention_mask"].to(CFG.device),
        }
        gold_start = batch["start_token_indx"].to(CFG.device)
        gold_end = batch["end_token_indx"].to(CFG.device)
        offsets = batch["offset_mapping"]
        tweets = batch["origin_tweet"]
        gold_texts = batch["origin_selected_text"]
        sentiments = batch["sentiment"]
        gold_label = batch["label"].to(CFG.device)
        n_rows = gold_end.shape[0]

        with torch.no_grad():
            start_logits, end_logits, class_logit = model(model_inputs)

        batch_loss = criterion(start_logits, gold_start) + criterion(end_logits, gold_end) + criterion(class_logit, gold_label)

        start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
        end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
        # Computed as in the original cell; the class probabilities are not used below.
        label_probs = torch.softmax(class_logit, dim=1).cpu().detach().numpy()

        batch_scores = []
        for row, tweet in enumerate(tweets):
            best_start = np.argmax(start_probs[row, :])
            best_end = np.argmax(end_probs[row, :])
            row_score, _, row_text = format_prediction(tweet, gold_texts[row], sentiments[row], offsets[row], best_start, best_end)
            batch_scores.append(row_score)
            collected_texts.append(row_text)

        loss_meter.update(batch_loss.item(), n_rows)
        jaccard_meter.update(np.mean(batch_scores), n_rows)

    return loss_meter.avg, jaccard_meter.avg, np.array(collected_texts)

In [52]:
def train_loop(fold, train, CFG):
    """Train and validate one cross-validation fold.

    Rows whose ``"fold"`` column equals ``fold`` form the validation set; all other
    rows are used for training. The checkpoint with the best validation Jaccard
    score is written to ``OUTPUT_DIR + f"model_fold{fold}.pth"``.

    Args:
        fold: id of the fold held out for validation.
        train: DataFrame with the columns read by ``TextDataset`` plus ``"fold"``.
        CFG: experiment configuration.

    Returns:
        The validation DataFrame with a ``"predicted_selected_text"`` column holding
        the predictions of the best-scoring epoch (``None`` if no epoch was run).
    """
    train_folds = train[train["fold"] != fold].reset_index()
    valid_fold = train[train["fold"] == fold].reset_index()

    train_dataset = TextDataset(train_folds, CFG)
    valid_dataset = TextDataset(valid_fold, CFG)

    # Shuffle the training batches each epoch; validation order must stay aligned with valid_fold.
    train_dl = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
    valid_dl = DataLoader(valid_dataset, batch_size=CFG.batch_size)

    model = TweetModel(CFG).to(CFG.device)
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): lr=1e-3 is kept from the original cell; confirm it is intended for fine-tuning.
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08)

    # The scheduler is stepped once per optimizer update, i.e. once per accumulation
    # window, so the total must be an integer count of updates (not samples / batch_size).
    accumulation_steps = max(1, int(CFG.gradient_accumulation))
    updates_per_epoch = max(1, -(-len(train_dl) // accumulation_steps))  # ceil division
    num_training_steps = updates_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG, optimizer, num_training_steps)

    best_score = -np.inf  # np.Inf was removed in NumPy 2.0
    best_predictions = None
    for epoch in range(CFG.epochs):
        start_time = time()
        avg_train_loss = training_epoch(epoch, model, optimizer, criterion, train_dl, scheduler)
        print(f"avg_train_loss = {avg_train_loss}")
        avg_val_loss, score, predicted_selected_text = valid_epoch(epoch, model, criterion, valid_dl)
        print(f"avg_val_loss = {avg_val_loss}")

        end_time = time()
        elapsed = end_time - start_time
        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if score > best_score:
            best_score = score
            # Keep the predictions that belong to the saved checkpoint.
            best_predictions = predicted_selected_text
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'prediction_selected_texts': predicted_selected_text,
            }, OUTPUT_DIR+f"model_fold{fold}.pth")

    torch.cuda.empty_cache()

    # Previously the last epoch's predictions were returned even when an earlier epoch scored best.
    valid_fold["predicted_selected_text"] = best_predictions
    return valid_fold

In [45]:
# # For Debugging
# CFG.debug  = True
# if CFG.debug == True:
#     CFG.folds = 5
#     train = train.sample(100, replace=True).reset_index(drop=True)
# train.shape
# train = train.reset_index()

In [46]:
from sklearn.model_selection import KFold
# Seeded so that the fold assignment is reproducible across kernel restarts.
kf = KFold(n_splits=CFG.folds, random_state=42, shuffle=True)
train["fold"] = -1
fold_col = train.columns.get_loc("fold")
# KFold.split yields POSITIONAL indices, so assign with .iloc; using .loc is only
# correct when the index is exactly 0..n-1, which is not the case after rows were dropped.
for fold , (train_index, valid_index) in enumerate(kf.split(train["text"])):
    train.iloc[valid_index, fold_col] = fold

In [47]:
# Module-level logger used by get_logger; INFO and above are emitted.
logger = getLogger(__name__)
logger.setLevel(INFO)

In [48]:
OUTPUT_DIR = "./"
def get_logger(filename=OUTPUT_DIR+'train'):
    """Attach a console handler and a file handler (``<filename>.log``) to this module's logger.

    Handlers already attached are removed and closed first, so calling the function
    again (for example by re-running the cell) does not duplicate every log line.

    Args:
        filename: path of the log file without the ``.log`` extension.

    Returns:
        The configured logger (the same object ``getLogger(__name__)`` returns).
    """
    log = getLogger(__name__)

    for stale in list(log.handlers):
        log.removeHandler(stale)
        stale.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
#     handler2.setFormatter(Formatter("%(message)s"))
    log.addHandler(handler1)
    log.addHandler(handler2)
    return log


In [24]:
# Logger used by train_loop; writes to the console and to OUTPUT_DIR + "train.log".
LOGGER = get_logger()


In [1]:



# oof_df = pd.DataFrame()
# for fold in range(CFG.folds):
#     _oof_df = train_loop(fold, train, CFG)
#     oof_df = pd.concat([oof_df, _oof_df])
#     LOGGER.info(f"Training at Fold {fold} has ended !!!")

# oof_df.to_pickle("oof_df.pickle")

# Inference

In [54]:
def prepare_input_inference(tweet, CFG):
    """Tokenize one tweet for inference and return a batch of size 1.

    The tweet is whitespace-normalised and given a leading space, tokenized to a
    fixed length of 100 and wrapped as ``[0, 0, 2, 2] + tokens + [2]``.

    Returns:
        ``(data, tweet)`` where ``data`` maps ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``offset_mapping`` to long tensors on ``CFG.device``
        and ``tweet`` is the normalised text the offsets refer to.
    """
    tweet = " " + " ".join(str(tweet).split())

    encoded = CFG.tokenizer(tweet, add_special_tokens=False, padding='max_length', max_length=100, truncation=True, return_offsets_mapping=True)

    # Four tokens go in front of the tweet tokens and one behind them.
    rows = {
        "input_ids": [0, 0, 2, 2] + encoded["input_ids"] + [2],
        "token_type_ids": [0, 0, 0, 0] + encoded["token_type_ids"] + [0],
        "attention_mask": [1, 1, 1, 1] + encoded["attention_mask"] + [0],
        "offset_mapping": [(0, 0), (0, 0), (0, 0), (0, 0)] + encoded["offset_mapping"] + [(0, 0)],
    }

    # Wrap each row in a list to add the batch dimension.
    data = {
        name: torch.tensor([values], dtype=torch.long).to(CFG.device)
        for name, values in rows.items()
    }
    return data, tweet


In [55]:
def format_inference(original_tweet, offset, predicted_start_token__, predicted_end_token__, predicted_label):
    """Convert predicted token positions and class id into text and a sentiment name.

    Args:
        original_tweet: the normalised tweet the offsets refer to.
        offset: per-token ``(start_char, end_char)`` pairs with a leading batch
            dimension of size 1 (it is squeezed away here).
        predicted_start_token__: predicted index of the first token.
        predicted_end_token__: predicted index of the last token (inclusive).
        predicted_label: predicted class id, looked up in the global ``le.classes_``.

    Returns:
        ``(predicted_selected_text, sentiment_val)``.
    """
    # An inverted span collapses to the single start token.
    if predicted_start_token__ > predicted_end_token__:
        predicted_end_token__ = predicted_start_token__

    # Plain Python ints avoid one tensor read per comparison and slice.
    offsets = offset.squeeze().tolist()

    predicted_selected_text = " "
    for ix in range(predicted_start_token__, predicted_end_token__+1):
        start_char, end_char = offsets[ix][0], offsets[ix][1]
        predicted_selected_text += original_tweet[start_char:end_char]
        # Re-insert the separator when there is a character gap before the NEXT token.
        # The old check compared a token's end with its own start and so never fired.
        if ix < predicted_end_token__ and (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            predicted_selected_text += " "

    sentiment_val = le.classes_[predicted_label]
    # split() rather than split(" "): the tweet starts with a space, which made the
    # old word count at least 2 and disabled the single-word shortcut.
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        predicted_selected_text = original_tweet

    return predicted_selected_text, sentiment_val

In [56]:
# Example tweet used for the single-sample inference demo below.
tweet = "Recession hit Veronique Branquinho, she has to quit her company, such a shame!"

In [58]:
# Tokenize the example; `tweet` is rebound to the normalised text that the returned offsets refer to.
data,tweet = prepare_input_inference(tweet, CFG)

In [60]:
def inference(data, tweet):
    """Predict the selected text and sentiment of one tweet by averaging the fold models.

    Args:
        data: batch of size 1 as returned by ``prepare_input_inference``.
        tweet: the normalised tweet returned by ``prepare_input_inference`` (the
            text ``data["offset_mapping"]`` refers to).

    Returns:
        ``(predicted_selected_text, sentiment_val)`` as produced by ``format_inference``.
    """
    model = TweetModel(CFG).to(CFG.device)
    # Disable dropout; without eval() the predictions are randomly perturbed.
    model.eval()

    start_index_logits = []
    end_index_logits = []
    cls_logits = []
    # The old body overwrote `tweet` with test.iloc[2]["text"], so the offsets in
    # `data` were applied to a different (un-normalised) string.

    with torch.no_grad():
        for fold in range(CFG.folds):
            path = f"./model_fold{fold}.pth"
            # map_location lets checkpoints saved on GPU load on a CPU-only machine.
            # NOTE: torch.load unpickles the file; only load checkpoints you created yourself.
            checkpoint = torch.load(path, map_location=CFG.device)
            model.load_state_dict(checkpoint['model_state_dict'])
            predicted_start_token, predicted_end_token, cls_logit = model(data)
            start_index_logits.append(predicted_start_token)
            end_index_logits.append(predicted_end_token)
            cls_logits.append(cls_logit)

    # Average the logits of all folds before taking the arg-max.
    start_index_logit = torch.mean(torch.stack(start_index_logits), dim=0)
    end_index_logit = torch.mean(torch.stack(end_index_logits), dim=0)
    cls_logit = torch.mean(torch.stack(cls_logits), dim=0)

    predicted_start_token__ = torch.softmax(start_index_logit, dim=1).cpu().detach().numpy()
    predicted_end_token__ = torch.softmax(end_index_logit, dim=1).cpu().detach().numpy()
    predicted_label = torch.softmax(cls_logit, dim=1).cpu().detach().numpy()

    predicted_selected_text = format_inference(tweet, data["offset_mapping"], np.argmax(predicted_start_token__[0, :]), np.argmax(predicted_end_token__[0, :]), np.argmax(predicted_label[0, :]))

    return predicted_selected_text

        
        

In [61]:
# Run the fold ensemble on the example; inference returns a (selected text, sentiment) pair.
predicted_selected_text, sentiment_val = inference(data, tweet)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
# Print the normalised tweet next to the predicted sentiment and extracted span.
print(f"original tweet : {tweet}")
print(f"sentiment of the tweet : {sentiment_val}")
print(f"selected text  :{predicted_selected_text}")

original tweet :  Recession hit Veronique Branquinho, she has to quit her company, such a shame!
sentiment of the tweet : positive
selected text  : shame!
