In [None]:
import transformers
import tokenizers
import torch
import pandas as pd
import numpy as np
import os

In [None]:
class config:
    """Namespace of constants shared by the dataset, model and training cells.

    The attributes are read directly from the class (``config.MAX_LEN`` etc.);
    the class is never instantiated in this notebook.
    """

    # Sequence length every example is padded to, batch sizes, and epoch count.
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 5

    # Locations of the pretrained BERT files, the model output file name and
    # the training CSV.
    BERT_PATH = "../input/bert-base-uncased/"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/tweet-sentiment-extraction/train.csv"

    # Tokenizer is built once, when this class body executes, from the
    # vocab.txt file found under BERT_PATH.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        os.path.join(BERT_PATH, "vocab.txt"),
        lowercase=True,
    )

In [None]:
class TweetDataset:
    """Map-style dataset turning (tweet, sentiment, selected_text) rows into
    fixed-length tensors for span extraction.

    Each item is tokenized with ``config.TOKENIZER`` and padded (or truncated)
    to ``config.MAX_LEN`` so that every returned tensor has the same length.
    """

    def __init__(self,tweet,sentiment,selected_text):
        """Store the three parallel, integer-indexable sequences.

        Args:
            tweet: sequence of raw tweet texts.
            sentiment: sequence of sentiment labels (compared against the
                strings "positive" and "negative").
            selected_text: sequence of the sub-strings of each tweet that
                should be extracted.
        """
        self.tweet=tweet
        self.sentiment=sentiment
        self.selected_text=selected_text
        self.max_len=config.MAX_LEN
        self.tokenizer=config.TOKENIZER

    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweet)

    def __getitem__(self,item):
        """Build the model inputs and span targets for row ``item``.

        Returns:
            dict with long tensors ``ids``, ``mask``, ``token_type_ids``,
            ``targets``, ``target_start``, ``target_end`` (each of length
            ``max_len``), the scalar ``padding_len``, a one-hot ``sentiment``
            tensor, and the original strings for later decoding.
        """
        # Collapse every run of whitespace into a single space.
        tweet=" ".join(str(self.tweet[item]).split())
        selected_text=" ".join(str(self.selected_text[item]).split())

        # First occurrence of the selection inside the tweet (inclusive
        # character span). An empty selection is treated as "not found"
        # instead of indexing selected_text[0].
        idx0=tweet.find(selected_text) if selected_text else -1
        idx1=idx0+len(selected_text)-1 if idx0!=-1 else -1

        # Per-character flags: 1 for non-space characters inside the span.
        char_targets=[0]*len(tweet)
        if idx0!=-1:
            for j in range(idx0,idx1+1):
                if tweet[j]!=" ":
                    char_targets[j]=1

        tok_tweet=self.tokenizer.encode(tweet)
        tok_tweet_tokens=list(tok_tweet.tokens)
        tok_tweet_ids=list(tok_tweet.ids)
        # Drop the first and last offsets, which belong to the special tokens
        # the tokenizer adds around the text.
        tok_tweet_offsets=tok_tweet.offsets[1:-1]

        # A token is a target when any character it covers is flagged.
        targets=[0]*(len(tok_tweet_tokens)-2)
        for j,(offset1,offset2) in enumerate(tok_tweet_offsets):
            if sum(char_targets[offset1:offset2])>0:
                targets[j]=1
        targets=[0]+targets+[0]

        # Truncate over-long sequences to max_len, keeping the final special
        # token, so all tensors keep a uniform length and can be batched.
        if len(tok_tweet_ids)>self.max_len:
            keep=self.max_len-1
            tok_tweet_ids=tok_tweet_ids[:keep]+[tok_tweet_ids[-1]]
            tok_tweet_tokens=tok_tweet_tokens[:keep]+[tok_tweet_tokens[-1]]
            targets=targets[:keep]+[0]

        # One-hot markers for the first and last target token.
        target_start=[0]*len(targets)
        target_end=[0]*len(targets)
        non_zero=np.nonzero(targets)[0]
        if len(non_zero)>0:
            target_start[non_zero[0]]=1
            target_end[non_zero[-1]]=1

        mask=[1]*len(tok_tweet_ids)
        # NOTE(review): real tokens get segment id 1 and padding gets 0; BERT
        # single-sentence inputs conventionally use 0 - confirm this is intended.
        token_type_ids=[1]*len(tok_tweet_ids)

        padding_len=self.max_len-len(tok_tweet_ids)

        # Right-pad every per-token list with zeros up to max_len.
        ids=tok_tweet_ids+[0]*padding_len
        mask=mask+[0]*padding_len
        token_type_ids=token_type_ids+[0]*padding_len
        targets=targets+[0]*padding_len
        target_start=target_start+[0]*padding_len
        target_end=target_end+[0]*padding_len

        # One-hot sentiment: [1,0,0] unless the label is positive/negative.
        sentiment=[1,0,0]
        if self.sentiment[item]=="positive":
            sentiment=[0,0,1]
        if self.sentiment[item]=="negative":
            sentiment=[0,1,0]
        return {
            "ids":torch.tensor(ids,dtype=torch.long),
            "mask":torch.tensor(mask,dtype=torch.long),
            "token_type_ids":torch.tensor(token_type_ids,dtype=torch.long),
            "targets":torch.tensor(targets,dtype=torch.long),
            "target_start":torch.tensor(target_start,dtype=torch.long),
            "target_end":torch.tensor(target_end,dtype=torch.long),
            "padding_len":torch.tensor(padding_len,dtype=torch.long),
            "tweet_token":" ".join(tok_tweet_tokens),
            "orig_tweet":self.tweet[item],
            "sentiment":torch.tensor(sentiment,dtype=torch.long),
            "orig_sentiment":self.sentiment[item],
            "orig_selected_text":self.selected_text[item]
        }
        

In [None]:
if __name__=="__main__":
    # Smoke test: load the training CSV, drop rows containing missing values,
    # build a dataset over the full frame and print the first encoded example.
    df=pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)
    dset=TweetDataset(
        tweet=df.text.values,
        sentiment=df.sentiment.values,
        selected_text=df.selected_text.values
    )
    print(dset[0])

In [None]:
import torch.nn as nn
import torch

class BERTBaseUncased(nn.Module):
    """BERT encoder with a linear head producing per-token start/end logits."""

    def __init__(self):
        """Load the pretrained encoder from ``config.BERT_PATH`` and create the
        2-output linear head (one output for span start, one for span end)."""
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH)
        # 768 input features to match the encoder's per-token output width.
        self.l0 = nn.Linear(768, 2)

    def forward(self, ids, mask, token_type_ids):
        """Compute start and end logits for every token position.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask tensor passed as ``attention_mask``.
            token_type_ids: segment id tensor passed to the encoder.

        Returns:
            Tuple ``(start_logits, end_logits)``; each has the shape of the
            encoder's sequence output with the last (feature) axis removed.
        """
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: position 0 is the per-token
        # sequence output whether the encoder returns a tuple or an output
        # object that supports integer indexing.
        sequence_output = outputs[0]
        logits = self.l0(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        # split() leaves a trailing axis of size 1; drop that axis (the last
        # one), not axis 1, which is the sequence axis.
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        float in [0, 1]. When neither string contains any word the union is
        empty; the two (empty) sets are identical, so 1.0 is returned instead
        of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return float(len(c)) / union_size

In [None]:
def loss_fn(o1,o2,t1,t2):
    """Sum of the binary cross-entropy-with-logits losses for span start and end.

    Args:
        o1: start logits.
        o2: end logits.
        t1: float targets for the start logits (same shape as ``o1``).
        t2: float targets for the end logits (same shape as ``o2``).

    Returns:
        Scalar tensor: BCE-with-logits(o1, t1) + BCE-with-logits(o2, t2).
    """
    # BCEWithLogitsLoss is a module: instantiate it first, then call the
    # instance with (input, target). Passing the tensors to the constructor
    # would configure the module rather than compute a loss.
    criterion=nn.BCEWithLogitsLoss()
    l1=criterion(o1,t1)
    l2=criterion(o2,t2)
    return l1+l2

In [None]:
class AverageMeter():
    """Running statistics holder.

    Attributes:
        val: the value passed to the most recent ``update`` call.
        sum: accumulated ``val * n`` over all updates since the last reset.
        count: accumulated ``n`` over all updates since the last reset.
        avg: ``sum / count`` as of the most recent update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        data_loader: iterable of batch dicts with the keys "ids",
            "token_type_ids", "mask", "target_start" and "target_end".
        model: callable as ``model(ids=..., mask=..., token_type_ids=...)``
            returning a pair of logits.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
    """
    model.train()
    loss_meter = AverageMeter()
    progress = tqdm(data_loader, total=len(data_loader))

    for batch in progress:
        # Inputs go to the device as integers, targets as floats.
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        target_start = batch["target_start"].to(device, dtype=torch.float)
        target_end = batch["target_end"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        start_logits, end_logits = model(
            ids=ids, mask=mask, token_type_ids=token_type_ids
        )

        loss = loss_fn(start_logits, end_logits, target_start, target_end)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight the running mean by the number of examples in the batch and
        # show it on the progress bar.
        loss_meter.update(loss.item(), ids.size(0))
        progress.set_postfix(loss=loss_meter.avg)
        

In [None]:
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

def run():
    """Train the span-extraction model for ``config.EPOCHS`` epochs.

    Steps: read the training CSV and drop rows with missing values, make a
    90/10 split stratified on sentiment, wrap both parts in ``TweetDataset``
    and data loaders, build the model, optimizer and linear LR schedule, then
    call ``train_fn`` once per epoch.
    """
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.sentiment.values,
        selected_text=df_train.selected_text.values
    )

    # Reshuffle the training examples every epoch; the loader default is a
    # fixed sequential order.
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.sentiment.values,
        selected_text=df_valid.selected_text.values
    )

    # TODO: the validation loader is built but no evaluation step consumes it
    # in this function yet.
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    # Fall back to the CPU when no CUDA device is available instead of
    # failing on model.to(device).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: weight decay for everything except biases and
    # LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # train_fn steps the scheduler once per batch, so the schedule length is
    # the real number of batches per epoch times the number of epochs.
    num_train_steps = len(train_data_loader) * config.EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    for epoch in range(config.EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)


In [None]:

# Entry point: start the training run when this cell/script is executed directly.
if __name__ == "__main__":
    run()