<a href="https://colab.research.google.com/github/AoShuang92/calibration_is_all_you_need/blob/main/benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np 
import os
import string
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
#NLP
from torchtext import data
from torchtext.vocab import Vectors, GloVe
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
#torch
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import nltk
nltk.download('wordnet')
import math
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext import data

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Mount Google Drive inside the Colab runtime; the dataset path and the
# checkpoint paths used later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# CSV with the tweets; the columns used below are `tweet_id`,
# `airline_sentiment` and `text`.
train_dir = "/content/drive/MyDrive/calibration_project/sentiment_analysis/Tweets.csv"
# NOTE(review): `df` is not referenced by the visible code (get_dataset reads
# the CSV again) -- confirm whether this load is still needed.
df = pd.read_csv(train_dir)
# Mini-batch size used as the default for the data iterators.
batch_size = 8
# Fixed number of tokens per example; also sizes the positional encoding and
# the classifier input in the model code below.
max_length = 35
# Patterns are compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<.*?>')
_MENTION_PATTERN = re.compile(r'@[^\s]+')
# NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
# CJK and many other non-emoji code points -- confirm that is intended.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
# Order matters: the specific forms must be tried before the generic "n't".
# Matching is case-insensitive because the text is only lower-cased later.
_CONTRACTION_PATTERNS = [
    (re.compile(regex, flags=re.IGNORECASE), repl)
    for regex, repl in [
        (r"won't", 'will not'), (r"can't", 'cannot'), (r"i'm", 'i am'),
        (r"ain't", 'is not'), (r"(\w+)'ll", r'\g<1> will'),
        (r"(\w+)n't", r'\g<1> not'), (r"(\w+)'ve", r'\g<1> have'),
        (r"(\w+)'s", r'\g<1> is'), (r"(\w+)'re", r'\g<1> are'),
        (r"(\w+)'d", r'\g<1> would'),
        # Padded with spaces so "a&b" becomes "a and b", not "aandb".
        (r'&', ' and '),
        # Word boundaries keep these from firing inside longer words.
        (r'\bdammit\b', 'damn it'), (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
    ]
]
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_unnecessary(text):
    """Normalise one raw tweet into a lower-cased, lemmatised string.

    Steps, in order: drop URLs, HTML tags, @mentions and emoji; expand
    contractions; drop digits and ASCII punctuation; lower-case and
    verb-lemmatise every word; collapse whitespace.

    Contractions are expanded *before* punctuation is stripped. Doing it the
    other way round removes the apostrophes (and '&') first, so none of the
    apostrophe-based rules can ever match.

    Args:
        text: the raw tweet text.

    Returns:
        The cleaned text with words separated by single spaces.
    """
    text = _URL_PATTERN.sub('', text)
    text = _HTML_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _EMOJI_PATTERN.sub('', text)

    # Must run while apostrophes and '&' are still present.
    for pattern, replacement in _CONTRACTION_PATTERNS:
        text = pattern.sub(replacement, text)

    # Remove digits, then punctuation (this also strips '#' from hashtags).
    text = ''.join(ch for ch in text if not ch.isdigit())
    text = text.translate(_PUNCTUATION_TABLE)

    # One lemmatizer per call instead of one per word; split() without an
    # argument also collapses the runs of spaces left by the removals above.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word.lower(), wordnet.VERB)
             for word in text.split()]
    return ' '.join(words)


def prepare_csv(df_train, seed=27, val_ratio=0.2):
    """Split `df_train` into train/validation CSV files under ``cache/``.

    The rows are permuted with a private generator seeded by `seed`, so the
    same seed always yields the same split regardless of the global NumPy
    random state. The first ``int(n * val_ratio)`` permuted rows go to
    ``cache/dataset_val.csv`` and the rest to ``cache/dataset_train.csv``.
    Only the `tweet_id`, `airline_sentiment` and `text` columns are written.

    Args:
        df_train: DataFrame holding at least the three columns above.
        seed: seed for the shuffling generator.
        val_ratio: fraction of rows placed in the validation file.
    """
    columns = ['tweet_id', 'airline_sentiment', 'text']

    # Local generator: honours `seed` without touching np.random's state.
    rng = np.random.RandomState(seed)
    idx = rng.permutation(df_train.shape[0])
    val_size = int(len(idx) * val_ratio)

    # 'cache' is a scratch directory for the intermediate CSV files.
    os.makedirs('cache', exist_ok=True)

    df_train.iloc[idx[val_size:]][columns].to_csv(
        'cache/dataset_train.csv', index=False)
    df_train.iloc[idx[:val_size]][columns].to_csv(
        'cache/dataset_val.csv', index=False)
    
def get_iterator(dataset, batch_size, train=True,
                 shuffle=True, repeat=False, device=None):
    """Build a torchtext ``data.Iterator`` over `dataset` with sorting disabled.

    All arguments are forwarded unchanged to ``data.Iterator``; ``sort`` is
    always passed as False.
    """
    iterator_options = {
        'batch_size': batch_size,
        'device': device,
        'train': train,
        'shuffle': shuffle,
        'repeat': repeat,
        'sort': False,
    }
    return data.Iterator(dataset, **iterator_options)

def get_dataset(fix_length=max_length, lower=False, vectors=None,train_dir = train_dir, batch_size=batch_size, device=None): 
    """Load the tweets CSV, preprocess it and build train/validation iterators.

    Args:
        fix_length: every text example is padded/truncated to this many tokens.
        lower: whether the text Field lower-cases tokens. Forced to True when
            `vectors` is given. (``remove_unnecessary`` lower-cases every word
            on its own, so the vocabulary is lower-case either way.)
        vectors: optional pretrained vectors forwarded to ``build_vocab``;
            with None, the returned `word_embeddings` is None.
        train_dir: path of the source CSV.
        batch_size: mini-batch size for both iterators.
        device: device argument forwarded to the iterators.

    Returns:
        ``(vocab_size, ntokens, word_embeddings, train_loader, test_loader)``.
    """
    print('Preparing dataset...')
    try:
        train = pd.read_csv(train_dir, on_bad_lines='skip')
    except TypeError:
        # Older pandas releases only know the previous spelling of this option.
        train = pd.read_csv(train_dir, error_bad_lines=False)

    train['airline_sentiment'] = train['airline_sentiment'].map({
        'positive': 0,
        'negative': 1,
        'neutral': 2})
    train['text'] = train['text'].apply(remove_unnecessary)

    if vectors is not None:
        lower = True

    # Write cache/dataset_train.csv and cache/dataset_val.csv.
    prepare_csv(train)

    TEXT = data.Field(tokenize=get_tokenizer("spacy"), init_token='<sos>',
                      eos_token='<eos>', lower=lower, batch_first=True,
                      fix_length=fix_length)
    # Integer dtype: labels are class indices and tweet ids are large
    # integers, which a float16 field cannot represent.
    LABEL = data.Field(use_vocab=False, sequential=False, dtype=torch.long)
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.long)

    train_temp, val_temp = data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[('tweet_id', ID), ('airline_sentiment', LABEL), ('text', TEXT)])

    # Only the text field needs a vocabulary; LABEL and ID use use_vocab=False.
    # NOTE(review): the vocabulary is built from train *and* validation text,
    # as in the original code -- confirm that is intended.
    TEXT.build_vocab(
        train_temp, val_temp,
        max_size=20000,
        min_freq=10,
        vectors=vectors)

    word_embeddings = TEXT.vocab.vectors
    ntokens = len(TEXT.vocab.stoi)
    vocab_size = len(TEXT.vocab)

    train_loader = get_iterator(train_temp, batch_size=batch_size,
                                train=True, shuffle=True,
                                repeat=False, device=device)
    # Validation data is iterated in evaluation mode and without shuffling.
    test_loader = get_iterator(val_temp, batch_size=batch_size,
                               train=False, shuffle=False,
                               repeat=False, device=device)

    print('Train samples:%d'%(len(train_temp)), 'Valid samples:%d'%(len(val_temp)),'Train minibatch nb:%d'%(len(train_loader)),
          'Valid minibatch nb:%d'%(len(test_loader)), 'ntokens:%d'%(ntokens))

    return vocab_size, ntokens, word_embeddings, train_loader, test_loader

# Build the vocabulary statistics and the train/validation iterators used by
# the training cells below.
vocab_size, ntokens, word_embeddings, train_loader, test_loader = get_dataset (fix_length=max_length, lower=False, vectors=None,train_dir = train_dir, batch_size=batch_size, device=None)

Preparing dataset...
Train samples:11712 Valid samples:2928 Train minibatch nb:1464 Valid minibatch nb:366 ntokens:1603


In [None]:
def create_masks(sentence, pad_idx=1):
    """Return a boolean attention mask that is False at padding positions.

    A torchtext ``Field`` vocabulary places ``<unk>`` at index 0 and ``<pad>``
    at index 1, so padding is index 1 by default. Masking index 0 would hide
    the unknown-word tokens and leave the padding visible to attention.

    Args:
        sentence: integer token ids of shape (batch_size, max_words).
        pad_idx: vocabulary index of the padding token.

    Returns:
        Bool tensor of shape (batch_size, 1, 1, max_words), on the same
        device as `sentence`.
    """
    sentence_mask = sentence != pad_idx
    # (batch_size, max_words) -> (batch_size, 1, 1, max_words)
    return sentence_mask.unsqueeze(1).unsqueeze(1)

def get_batch(source, i):
    """Slice an input window and its one-step-shifted target out of `source`.

    The window starts at `i` and is at most `max_length` items long; it is
    shortened near the end so the shifted target still fits in `source`.
    """
    remaining = len(source) - 1 - i
    window = min(max_length, remaining)
    inputs = source[i:i + window]
    targets = source[i + 1:i + 1 + window]
    return inputs, targets

class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding width.
        max_len: longest sequence supported; falls back to the module-level
            `max_length` when omitted.

    NOTE: the positional encoding now follows the standard sinusoidal
    definition, so weights trained with the previous encoding should be
    retrained rather than reused.
    """
    def __init__(self, vocab_size, d_model, max_len=None):
        super(Embeddings, self).__init__()
        if max_len is None:
            # Resolved at call time rather than at class-definition time.
            max_len = max_length
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)
        self.dropout = nn.Dropout(0.1)
        # Registered as a buffer so it follows the module across .to(device);
        # non-persistent so existing state_dict files keep loading.
        self.register_buffer(
            'pe', self.create_positinal_encoding(max_len, d_model),
            persistent=False)

    def create_positinal_encoding(self, max_len, d_model):
        """Return the (1, max_len, d_model) sinusoidal encoding table.

        ``pe[pos, 2k] = sin(pos / 10000^(2k/d_model))`` and
        ``pe[pos, 2k+1] = cos(pos / 10000^(2k/d_model))``: each sin/cos pair
        shares one frequency.
        """
        positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float32)
        inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq          # (max_len, ceil(d_model / 2))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cos column fewer than sin columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        return pe.unsqueeze(0)   # include the batch dimension

    def forward(self, encoded_words):
        """Embed token ids of shape (batch, seq_len) and add position information."""
        embedding = self.embed(encoded_words) * math.sqrt(self.d_model)
        # pe is broadcast over the batch dimension.
        embedding = embedding + self.pe[:, :embedding.size(1)]
        return self.dropout(embedding)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed over `heads` parallel heads."""

    def __init__(self, heads, d_model):
        super().__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        # (batch, len, d_model) -> (batch, len, h, d_k) -> (batch, h, len, d_k)
        batch = projected.shape[0]
        return projected.view(batch, -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask of shape: (batch_size, 1, 1, max_words); positions where the
        mask equals 0 receive no attention weight.
        Returns a tensor of shape (batch_size, max_len, d_model).
        """
        q = self._split_heads(self.query(query))
        k = self._split_heads(self.key(key))
        v = self._split_heads(self.value(value))

        # (batch, h, len_q, d_k) x (batch, h, d_k, len_k) -> (batch, h, len_q, len_k)
        scale = math.sqrt(q.size(-1))
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.dropout(F.softmax(scores, dim=-1))

        # (batch, h, len_q, len_k) x (batch, h, len_k, d_k) -> (batch, h, len_q, d_k)
        context = torch.matmul(weights, v)
        # Merge the heads back: (batch, len_q, h * d_k)
        batch = context.shape[0]
        merged = context.transpose(1, 2).contiguous().view(
            batch, -1, self.heads * self.d_k)
        return self.concat(merged)

class FeedForward(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Maps (..., d_model) -> (..., middle_dim) -> (..., d_model).
    """

    def __init__(self, d_model, middle_dim = 2048):
        super().__init__()
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

class EncoderLayer(nn.Module):
    """Self-attention followed by a feed-forward block, each with a residual.

    The same ``LayerNorm`` instance is applied after both sub-layers.
    """

    def __init__(self, d_model, heads):
        super().__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def _add_and_norm(self, sublayer_out, residual):
        # Dropout on the sub-layer output, residual connection, then LayerNorm.
        return self.layernorm(self.dropout(sublayer_out) + residual)

    def forward(self, embeddings, mask):
        attended = self.self_multihead(embeddings, embeddings, embeddings, mask)
        interacted = self._add_and_norm(attended, embeddings)
        transformed = self.feed_forward(interacted)
        return self._add_and_norm(transformed, interacted)

class DecoderLayer(nn.Module):
    """Self-attention, attention over the encoder output, then feed-forward.

    Each sub-layer is followed by dropout, a residual connection and the one
    shared ``LayerNorm`` instance.
    """

    def __init__(self, d_model, heads):
        super().__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def _add_and_norm(self, sublayer_out, residual):
        # Dropout on the sub-layer output, residual connection, then LayerNorm.
        return self.layernorm(self.dropout(sublayer_out) + residual)

    def forward(self, embeddings, encoded, src_mask, target_mask):
        self_attended = self.self_multihead(
            embeddings, embeddings, embeddings, target_mask)
        query = self._add_and_norm(self_attended, embeddings)
        cross_attended = self.src_multihead(query, encoded, encoded, src_mask)
        interacted = self._add_and_norm(cross_attended, query)
        transformed = self.feed_forward(interacted)
        return self._add_and_norm(transformed, interacted)


class Transformer(nn.Module):
    """Encoder-only transformer that classifies a fixed-length token sequence.

    The encoder output for all `max_length` positions is flattened and fed to
    a single linear layer producing 3 logits.

    Args:
        d_model: embedding / hidden width.
        heads: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        ntokens: vocabulary size for the embedding table.
    """
    def __init__(self, d_model, heads, num_layers, ntokens):
        super(Transformer, self).__init__()

        self.d_model = d_model
        self.vocab_size = ntokens
        self.embed = Embeddings(self.vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        # The classifier sees every position, so inputs must be exactly
        # `max_length` tokens long.
        self.logit = nn.Linear(max_length*self.d_model, 3)

    def encode(self, src_words, src_mask):
        """Embed `src_words` and run them through every encoder layer."""
        src_embeddings = self.embed(src_words)
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
        return src_embeddings

    def forward(self, src_words, src_mask):
        """Return logits of shape (batch, 3) for token ids of shape (batch, max_length)."""
        encoded = self.encode(src_words, src_mask)
        # Take the batch dimension from the tensor itself; relying on the
        # global `batch_size` fails on any batch of a different size (e.g. a
        # final partial batch).
        encoded = encoded.reshape(encoded.size(0), -1)
        out = self.logit(encoded)
        return out

class AdamWarmup:
    """Optimizer wrapper with a warm-up then inverse-square-root learning rate.

    ``lr = model_size^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)``

    Args:
        model_size: model width used to scale the rate.
        warmup_steps: number of steps over which the rate grows linearly.
        optimizer: wrapped optimizer; its param-group ``lr`` is overwritten
            on every ``step()``.
    """

    def __init__(self, model_size, warmup_steps, optimizer):
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0

    def get_lr(self):
        """Return the learning rate for `current_step` (0.0 before the first step)."""
        if self.current_step == 0:
            # 0 ** -0.5 raises ZeroDivisionError; the schedule starts at zero.
            return 0.0
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))

    def step(self):
        """Advance the schedule by one step, set the new rate and step the optimizer."""
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        # Remember the rate that was just applied.
        self.lr = lr
        self.optimizer.step()

class LossWithLS(nn.Module):
    """KL-divergence loss against label-smoothed targets, averaged over a mask.

    Args:
        size: number of classes (last dimension of the prediction).
        smooth: probability mass spread evenly over the ``size - 1`` wrong
            classes; the true class gets ``1 - smooth``.
    """

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        # reduction='none' is the current spelling of the deprecated
        # size_average=False, reduce=False pair: keep the element-wise loss.
        self.criterion = nn.KLDivLoss(reduction='none')
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size

    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size), log-probabilities
        target and mask of shape: (batch_size, max_words)
        Returns the scalar mean loss over positions where mask is non-zero
        (0 when every position is masked).
        """
        prediction = prediction.reshape(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.reshape(-1)                                # (batch_size * max_words)
        mask = mask.float().reshape(-1)                            # (batch_size * max_words)
        # Smoothed target distribution; carries no gradient.
        labels = torch.full_like(prediction, self.smooth / (self.size - 1)).detach()
        labels.scatter_(1, target.unsqueeze(1), self.confidence)
        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        # Clamp so a fully masked batch yields 0 instead of NaN.
        loss = (loss.sum(1) * mask).sum() / mask.sum().clamp(min=1e-8)
        return loss

def seed_everything(seed=27):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: value used for every generator and for ``PYTHONHASHSEED``.
    """
    # Imported here because the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Only affects hash randomisation of Python processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [None]:
def train(train_loader, transformer, criterion, epoch,optim):    
    """Run one optimisation pass over `train_loader`.

    Relies on the module-level `device` and on `create_masks`.

    Args:
        train_loader: iterable of batches exposing `.text` and `.airline_sentiment`.
        transformer: model called as ``transformer(tokens, mask)``.
        criterion: loss called as ``criterion(logits, labels)``.
        epoch: current epoch index; unused, kept for interface compatibility.
        optim: optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        Mean training loss over the batches (0.0 for an empty loader).
    """
    transformer.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        tokens = batch.text.to(device)
        labels = batch.airline_sentiment.to(device)

        logits = transformer(tokens, create_masks(tokens))
        loss = criterion(logits, labels.long())

        optim.zero_grad()
        loss.backward()
        optim.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0
        

def valid (test_loader,transformer, loss_fn=None): 
    """Evaluate `transformer` on `test_loader`.

    Relies on the module-level `device` and on `create_masks`.

    Args:
        test_loader: iterable of batches exposing `.text` and `.airline_sentiment`.
        transformer: model called as ``transformer(tokens, mask)``.
        loss_fn: loss called as ``loss_fn(logits, labels)``; defaults to the
            module-level `criterion`.

    Returns:
        ``(mean loss per batch, accuracy in percent over all samples)``;
        ``(0.0, 0.0)`` for an empty loader.
    """
    if loss_fn is None:
        loss_fn = criterion
    transformer.eval()
    total_loss = 0.0
    num_batches = 0
    num_correct = 0
    num_samples = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in test_loader:
            tokens = batch.text.to(device)
            labels = batch.airline_sentiment.to(device).long()

            logits = transformer(tokens, create_masks(tokens))
            total_loss += loss_fn(logits, labels).item()
            num_batches += 1

            # Count per sample so a smaller final batch is weighted correctly.
            predictions = logits.argmax(dim=1)
            num_correct += (predictions == labels.view(-1)).sum().item()
            num_samples += labels.numel()

    if num_batches == 0:
        return 0.0, 0.0
    return total_loss / num_batches, 100.0 * num_correct / num_samples




In [None]:
class _ECELoss(nn.Module):
    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super(_ECELoss, self).__init__()
        bin_boundaries = torch.linspace(0, 1, n_bins + 1)
        self.bin_lowers = bin_boundaries[:-1]
        self.bin_uppers = bin_boundaries[1:]

    def forward(self, logits, labels):
        softmaxes = F.softmax(logits, dim=1)
        confidences, predictions = torch.max(softmaxes, 1)
        accuracies = predictions.eq(labels)
        ece = torch.zeros(1, device=logits.device)
        for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
            # Calculated |confidence - accuracy| in each bin
            in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
            prop_in_bin = in_bin.float().mean()
            if prop_in_bin.item() > 0:
                accuracy_in_bin = accuracies[in_bin].float().mean()
                avg_confidence_in_bin = confidences[in_bin].mean()
                ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece


In [None]:
# Training driver: build the model, train for a fixed number of epochs and
# checkpoint whenever validation accuracy improves.
seed_everything()
criterion = nn.CrossEntropyLoss()
d_model = 512
heads = 8
num_layers = 3
lr = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, ntokens = ntokens)
model = model.to(device)
# Plain Adam with a fixed learning rate (the AdamWarmup schedule is not used here).
optim = torch.optim.Adam(model.parameters(), lr=lr)

best_acc = 0
best_epoch = 0
for epoch in range(10):
    train(train_loader, model, criterion, epoch,optim)
    loss, acc = valid (test_loader,model)
    if acc > best_acc:
        best_acc = acc
        best_epoch = epoch
        # Store the optimizer that is actually in use; the previous name
        # `transformer_optimizer` is not defined anywhere in this notebook.
        state = {'epoch': epoch, 'transformer': model, 'transformer_optimizer': optim}
        # Full objects (pickled) plus a weights-only state_dict.
        torch.save(state, '/content/drive/MyDrive/calibration_project/sentiment_analysis/best_base_model_object.pth.tar')
        torch.save(model.state_dict(), '/content/drive/MyDrive/calibration_project/sentiment_analysis/best_base_model_object.pth')
    print('cur epoch:%d, cur acc:%.5f, best epoch:%d, best acc:%.5f'%(epoch,acc, best_epoch, best_acc))

cur epoch:0, cur acc:72.95082, best epoch:0, best acc:72.95082
cur epoch:1, cur acc:74.93169, best epoch:1, best acc:74.93169
cur epoch:2, cur acc:77.25410, best epoch:2, best acc:77.25410
cur epoch:3, cur acc:75.64891, best epoch:2, best acc:77.25410
cur epoch:4, cur acc:71.27732, best epoch:2, best acc:77.25410
cur epoch:5, cur acc:74.11202, best epoch:2, best acc:77.25410
cur epoch:6, cur acc:72.71175, best epoch:2, best acc:77.25410
cur epoch:7, cur acc:71.44809, best epoch:2, best acc:77.25410
cur epoch:8, cur acc:75.34153, best epoch:2, best acc:77.25410
cur epoch:9, cur acc:75.47814, best epoch:2, best acc:77.25410
cur epoch:10, cur acc:76.33197, best epoch:2, best acc:77.25410
cur epoch:11, cur acc:75.34153, best epoch:2, best acc:77.25410
cur epoch:12, cur acc:74.35109, best epoch:2, best acc:77.25410
cur epoch:13, cur acc:73.29235, best epoch:2, best acc:77.25410
cur epoch:14, cur acc:73.42896, best epoch:2, best acc:77.25410


KeyboardInterrupt: ignored