<a href="https://colab.research.google.com/github/AoShuang92/calibration_is_all_you_need/blob/main/LM_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
import json
from torch.utils.data import Dataset
import torch.utils.data
import math
import pandas as pd
import re
import string
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet
import torchtext
from torchtext import data
from torchtext.data.utils import get_tokenizer
import os
from torchtext.vocab import Vectors, GloVe
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
max_length = bptt = 35
class Embeddings(nn.Module):
    """
    Implements embeddings of the words and adds their positional encodings. 
    """
    def __init__(self, vocab_size, d_model, max_len = max_length):
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(0.1)
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = self.create_positinal_encoding(max_len, self.d_model)
        self.dropout = nn.Dropout(0.1)
        
    def create_positinal_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model).to(device)
        for pos in range(max_len):   # for each position of the word
            for i in range(0, d_model, 2):   # for each dimension of the each position
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/d_model)))
        pe = pe.unsqueeze(0)   # include the batch size
        return pe
        
    def forward(self, encoded_words):
        embedding = self.embed(encoded_words) * math.sqrt(self.d_model)
        #print("embedding",embedding.size(),encoded_words.size())
        #print("pe",self.pe.size())
        embedding += self.pe[:, :embedding.size(1)]   # pe will automatically be expanded with the same batch size as encoded_words
        embedding = self.dropout(embedding)
        return embedding

class MultiHeadAttention(nn.Module):
    
    def __init__(self, heads, d_model):
        
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = nn.Dropout(0.1)
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.concat = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask):
        """
        query, key, value of shape: (batch_size, max_len, 512)
        mask of shape: (batch_size, 1, 1, max_words)
        """
        # (batch_size, max_len, 512)
        query = self.query(query)
        key = self.key(key)        
        value = self.value(value)   
        
        # (batch_size, max_len, 512) --> (batch_size, max_len, h, d_k) --> (batch_size, h, max_len, d_k)
        query = query.view(query.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)   
        key = key.view(key.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        value = value.view(value.shape[0], -1, self.heads, self.d_k).permute(0, 2, 1, 3)  
        
        # (batch_size, h, max_len, d_k) matmul (batch_size, h, d_k, max_len) --> (batch_size, h, max_len, max_len)
        scores = torch.matmul(query, key.permute(0,1,3,2)) / math.sqrt(query.size(-1))
        #scores = torch.matmul(query, key.permute(2,1,0,0)) / math.sqrt(query.size(-1))
        #print("scores",scores.size())([35, 8, 2, 2])
        scores = scores.masked_fill(mask == 0, -1e9)    # (batch_size, h, max_len, max_len)
        weights = F.softmax(scores, dim = -1)           # (batch_size, h, max_len, max_len)
        weights = self.dropout(weights)
        # (batch_size, h, max_len, max_len) matmul (batch_size, h, max_len, d_k) --> (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, value)
        # (batch_size, h, max_len, d_k) --> (batch_size, max_len, h, d_k) --> (batch_size, max_len, h * d_k)
        context = context.permute(0,2,1,3).contiguous().view(context.shape[0], -1, self.heads * self.d_k)
        # (batch_size, max_len, h * d_k)
        interacted = self.concat(context)
        return interacted

class FeedForward(nn.Module):

    def __init__(self, d_model, middle_dim = 2048):
        super(FeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, middle_dim)
        self.fc2 = nn.Linear(middle_dim, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out

class EncoderLayer(nn.Module):

    def __init__(self, d_model, heads):
        super(EncoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, embeddings, mask):
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        interacted = self.layernorm(interacted + embeddings)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded

class DecoderLayer(nn.Module):
    
    def __init__(self, d_model, heads):
        super(DecoderLayer, self).__init__()
        self.layernorm = nn.LayerNorm(d_model)
        self.self_multihead = MultiHeadAttention(heads, d_model)
        self.src_multihead = MultiHeadAttention(heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, embeddings, encoded, src_mask, target_mask):
        query = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, target_mask))
        query = self.layernorm(query + embeddings)
        interacted = self.dropout(self.src_multihead(query, encoded, encoded, src_mask))
        interacted = self.layernorm(interacted + query)
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        decoded = self.layernorm(feed_forward_out + interacted)
        return decoded

class Transformer(nn.Module):
    
    def __init__(self, d_model, heads, num_layers, word_map):
        super(Transformer, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = word_map
        self.embed = Embeddings(self.vocab_size, d_model)#max_len
        self.embed_dec = Embeddings(self.vocab_size, d_model,max_len=max_length)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(d_model, heads) for _ in range(num_layers)])
        self.logit = nn.Linear(d_model, self.vocab_size)
        
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
        
    def encode(self, src_words, src_mask):
        src_embeddings = self.embed(src_words)
        for layer in self.encoder:
            src_embeddings = layer(src_embeddings, src_mask)
        return src_embeddings
    
    def decode(self, target_words, target_mask, src_embeddings, src_mask):
        tgt_embeddings = self.embed(target_words)
        for layer in self.decoder:
            tgt_embeddings = layer(tgt_embeddings, src_embeddings, src_mask, target_mask)
        return tgt_embeddings
        
    def forward(self, src_words, src_mask, target_words, target_mask):
        encoded = self.encode(src_words, src_mask)
        decoded = self.decode(target_words, target_mask, encoded, src_mask)
        out = F.log_softmax(self.logit(decoded), dim = 2)
        return out

class AdamWarmup:
    
    def __init__(self, model_size, warmup_steps, optimizer):
        
        self.model_size = model_size
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.current_step = 0
        self.lr = 0
        
    def get_lr(self):
        return self.model_size ** (-0.5) * min(self.current_step ** (-0.5), self.current_step * self.warmup_steps ** (-1.5))
        
    def step(self):
        # Increment the number of steps each time we call the step function
        self.current_step += 1
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        # update the learning rate
        self.lr = lr
        self.optimizer.step()

class LossWithLS(nn.Module):

    def __init__(self, size, smooth):
        super(LossWithLS, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        self.confidence = 1.0 - smooth
        self.smooth = smooth
        self.size = size
        
    def forward(self, prediction, target, mask):
        """
        prediction of shape: (batch_size, max_words, vocab_size)
        target and mask of shape: (batch_size, max_words)
        """
        prediction = prediction.view(-1, prediction.size(-1))   # (batch_size * max_words, vocab_size)
        target = target.contiguous().view(-1)   # (batch_size * max_words)
        mask = mask.float()
        mask = mask.reshape(-1)       # (batch_size * max_words)
        labels = prediction.data.clone()
        labels.fill_(self.smooth / (self.size - 1))
        labels.scatter_(1, target.data.unsqueeze(1), self.confidence)
        loss = self.criterion(prediction, labels)    # (batch_size * max_words, vocab_size)
        loss = (loss.sum(1) * mask).sum() / mask.sum()
        return loss



In [None]:
import numpy as np
import os

train_dir = "/content/drive/MyDrive/calibration_project/sentiment_analysis/Tweets.csv"
# train_dir = "/content/combined_qa_train_ID.csv"
# test_dir = "/content/combined_qa_test_50_ID.csv"
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def remove_unnecessary(text):
    #remove_URL
    url = re.compile(r'https?://\S+|www\.\S+')
    text = url.sub('', text)

    #remove_html
    html = re.compile(r'<.*?>')
    text = html.sub('', text)

    #remove @
    text = re.sub('@[^\s]+','',text)

    #remove_emoji
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    #Removes integers 
    text = ''.join([i for i in text if not i.isdigit()])         
    
    #remove_punct
    table = str.maketrans('', '', string.punctuation)
    text = text.translate(table)

    #Replaces contractions from a string to their equivalents 
    contraction_patterns = [(r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), 
                            (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                            (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'),
                            (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), 
                            (r'dont', 'do not'), (r'wont', 'will not')]
    
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        text, _= re.subn(pattern, repl, text)

    #lemmatize_sentence
    sentence_words = text.split(' ')
    new_sentence_words = list()
    
    for sentence_word in sentence_words:
        sentence_word = sentence_word.replace('#', '')
        new_sentence_word = WordNetLemmatizer().lemmatize(sentence_word.lower(), wordnet.VERB)
        new_sentence_words.append(new_sentence_word)
        
    new_sentence = ' '.join(new_sentence_words)
    new_sentence = new_sentence.strip()

    return new_sentence.lower()


def prepare_csv(train_dir, seed=27, val_ratio=0.2):
    df_train = pd.read_csv(train_dir,error_bad_lines=False)
    idx = np.arange(df_train.shape[0])    
    np.random.shuffle(idx)
    val_size = int(len(idx) * val_ratio)
    if not os.path.exists('cache'): # cache is tem memory file 
        os.makedirs('cache')
    
    df_train.iloc[idx[val_size:], :][['tweet_id', 'text']].to_csv(
        'cache/dataset_train.csv', index=False)
    
    df_train.iloc[idx[:val_size], :][['tweet_id',  'text']].to_csv(
        'cache/dataset_val.csv', index=False)  
    
    
#prepare_csv(train_dir) 

def get_iterator(dataset, batch_size, train=True,
                 shuffle=True, repeat=False, device=None): 
    dataset_iter = data.Iterator(
        dataset, batch_size=batch_size, device=device,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=False)  
    return dataset_iter

def batchify(TEXT,data, bsz):
    #print(type(data))
    #print("data",data.examples[0].text)
    data = TEXT.numericalize([data])
    print("data0",data.size())
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    #nbatch = len(data) // bsz
    print("nbatch",nbatch)
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    #print(nbatch)
    data = data.narrow(0, 0, nbatch * bsz)
    print("data1",data.size())
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    print("data2",data.size())
    #print(data.size())
    return data.to(device)

def get_dataset(fix_length=max_length, lower=False, vectors=None,train_dir = train_dir, batch_size=batch_size, device=None): 
    df_train = pd.read_csv(train_dir,error_bad_lines=False)
    df_train['text'] = df_train['text'].apply(lambda x: remove_unnecessary(x))
    
    prepare_csv(train_dir)
    if vectors is not None:
        lower=True

    TEXT = torchtext.data.Field(tokenize=get_tokenizer("spacy"),init_token='<sos>',eos_token='<eos>',lower=True,
                      fix_length=fix_length)
    
    LABEL = data.Field(use_vocab=False, sequential=False, dtype=torch.float16)
    ID = data.Field(use_vocab=False, sequential=False, dtype=torch.float16) 
    


    train_temps = data.TabularDataset(
        path='/content/cache/dataset_train.csv', format='csv', skip_header=True,
        fields=[("tweet_id",ID),('text', TEXT), ('airline_sentiment', LABEL)]) 
    test_temps = data.TabularDataset(
        path='/content/cache/dataset_val.csv', format='csv', skip_header=True,
        fields=[("tweet_id",ID),('text', TEXT) , ('airline_sentiment', LABEL)]) 

    TEXT.build_vocab(train_temps,test_temps, max_size=20000,
        min_freq=10, vectors=GloVe(name='6B', dim=300))
    ID.build_vocab(train_temps, test_temps)
    word_embeddings = TEXT.vocab.vectors
    all_words = TEXT.vocab.itos
    vocab_size = len(TEXT.vocab)
    ntokens = len(TEXT.vocab.stoi)
    print("ntokens:",ntokens,vocab_size,len(all_words))

    TEXT.build_vocab(train_temps,max_size=20000,
        min_freq=10, vectors=GloVe(name='6B', dim=300))
    ID.build_vocab(train_temps)
    word_embeddings_train = TEXT.vocab.vectors
    words_train = TEXT.vocab.itos
    vocab_size_train = len(TEXT.vocab)
    ntokens_train = len(TEXT.vocab.stoi)
    print("ntokens_train:",ntokens_train,vocab_size_train,len(words_train))

    TEXT.build_vocab(test_temps,vectors=GloVe(name='6B', dim=300))
    ID.build_vocab(test_temps)
    word_embeddings_test = TEXT.vocab.vectors
    words_test = TEXT.vocab.itos
    vocab_size_test = len(TEXT.vocab)
    ntokens_test = len(TEXT.vocab.stoi)
    print("ntokens_train:",ntokens_test,vocab_size_test,len(words_test))

    train_loader = batchify(TEXT, words_train, batch_size)
    test_loader = batchify(TEXT,words_test, batch_size)

    print('Train samples:%d'%(len(train_temps)), 'Valid samples:%d'%(len(test_temps)),'Train minibatch nb:%d'%(len(train_loader)),
            'Valid minibatch nb:%d'%(len(test_loader)))
    return vocab_size, word_embeddings, ntokens, train_loader, test_loader

In [None]:
vocab_size, word_embeddings, ntokens, train_loader, test_loader= get_dataset(fix_length=max_length,train_dir = train_dir, batch_size=batch_size)
print(ntokens)

ntokens: 1985 1985 1985
ntokens_train: 1722 1722 1722
ntokens_train: 6360 6360 6360
data0 torch.Size([1722, 1])
nbatch 107
data1 torch.Size([1712, 1])
data2 torch.Size([107, 16])
data0 torch.Size([6360, 1])
nbatch 397
data1 torch.Size([6352, 1])
data2 torch.Size([397, 16])
Train samples:11712 Valid samples:2928 Train minibatch nb:107 Valid minibatch nb:397
1985


In [None]:
def create_masks(question, reply_input):
    
    def subsequent_mask(size):
        mask = torch.triu(torch.ones(size, size)).transpose(0, 1).type(dtype=torch.uint8)
        return mask.unsqueeze(0)
    
    question_mask = question!=0
    question_mask = question_mask.to(device)
    question_mask = question_mask.unsqueeze(1).unsqueeze(1)         # (batch_size, 1, 1, max_words)
     
    reply_input_mask = reply_input!=0
    reply_input_mask = reply_input_mask.unsqueeze(1)  # (batch_size, 1, max_words)
    reply_input_mask = reply_input_mask & subsequent_mask(reply_input.size(-1)).type_as(reply_input_mask.data) 
    reply_input_mask = reply_input_mask.unsqueeze(1) # (batch_size, 1, max_words, max_words)
    
    return question_mask, reply_input_mask

def get_batch(source, i):
    seq_len = min(max_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len]
    return data, target

In [None]:
lr = 5.0
d_model = 512
heads = 8
num_layers = 3


model = Transformer(d_model = d_model, heads = heads, num_layers = num_layers, word_map = ntokens)
model = model.to(device)
adam_optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
transformer_optimizer = AdamWarmup(model_size = d_model, warmup_steps = 4000, optimizer = adam_optimizer)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, train_loader.size(0) - 1, bptt)):
        
        data, targets = get_batch(train_loader, i)
        data = data.permute(1,0)
        targets = targets.permute(1,0)
        src_mask,tar_mask = create_masks(data,targets)
        output = model(data, src_mask, targets, tar_mask)
        targets = targets.reshape(-1)
        loss = criterion(output.view(-1,ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 50
        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss / log_interval
            #elapsed = time.time() - start_time
            #print("epoch:",epoch, "loss:",cur_loss)
            # print('| epoch {:3d} | {:5d}/{:5d} batches | '
            #       'lr {:02.2f} | ms/batch {:5.2f} | '
            #       'loss {:5.2f} | ppl {:8.2f}'.format(
            #         epoch, i, len(train_loader) // max_length, scheduler.get_lr()[0],
            #         elapsed * 1000 / log_interval,
            #         cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for batch, i in enumerate(range(0, data_source.size(0) - 1, bptt)):
            data, targets = get_batch(data_source, i)
            data = data.permute(1,0)
            targets = targets.permute(1,0)
            src_mask,tar_mask = create_masks(data,targets)
            output = eval_model(data, src_mask,targets,tar_mask)
            output_flat = output.view(-1, ntokens)
            targets = targets.reshape(-1)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

best_val_loss = float("inf")
epochs = 50 # The number of epochs
best_model = None
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
best_epoch = 0
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, test_loader)
    #print('-' * 89)
    # print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
    #       'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
    #                                  val_loss, math.exp(val_loss)))
    #print("end of epoch:",epoch, "valid_loss:", val_loss, "valid_ppl:",math.exp(val_loss))
    #print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
        best_epoch = epoch
        
        torch.save(model.state_dict(), '/content/drive/MyDrive/calibration_project/sentiment_analysis/best_language_model.pth')
    print("epoch:",epoch,"loss:",val_loss, "best_epoch",best_epoch,"best_loss:",best_val_loss)

    scheduler.step()

epoch: 1 loss: 6.114708121496302 best_epoch 1 best_loss: 6.114708121496302
epoch: 2 loss: 10.22878407097348 best_epoch 1 best_loss: 6.114708121496302
epoch: 3 loss: 5.695517930669032 best_epoch 3 best_loss: 5.695517930669032
epoch: 4 loss: 4.741299306466683 best_epoch 4 best_loss: 4.741299306466683
epoch: 5 loss: 4.7504697610403746 best_epoch 4 best_loss: 4.741299306466683
epoch: 6 loss: 5.161473902733878 best_epoch 4 best_loss: 4.741299306466683
epoch: 7 loss: 5.2604770029470815 best_epoch 4 best_loss: 4.741299306466683
epoch: 8 loss: 6.218741768795722 best_epoch 4 best_loss: 4.741299306466683
epoch: 9 loss: 7.828553333233938 best_epoch 4 best_loss: 4.741299306466683
epoch: 10 loss: 9.137231724862835 best_epoch 4 best_loss: 4.741299306466683
epoch: 11 loss: 4.977654687624245 best_epoch 4 best_loss: 4.741299306466683
epoch: 12 loss: 4.670363020957578 best_epoch 12 best_loss: 4.670363020957578
epoch: 13 loss: 6.5080383922004215 best_epoch 12 best_loss: 4.670363020957578
epoch: 14 loss: 