In [23]:
import ssl 
import numpy as np
import pandas as pd 
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime 
import copy
from collections import defaultdict
from torch import tensor
import torch.nn as nn
from transformers import BertTokenizer,get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset,DataLoader
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torchtext import transforms
import pytorch_transformers
from transformers import BertForTokenClassification
from pytorch_transformers import BertModel
ssl._create_default_https_context = ssl._create_unverified_context 

**Background**: The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter. But, it’s not always clear whether a person’s words are actually announcing a disaster. 

**Objective**: Build a machine learning model that predicts which Tweets are about real disasters and which one’s aren’t. 

**Type**: Classification

**Scale**: 7163 for train; 3263 for test

**Evaluation**: F1

In [40]:
train = pd.read_csv('/Users/wangshuo/Library/Mobile Documents/com~apple~CloudDocs/data/nlp-getting-started/train.csv')

Test = pd.read_csv('/Users/wangshuo/Library/Mobile Documents/com~apple~CloudDocs/data/nlp-getting-started/test.csv')

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [5]:
# max length
token_lens = []
for txt in train.text:
    tokens = tokenizer.encode(txt,max_length=512)
    token_lens.append(len(tokens))

In [None]:
sns.set(rc={"figure.dpi": 200},font_scale=1.1)
sns.displot(token_lens)

In [41]:
class DT_dataset(Dataset):
    def __init__(self,txts,labels,tokenizer,max_len):
        self.txts = txts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.txts)
    
    def __getitem__(self,idx):
        txt = str(self.txts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
                        txt,
                        add_special_tokens=True,
                        max_length=self.max_len,
                        return_token_type_ids=False,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt')
        return {
            'txt':txt,
            'input_ids':encoding['input_ids'].flatten(),
            'attention_mask':encoding['attention_mask'].flatten(),
            'labels':torch.tensor(label,dtype=torch.long)
        }

In [42]:
train_data,test_data = train_test_split(train,test_size=0.2,stratify=train['target'],random_state=7)

train_data= DT_dataset(train_data['text'].to_numpy(),train_data['target'].to_numpy(),tokenizer,80)
test_data = DT_dataset(test_data['text'].to_numpy(),test_data['target'].to_numpy(),tokenizer,80)

train_loader = DataLoader(train_data,batch_size=16,shuffle=True)
test_loader = DataLoader(test_data,batch_size=16,shuffle=True)

In [43]:
class BertClassifier(nn.Module):
    def __init__(self,n_classes):
        super(BertClassifier,self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size,n_classes)
        
    def forward(self,input_ids,attention_mask):
        _,pooled_output = self.bert(input_ids=input_ids,attention_mask=attention_mask)
        output = self.dropout(pooled_output)
        return self.out(output)
    

In [59]:
# Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
class DT_dataset(Dataset):
    def __init__(self,df): 
        self.df = df 
        self.labels = [label for label in df['target']]
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=64,
                                truncation=True,
                                return_tensors='pt')
                     for text in df['text']]
    
    def __len__(self):
        return len(self.df)
    
    def classes(self):
        return self.labels
    
    def __getitem__(self,idx):
        texts = self.texts[idx]
        labels = np.array(self.labels[idx])
        return texts,labels

train_data,test_data = train_test_split(train,test_size=0.2,stratify=train['target'],random_state=7)

train_data,test_data = DT_dataset(train_data), DT_dataset(test_data)

train_loader = DataLoader(train_data,batch_size=2,shuffle=True)
test_loader = DataLoader(test_data,batch_size=2,shuffle=True)

In [265]:
# Model
class BertClassifier(nn.Module):
    def __init__(self,dropout=0.5):
        super(BertClassifier,self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768,2)
        self.relu = nn.ReLU()
        
    def forward(self,input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id,attention_mask=mask)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [None]:
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9)

scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[6,11], gamma=0.1)

In [44]:
EPOCHS = 10

model = BertClassifier(2).to('mps')

criterion = nn.CrossEntropyLoss().to('mps')

optimizer = optim.AdamW(model.parameters(), lr=2e-5)

total_steps = len(train_loader)*EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                            num_training_steps=total_steps)

In [300]:
#
def get_acc(output, label):
    total = output.shape[0]
    _, pred_label = output.max(1)
    num_correct = (pred_label == label).sum().item()
    return num_correct / total

def train(net, train_data, test_data, num_epochs, optimizer, criterion,scheduler):
    best_wts = copy.deepcopy(model.state_dict())
    best_acc = 0
    prev_time = datetime.now()
    for epoch in range(num_epochs):
        train_loss = 0
        train_acc = 0
        net = net.train()
        for item in train_data:
            # forward
#             im = im.to('mps')
#             label = label.to('mps')
            mask = item['attention_mask']
            input_id = item['input_ids']
            output = net(input_id,mask)
            loss = criterion(output, item['labels'])
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            train_acc += get_acc(output, item['labels'])
        scheduler.step()
        cur_time = datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s) 
        if test_data is not None:
            valid_loss = 0
            valid_acc = 0
            net = net.eval()
            for item in test_data:
#                 im = im.to('mps')
#                 label = label.to('mps')
                mask = item['attention_mask']
                input_id = item['input_ids']
                output = net(input_id,mask)
                loss = criterion(output, item['labels'])
                valid_loss += loss.item()
                valid_acc += get_acc(output,item['labels'])
                epoch_str = (
                "Epoch %d. Train Loss: %f, Train Acc: %f, Valid Loss: %f, Valid Acc: %f, "
                % (epoch, train_loss / len(train_data),
                   train_acc / len(train_data), valid_loss / len(test_data),
                   valid_acc / len(test_data)))
            if valid_acc > best_acc:
                best_acc = valid_acc
                best_wts = copy.deepcopy(model.state_dict())
        else:
            epoch_str = ("Epoch %d. Train Loss: %f, Train Acc: %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data)))
        prev_time = cur_time
        print(epoch_str + time_str)
    print(best_wts)    
    model.load_state_dict(best_wts)

train(model, train_loader,test_loader, 13, optimizer, criterion,scheduler,len(train_data)) 

In [45]:
def train_epoch(model,data_loader,loss_fn,optimizer,scheduler,n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0
    
    for d in data_loader:
        input_ids = d["input_ids"].to('mps')
        attention_mask = d["attention_mask"].to('mps')
        targets = d["labels"].to('mps')
        
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)
        
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return correct_predictions,correct_predictions / n_examples, np.mean(losses)

In [38]:
def eval_model(model,data_loader,loss_fn,n_examples):
    model = model.eval()
    
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to('mps')
            attention_mask = d["attention_mask"].to('mps')
            targets = d["labels"].to('mps')
            
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            
            _, preds = torch.max(outputs, dim=1)
            
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            
        return correct_predictions / n_examples, np.mean(losses)

In [None]:
%%time

history = defaultdict(list)
best_acc = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('_'*10)
    
    train_acc,train_loss = train_epoch(model,train_loader,criterion,optimizer,scheduler,
                                      len(train_data))
    
    print(f'Train loss {train_loss} accuracy {train_acc}')
    
    val_acc, val_loss = eval_model(model,test_loader,criterion,len(test_data))
    
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    if val_acc > best_acc:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_acc = val_acc

In [49]:
a,b,c = train_epoch(model,train_loader,criterion,optimizer,scheduler,len(train_data))

In [50]:
a 

tensor(3046, device='mps:0')

In [51]:
a.double()

TypeError: Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support float64. Please use float32 instead.

In [None]:
correct_predictions,correct_predictions / n_examples, np.mean(losses)