### **Imports**

In [0]:
!pip install transformers
!pip install tqdm


In [0]:
import zipfile
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
from transformers import AdamW,get_linear_schedule_with_warmup
from transformers import BertTokenizer,BertForSequenceClassification
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score,accuracy_score
import random

### **Preprocessing**

In [4]:
# Read the labelled tweets and key the frame by the `id` column.
df = pd.read_csv('/content/train.csv').set_index('id')

# Missing keyword / location entries are replaced by a single space.
df = df.assign(keyword=df['keyword'].fillna(" "),
               location=df['location'].fillna(" "))

# Preview the first rows together with the class counts of `target`.
df.head(),df.target.value_counts()

(   keyword location                                               text  target
 id                                                                            
 1                    Our Deeds are the Reason of this #earthquake M...       1
 4                               Forest fire near La Ronge Sask. Canada       1
 5                    All residents asked to 'shelter in place' are ...       1
 6                    13,000 people receive #wildfires evacuation or...       1
 7                    Just got sent this photo from Ruby #Alaska as ...       1,
 0    4342
 1    3271
 Name: target, dtype: int64)

In [0]:
def split_data(df,test_size=0.15,random_state=10):
  """Tag every row of `df` as belonging to the train or the validation split.

  The index values of `df` are shuffled and partitioned with
  `train_test_split`; the outcome is recorded in a new `data_type` column
  holding "train" or "val".

  Args:
    df: DataFrame whose index values identify the rows to split.
    test_size: share of the rows assigned to the validation split
      (forwarded to `train_test_split`).
    random_state: seed forwarded to `train_test_split`.

  Returns:
    The same DataFrame object that was passed in. Note that `df` is modified
    in place (the `data_type` column is added to it), so pass a copy if the
    original must stay untouched.
  """
  # Only the index needs to be partitioned; the labels are not used here.
  train_ids,val_ids = train_test_split(df.index.values,
                                       test_size=test_size,
                                       random_state=random_state,
                                       )
  # Start from a sentinel so rows missed by both splits would be visible.
  df['data_type'] = 'not_assigned'
  df.loc[train_ids,"data_type"] = "train"
  df.loc[val_ids,"data_type"] = "val"
  return df



In [0]:
# Split a copy so that `df` itself does not receive the `data_type` column.
sample = split_data(df.copy())

In [7]:
# Preview the first rows, now including the `data_type` split label.
sample.head()

Unnamed: 0_level_0,keyword,location,text,target,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,train
4,,,Forest fire near La Ronge Sask. Canada,1,train
5,,,All residents asked to 'shelter in place' are ...,1,train
6,,,"13,000 people receive #wildfires evacuation or...",1,train
7,,,Just got sent this photo from Ruby #Alaska as ...,1,train


### **Dataset**

In [8]:
# Tokenizer belonging to the `bert-base-uncased` checkpoint; lower-casing is requested explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [0]:
class Dataset:
  """Turns one split of a tweet DataFrame into a `TensorDataset` of BERT inputs.

  The rows of `df` whose `data_type` column equals `data_type` are selected.
  Their `text`, `keyword` and `location` columns are tokenized separately and
  the resulting token ids / attention masks are concatenated along the
  sequence dimension (in that order).
  """

  # Splits for which a `target` column is read and returned with the inputs.
  _LABELLED = ('train','val')

  def __init__(self,df,tokenizer,data_type ="train"):
    """Select the rows of the requested split.

    Args:
      df: DataFrame with `text`, `keyword`, `location` and `data_type`
        columns, plus `target` for the "train" and "val" splits.
      tokenizer: object exposing `batch_encode_plus`.
      data_type: value of the `data_type` column to keep.
    """
    self.data_type = data_type
    self.df = df
    self.tokenizer = tokenizer
    # Evaluate the row mask once and reuse it for every column.
    subset = self.df[self.df.data_type == self.data_type]
    self.text = subset.text.values
    self.location = subset.location.values
    self.keyword = subset.keyword.values
    if self.data_type in self._LABELLED:
      self.target = torch.tensor(subset.target.values)

  def tokenize(self,txt,max_len):
    """Encode a sequence of strings to exactly `max_len` tokens each.

    Args:
      txt: iterable of strings.
      max_len: length every encoded sequence is padded / truncated to.

    Returns:
      Whatever `tokenizer.batch_encode_plus` returns; `get_dataset` reads its
      "input_ids" and "attention_mask" entries.
    """
    return self.tokenizer.batch_encode_plus(list(txt),
                                            add_special_tokens=True,
                                            return_attention_mask=True,
                                            # Replaces the deprecated `pad_to_max_length` flag.
                                            padding='max_length',
                                            # Explicit, so over-long inputs cannot yield ragged rows.
                                            truncation=True,
                                            max_length=max_len,
                                            # Previously misspelled as `return_tensor`, which was ignored.
                                            return_tensors='pt'
                                            )

  def get_dataset(self,lengths):
    """Build the `TensorDataset` for this split.

    Args:
      lengths: indexable with three entries: the token length used for the
        text, keyword and location columns, in that order.

    Returns:
      `TensorDataset(input_ids, attention_mask, target)` for the "train" and
      "val" splits, `TensorDataset(input_ids, attention_mask)` for "test".

    Raises:
      ValueError: if `data_type` is none of "train", "val" or "test".
    """
    if self.data_type not in self._LABELLED and self.data_type != 'test':
      raise ValueError(f"Unsupported data_type: {self.data_type!r}")

    columns = (self.text,self.keyword,self.location)
    max_lens = (lengths[0],lengths[1],lengths[2])

    ids_parts,mask_parts = [],[]
    for column,max_len in zip(columns,max_lens):
      encoded = self.tokenize(column,max_len)
      # as_tensor accepts both ready-made tensors and plain nested lists.
      ids_parts.append(torch.as_tensor(encoded['input_ids']))
      mask_parts.append(torch.as_tensor(encoded['attention_mask']))

    # Concatenate along dim 1 (the token axis): text | keyword | location.
    input_ids = torch.cat(ids_parts,1)
    attention_mask = torch.cat(mask_parts,1)

    if self.data_type in self._LABELLED:
      return TensorDataset(input_ids,attention_mask,self.target)
    return TensorDataset(input_ids,attention_mask)
    

In [0]:
# The list gives the token length used for text, keyword and location respectively.
dataset_train = Dataset(sample,tokenizer).get_dataset([280,10,10])
dataset_val = Dataset(sample,tokenizer,'val').get_dataset([280,10,10])
# Sanity check: train and validation together must cover every row of `df`.
assert (len(dataset_train) + len(dataset_val)) == df.shape[0]

### **Model**

In [0]:
# Training batch size; the validation loader below uses its own fixed size of 16.
bs = 8
# Training batches are drawn in random order, validation batches in dataset order.
dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train),batch_size=bs)
dataloader_val = DataLoader(dataset_val,sampler = SequentialSampler(dataset_val),batch_size=16)

In [12]:
# Sequence classifier on top of `bert-base-uncased` with two output labels;
# attention maps and hidden states are not requested in the model output.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False
                                                      )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [0]:
epochs = 5
# Use the PyTorch AdamW: the AdamW class exported by `transformers` is deprecated
# in favour of `torch.optim.AdamW`. `weight_decay=0.0` is set explicitly because
# the PyTorch default differs from the default of the class it replaces.
optimizer = torch.optim.AdamW(model.parameters(),lr=3e-05,eps=1e-08,weight_decay=0.0)
# Linear decay of the learning rate over every optimizer step of the run, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps= len(dataloader_train)*epochs
                                            )

### **Metrics**

In [0]:
def f_score(preds,labels):
  """Weighted F1 score of the arg-max predictions.

  Args:
    preds: 2-D array of per-class scores, one row per sample.
    labels: array of true class ids.

  Returns:
    The value of `f1_score(..., average='weighted')`.
  """
  predicted_classes = np.argmax(preds,axis=1).flatten()
  true_classes = labels.flatten()
  return f1_score(y_true=true_classes,y_pred=predicted_classes,average='weighted')

In [0]:
def accuracy(preds,labels):
  """Accuracy of the arg-max predictions.

  Args:
    preds: 2-D array of per-class scores, one row per sample.
    labels: array of true class ids.

  Returns:
    The value of `accuracy_score` for the flattened labels and predictions.
  """
  predicted_classes = np.argmax(preds,axis=1).flatten()
  true_classes = labels.flatten()
  return accuracy_score(y_true=true_classes,y_pred=predicted_classes)

### **Training**

In [0]:
# Seed every random number generator used below: Python, NumPy, PyTorch (CPU and all GPUs).
# NOTE(review): this cell runs after the model has been instantiated, so any random
# initialisation performed while loading the model is presumably not governed by
# these seeds — TODO confirm and consider seeding before the model is created.
seed_val = 10
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [20]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model parameters to the chosen device.
model.to(device)
print(device)

cuda


In [0]:
def evaluate(dataloader):
  """Run the global `model` over a labelled dataloader without gradient tracking.

  Args:
    dataloader: yields batches of (input_ids, attention_mask, labels).

  Returns:
    Tuple of (mean loss per batch, logits of all samples as one NumPy array,
    labels of all samples as one NumPy array).
  """
  model.eval()
  running_loss = 0
  logit_chunks,label_chunks = [],[]

  for batch in dataloader:
    # Move every tensor of the batch to the device the model lives on.
    moved = [tensor.to(device) for tensor in batch]

    with torch.no_grad():
      outputs = model(input_ids=moved[0],
                      attention_mask=moved[1],
                      labels=moved[2])

    # With labels supplied, the output starts with the loss followed by the logits.
    running_loss += outputs[0].item()
    logit_chunks.append(outputs[1].detach().cpu().numpy())
    label_chunks.append(moved[2].detach().cpu().numpy())

  return (running_loss/len(dataloader),
          np.concatenate(logit_chunks,axis=0),
          np.concatenate(label_chunks,axis=0))

In [22]:
# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are checkpointed and the model is scored on the validation split.
for epoch in tqdm(range(1,epochs+1)):
  model.train()
  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,desc= f'Epoch {epoch}',leave=False,disable=False)

  for batch in progress_bar:
    # Clear the gradients accumulated by the previous step.
    model.zero_grad()

    batch = tuple(b.to(device) for b in batch)
    inputs = {
              "input_ids" : batch[0],
              "attention_mask" : batch[1],
              "labels" : batch[2]
              }
    outputs = model(**inputs)

    # With labels supplied, the first element of the output is the loss.
    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    # Cap the gradient norm at 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(),1.0)

    optimizer.step()
    scheduler.step()

    # Show the batch loss as is. It used to be divided by `len(batch)`, which is
    # the number of tensors in the batch tuple (3), not the number of samples.
    progress_bar.set_postfix({'Training Loss': '{:.3f}'.format(loss.item())})

  # One checkpoint per epoch, so that any epoch can be restored afterwards.
  torch.save(model.state_dict(),f'FineTuned_BERT_epoch_{epoch}.model')
  tqdm.write(f'\n Epoch {epoch}')

  loss_train_avg = loss_train_total/len(dataloader_train)
  tqdm.write(f'Training Loss {loss_train_avg:.4f}')

  val_loss , preds , val_labels =  evaluate(dataloader_val)

  val_f1 = f_score(preds,val_labels)
  val_acc = accuracy(preds,val_labels)

  tqdm.write(f'Validation Loss : {val_loss:.4f}')
  tqdm.write(f'F1 Score : {val_f1}')
  tqdm.write(f'Accuracy : {val_acc}')


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=809.0, style=ProgressStyle(description_widt…


 Epoch 1
Training Loss 0.4893
Validation Loss : 0.4374
F1 Score : 0.7857522026574095
Accuracy : 0.7854640980735552


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=809.0, style=ProgressStyle(description_widt…


 Epoch 2
Training Loss 0.3976
Validation Loss : 0.4020
F1 Score : 0.8366586287360362
Accuracy : 0.8380035026269702


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=809.0, style=ProgressStyle(description_widt…


 Epoch 3
Training Loss 0.3242
Validation Loss : 0.6050
F1 Score : 0.8241001594309025
Accuracy : 0.8248686514886164


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=809.0, style=ProgressStyle(description_widt…


 Epoch 4
Training Loss 0.2479
Validation Loss : 0.6841
F1 Score : 0.821876114597442
Accuracy : 0.8231173380035026


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=809.0, style=ProgressStyle(description_widt…


 Epoch 5
Training Loss 0.1689
Validation Loss : 0.8694
F1 Score : 0.8045937703189079
Accuracy : 0.8047285464098074



In [23]:
# Restore the epoch-2 checkpoint: in the training log above that epoch has the
# lowest validation loss and the highest F1 / accuracy. The file is read from the
# same relative path the training loop saved it to, rather than from an absolute
# `/content/` path that only exists when the working directory is `/content`.
model.load_state_dict(torch.load('FineTuned_BERT_epoch_2.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [35]:
# Read the unlabelled tweets and key the frame by the `id` column.
test = pd.read_csv('/content/test.csv').set_index('id')

# Missing keyword / location entries are replaced by a single space.
test = test.assign(keyword=test['keyword'].fillna(" "),
                   location=test['location'].fillna(" "))

# Work on a copy that carries the split label expected by `Dataset`.
sample_test = test.copy()
sample_test['data_type'] = 'test'
sample_test.head()

Unnamed: 0_level_0,keyword,location,text,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,Just happened a terrible car crash,test
2,,,"Heard about #earthquake is different cities, s...",test
3,,,"there is a forest fire at spot pond, geese are...",test
9,,,Apocalypse lighting. #Spokane #wildfires,test
11,,,Typhoon Soudelor kills 28 in China and Taiwan,test


In [0]:
# Same token lengths (text, keyword, location) as for the train / validation datasets.
dataset_test = Dataset(sample_test,tokenizer,'test').get_dataset([280,10,10])

In [0]:
# Sequential order keeps the predictions in the row order of the test dataset.
dataloader_test = DataLoader(dataset_test,sampler=SequentialSampler(dataset_test),batch_size=16)

In [0]:
def predict(dataloader):
  """Run the global `model` over an unlabelled dataloader without gradient tracking.

  Args:
    dataloader: yields batches of (input_ids, attention_mask).

  Returns:
    NumPy array holding the logits of all samples, concatenated along axis 0
    in the order the dataloader produced them.
  """
  model.eval()
  preds = []

  for batch in dataloader:
    # Move every tensor of the batch to the device the model lives on.
    batch = tuple(b.to(device) for b in batch)
    inputs = {
              "input_ids" : batch[0],
              "attention_mask" : batch[1]
              }

    with torch.no_grad():
      outputs = model(**inputs)

    # No labels are passed here, so the first element of the output is read as the logits.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    preds.append(logits)

  preds = np.concatenate(preds,axis=0)

  return preds


In [0]:
# Logits for every row of the test dataloader.
test_preds=  predict(dataloader_test)

In [0]:
# Predicted class per row: the index of the largest logit along axis 1.
test_preds_flat = np.argmax(test_preds,axis=1).flatten()


In [0]:
# Submission template, keyed by the `id` column.
sample_submission = pd.read_csv('/content/sample_submission.csv').set_index('id')

In [0]:
# The predictions are a plain array, so they are assigned by position.
# NOTE(review): assumes sample_submission.csv lists the ids in the same order as test.csv — TODO confirm.
sample_submission['target'] = test_preds_flat

In [0]:
# Write the submission file; the `id` index is written as the first column.
sample_submission.to_csv('/content/submission.csv')

**81.595%** accuracy on test set


