In [2]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import nltk
from nltk.corpus import stopwords
import re
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
train_path="/content/drive/MyDrive/Colab Notebooks/train (1).csv"
df = pd.read_csv(train_path)

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# **Clean text**





In [7]:
nltk.download('stopwords')
sw = stopwords.words('english')

def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
df['text'] = df['text'].apply(lambda x: clean_text(x))

In [9]:
tweets = df.text.values
labels = df.target.values


**BERT tokenizer**

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
print('Original: ', tweets[0])
print('Tokenized: ', tokenizer.tokenize(tweets[0]))
print('Token IDs: ',tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets[0])))

Original:  deeds reason earthquake may allah forgive us
Tokenized:  ['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us']
Token IDs:  [15616, 3114, 8372, 2089, 16455, 9641, 2149]


In [12]:
max_len = 0

for sent in tweets :
  input_ids = tokenizer.encode(sent, add_special_tokens = True)

  max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len )

Max sentence length:  45


In [13]:
input_ids = []
attention_masks=[]

for tweet in tweets:
  encoded_dict = tokenizer.encode_plus(tweet, add_special_tokens = True, max_length = max_len,pad_to_max_length= True,return_tensors='pt')
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
#chuyển dãy về dạng tesor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

print("Original: ", tweets[0])
print("Token IDS: ",input_ids[0])






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  deeds reason earthquake may allah forgive us
Token IDS:  tensor([  101, 15616,  3114,  8372,  2089, 16455,  9641,  2149,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])


In [14]:
dataset = TensorDataset(input_ids, attention_masks,labels)

train_size = int(0.8*len(dataset))

val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size,val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

6,090 training samples
1,523 validation samples


In [15]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)

validation_dataloader = DataLoader(val_dataset, sampler = RandomSampler(val_dataset),batch_size = batch_size)


In [16]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 2, output_attentions = False, output_hidden_states = False)

model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
optimizer = AdamW(model.parameters(),lr = 2e-5 , eps = 1e-8)



**Fine tuning the model**

In [18]:
epochs = 4
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = total_steps)



In [19]:
#Tính accuracy
def cal_accuracy(preds, labels) :
  pred_flat = np.argmax(preds, axis = 1 ).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat)/len(labels_flat)

In [20]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []
#thoi gian training
total_t0 = time.time()

for epoch_i in range(0,epochs):
  #training
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training....')

  #tinh thoi gian train cho moi epoch
  t0 = time.time()
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    optimizer.zero_grad()
    output = model(b_input_ids, token_type_ids = None, attention_mask= b_input_mask, labels = b_labels)
    loss = output.loss
    total_train_loss += loss.item()
    #backward de tinh gradient
    loss.backward()
    #chuan hoa gradient ve khoang 0 1 de tranh "exploding gradients"
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
    #cap nhat tham so mo hinh
    optimizer.step()
    #cap nhat learning rate
    scheduler.step()

  #tinh trung binh loi cua tat ca cac batch
  avg_train_loss = total_train_loss/ len(train_dataloader)

  #tinh thoi gian train
  training_time = format_time(time.time()-t0)
  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  #Validation
  print("")
  print("Running Validation...")
  t0= time.time()
  model.eval()
  total_eval_accuracy = 0
  best_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0
  for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
        output= model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
    loss = output.loss
    total_eval_loss += loss.item()
    # Move logits and labels to CPU if we are using GPU
    logits = output.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_eval_accuracy += cal_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  # Measure how long the validation run took.
  validation_time = format_time(time.time() - t0)
  if avg_val_accuracy > best_eval_accuracy:
    torch.save(model, 'bert_model')
    best_eval_accuracy = avg_val_accuracy
  training_stats.append(
    {
      'epoch': epoch_i + 1,
      'Training Loss': avg_train_loss,
      'Valid. Loss': avg_val_loss,
      'Valid. Accur.': avg_val_accuracy,
      'Training Time': training_time,
      'Validation Time': validation_time
    }
  )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))





Training....

  Average training loss: 0.49
  Training epcoh took: 0:00:44

Running Validation...
  Accuracy: 0.83

Training....

  Average training loss: 0.37
  Training epcoh took: 0:00:44

Running Validation...
  Accuracy: 0.82

Training....

  Average training loss: 0.29
  Training epcoh took: 0:00:45

Running Validation...
  Accuracy: 0.81

Training....

  Average training loss: 0.24
  Training epcoh took: 0:00:46

Running Validation...
  Accuracy: 0.82

Training complete!
Total training took 0:03:19 (h:mm:ss)


**Loading the best model**

In [22]:
model = torch.load('bert_model')

In [23]:
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')
df_test['text'] = df_test['text'].apply(lambda x:clean_text(x))
test_tweets = df_test['text'].values

In [24]:
test_input_ids = []
test_attention_masks = []
for tweet in test_tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [25]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [26]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()

            predictions.extend(list(pred_flat))

In [27]:
df_output = pd.DataFrame()
df_output['id'] = df_test['id']
df_output['target'] =predictions
df_output.to_csv('submission.csv',index=False)