In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import nltk
from nltk.corpus import stopwords
import re
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd
path="/content/drive/MyDrive/Colab Notebooks/Corona_NLP_train.csv"
df = pd.read_csv(path,encoding='latin-1')


In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [28]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [29]:
# Create label map
label2idx = {label:i for i, label in enumerate(df.Sentiment.unique().tolist())}
label2idx

{'Neutral': 0,
 'Positive': 1,
 'Extremely Negative': 2,
 'Negative': 3,
 'Extremely Positive': 4}

In [30]:
idx2label = {v:k for k,v in label2idx.items()}
idx2label

{0: 'Neutral',
 1: 'Positive',
 2: 'Extremely Negative',
 3: 'Negative',
 4: 'Extremely Positive'}

In [31]:
# Create a new column with integer mapping to classes.
df['label'] = df.Sentiment.map(label2idx)

In [32]:
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,2
...,...,...,...,...,...,...,...
41152,44951,89903,"Wellington City, New Zealand",14-04-2020,Airline pilots offering to stock supermarket s...,Neutral,0
41153,44952,89904,,14-04-2020,Response to complaint not provided citing COVI...,Extremely Negative,2
41154,44953,89905,,14-04-2020,You know itÂs getting tough when @KameronWild...,Positive,1
41155,44954,89906,,14-04-2020,Is it wrong that the smell of hand sanitizer i...,Neutral,0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
 6   label          41157 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 2.2+ MB


In [34]:
df.isna().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
label               0
dtype: int64

In [35]:
# Drop the category column as we already have label map with us.
df.drop(columns=['Sentiment','Location','UserName','ScreenName','TweetAt',], inplace=True)

**CLEAN TEXT**

In [36]:
nltk.download('stopwords')
sw = stopwords.words('english')

def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if word.lower() not in sw]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
df['OriginalTweet'] = df['OriginalTweet'].apply(lambda x: clean_text(x))

In [38]:
tweets = df.OriginalTweet.values
labels = df.label.values

In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [40]:
print('Original: ', tweets[0])
print('Tokenized: ', tokenizer.tokenize(tweets[0]))
print('Token IDs: ',tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets[0])))

Original:  menyrbie phil gahan chrisitv tco ifz fan pa tco xx ghgfzcc tco nlzdxno
Tokenized:  ['men', '##yr', '##bie', 'phil', 'ga', '##han', 'chris', '##it', '##v', 'tc', '##o', 'if', '##z', 'fan', 'pa', 'tc', '##o', 'xx', 'g', '##hg', '##f', '##z', '##cc', 'tc', '##o', 'nl', '##zd', '##x', '##no']
Token IDs:  [2273, 12541, 11283, 6316, 11721, 4819, 3782, 4183, 2615, 22975, 2080, 2065, 2480, 5470, 6643, 22975, 2080, 22038, 1043, 25619, 2546, 2480, 9468, 22975, 2080, 17953, 26494, 2595, 3630]


In [41]:
max_len = 0

for sent in tweets :
  input_ids = tokenizer.encode(sent, add_special_tokens = True)

  max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len )

Max sentence length:  103


In [42]:
input_ids = []
attention_masks=[]

for tweet in tweets:
  encoded_dict = tokenizer.encode_plus(tweet, add_special_tokens = True, max_length = max_len,pad_to_max_length= True,return_tensors='pt')
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
#chuyển dãy về dạng tesor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

print("Original: ", tweets[0])
print("Token IDS: ",input_ids[0])






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  menyrbie phil gahan chrisitv tco ifz fan pa tco xx ghgfzcc tco nlzdxno
Token IDS:  tensor([  101,  2273, 12541, 11283,  6316, 11721,  4819,  3782,  4183,  2615,
        22975,  2080,  2065,  2480,  5470,  6643, 22975,  2080, 22038,  1043,
        25619,  2546,  2480,  9468, 22975,  2080, 17953, 26494,  2595,  3630,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0])


In [43]:
dataset = TensorDataset(input_ids, attention_masks,labels)

train_size = int(0.8*len(dataset))

val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size,val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

32,925 training samples
8,232 validation samples


In [44]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)

validation_dataloader = DataLoader(val_dataset, sampler = RandomSampler(val_dataset),batch_size = batch_size)


In [45]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 5, output_attentions = False, output_hidden_states = False)

model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
optimizer = AdamW(model.parameters(),lr = 2e-5 , eps = 1e-8)




In [48]:
epochs = 4
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0 , num_training_steps = total_steps)



In [49]:
#Tính accuracy
def cal_accuracy(preds, labels) :
  pred_flat = np.argmax(preds, axis = 1 ).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat)/len(labels_flat)

In [50]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [51]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []
#thoi gian training
total_t0 = time.time()

for epoch_i in range(0,epochs):
  #training
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training....')

  #tinh thoi gian train cho moi epoch
  t0 = time.time()
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    optimizer.zero_grad()
    output = model(b_input_ids, token_type_ids = None, attention_mask= b_input_mask, labels = b_labels)
    loss = output.loss
    total_train_loss += loss.item()
    #backward de tinh gradient
    loss.backward()
    #chuan hoa gradient ve khoang 0 1 de tranh "exploding gradients"
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
    #cap nhat tham so mo hinh
    optimizer.step()
    #cap nhat learning rate
    scheduler.step()

  #tinh trung binh loi cua tat ca cac batch
  avg_train_loss = total_train_loss/ len(train_dataloader)

  #tinh thoi gian train
  training_time = format_time(time.time()-t0)
  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  #Validation
  print("")
  print("Running Validation...")
  t0= time.time()
  model.eval()
  total_eval_accuracy = 0
  best_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0
  for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
        output= model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,labels=b_labels)
    loss = output.loss
    total_eval_loss += loss.item()
    # Move logits and labels to CPU if we are using GPU
    logits = output.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_eval_accuracy += cal_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  # Measure how long the validation run took.
  validation_time = format_time(time.time() - t0)
  if avg_val_accuracy > best_eval_accuracy:
    torch.save(model, 'bert_model')
    best_eval_accuracy = avg_val_accuracy
  training_stats.append(
    {
      'epoch': epoch_i + 1,
      'Training Loss': avg_train_loss,
      'Valid. Loss': avg_val_loss,
      'Valid. Accur.': avg_val_accuracy,
      'Training Time': training_time,
      'Validation Time': validation_time
    }
  )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))





Training....

  Average training loss: 0.92
  Training epcoh took: 0:08:47

Running Validation...
  Accuracy: 0.73

Training....

  Average training loss: 0.61
  Training epcoh took: 0:08:55

Running Validation...
  Accuracy: 0.78

Training....

  Average training loss: 0.48
  Training epcoh took: 0:08:55

Running Validation...
  Accuracy: 0.79

Training....

  Average training loss: 0.40
  Training epcoh took: 0:08:54

Running Validation...
  Accuracy: 0.80

Training complete!
Total training took 0:38:59 (h:mm:ss)


In [52]:
model = torch.load('bert_model')

In [53]:
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Corona_NLP_test.csv',encoding='latin-1')
df_test['OriginalTweet'] = df_test['OriginalTweet'].apply(lambda x:clean_text(x))
test_tweets = df_test['OriginalTweet'].values

In [54]:
test_input_ids = []
test_attention_masks = []
for tweet in test_tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [55]:
label2idx = {label:i for i, label in enumerate(df_test.Sentiment.unique().tolist())}
label2idx

{'Extremely Negative': 0,
 'Positive': 1,
 'Extremely Positive': 2,
 'Negative': 3,
 'Neutral': 4}

In [56]:
idx2label = {v:k for k,v in label2idx.items()}
idx2label

{0: 'Extremely Negative',
 1: 'Positive',
 2: 'Extremely Positive',
 3: 'Negative',
 4: 'Neutral'}

In [57]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [58]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()

            predictions.extend(list(pred_flat))

In [60]:
df_test['label'] = df_test.Sentiment.map(label2idx)
true_labels = df_test['label']

In [61]:
accuracy = np.sum(predictions == true_labels)/len(true_labels)

In [64]:
df_output = pd.DataFrame()
df_output['UserName'] = df_test['UserName']
df_output['target'] =predictions
df_output.to_csv('submission.csv',index=False)