# Covid Tweet Legitmacy Classifier

In [2]:
# the model we gonna train, Covid-Twitter-Bert V2
# check text classification models here: https://huggingface.co/models?filter=text-classification
# https://huggingface.co/digitalepidemiologylab/covid-twitter-bert-v2
model_name = "digitalepidemiologylab/covid-twitter-bert-v2"
tokenizer_name = "digitalepidemiologylab/covid-twitter-bert-v2"
max_length = 96

### Imports

In [3]:
import numpy as np
import pandas as pd
import regex as re
import string

import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    Trainer,
    TrainingArguments
)

import random
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Set seed for consistency

In [4]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(5)

### Load data and create the datasets

In [5]:
raw_data = pd.read_csv('miscov19_p.csv')
df = raw_data[['text','label']]
df.dropna()
df['text'] = df['text'].astype(str)
df.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].astype(str)


Unnamed: 0,text,label
4568,Da fark is wrong with these people?!\n\nhttps:...,0
4569,Doctor Finds Cure to COVID-19 (without Vaccine...,1
4570,I can’t believe people legitimately think that...,0
4571,"...(wrong with Trump continues), \n-Will push ...",2
4572,For anyone who still believes that COVID-19 is...,0


In [6]:
target_names = ['legitimate','misinformation','irrelevant']

In [7]:
stop = stopwords.words('english')

In [8]:
def clean_text(row):
    # Lower case
    row = row.lower()
    
    # Remove URLs
    row = re.sub('http\S+|www.\S+', '', row)
    
    # Remove @mentions
    row = re.sub('@[A-Za-z0-9]+', '', row)
    
    # Remove non-standard characters
    row = row.encode("ascii", "ignore").decode()
    
    # Remove punctuation
    row = row.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stop words
    pat = r'\b(?:{})\b'.format('|'.join(stop))
    row = row.replace(pat, '')
    row = row.replace(r'\s+', ' ')
    
    # Remove extraneous whitespace
    row = row.strip()
    
    # Lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    w_tokenization = nltk.word_tokenize(row)
    final = ""
    for w in w_tokenization:
        final = final + " " + wordnet_lemmatizer.lemmatize(w)
    
    return final

df['text'] = df['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)


In [9]:
documents = df['text'].tolist()
labels = df['label'].tolist()

In [10]:
test_size = 0.2 # Percentage of dataset used for validation

In [11]:
train_texts, valid_texts, train_labels, valid_labels = train_test_split(documents, labels, test_size=test_size)

In [12]:
# Use only if training notebook from scratch
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

In [13]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [14]:
class MiscovDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

In [15]:
# convert our tokenized data into a torch Dataset
train_dataset = MiscovDataset(train_encodings, train_labels)
valid_dataset = MiscovDataset(valid_encodings, valid_labels)

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to("cuda")

Some weights of the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClass

In [17]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

In [18]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate = 1e-5,            # learning rate
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    metric_for_best_model='accuracy',
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [19]:
trainer.train()

***** Running training *****
  Num examples = 3658
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2745


Step,Training Loss,Validation Loss,Accuracy
400,1.0769,1.064717,0.409836
800,0.8874,0.767524,0.698361
1200,0.7004,0.689912,0.739891
1600,0.6304,0.644617,0.749727
2000,0.5737,0.731783,0.749727
2400,0.4974,0.797393,0.746448


***** Running Evaluation *****
  Num examples = 915
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-400
Configuration saved in ./results/checkpoint-400/config.json
Model weights saved in ./results/checkpoint-400/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 915
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-800
Configuration saved in ./results/checkpoint-800/config.json
Model weights saved in ./results/checkpoint-800/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 915
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1200
Configuration saved in ./results/checkpoint-1200/config.json
Model weights saved in ./results/checkpoint-1200/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 915
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-1600
Configuration saved in ./results/checkpoint-1600/config.json
Model weights saved in ./results/checkpoint-1600/pytorch_model.bin
**

TrainOutput(global_step=2745, training_loss=0.6963238881151098, metrics={'train_runtime': 976.3308, 'train_samples_per_second': 11.24, 'train_steps_per_second': 2.812, 'total_flos': 1578001753029924.0, 'train_loss': 0.6963238881151098, 'epoch': 3.0})

In [20]:
# evaluate the current model after training
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 915
  Batch size = 20


{'eval_loss': 0.6446170806884766,
 'eval_accuracy': 0.7497267759562841,
 'eval_runtime': 14.3569,
 'eval_samples_per_second': 63.733,
 'eval_steps_per_second': 3.204,
 'epoch': 3.0}

In [34]:
# saving the fine tuned model & tokenizer
model_path = "first_miscov19-covid-twitter-bert-v2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Configuration saved in fifth_miscov19-covid-twitter-bert-v2/config.json
Model weights saved in fifth_miscov19-covid-twitter-bert-v2/pytorch_model.bin
tokenizer config file saved in fifth_miscov19-covid-twitter-bert-v2/tokenizer_config.json
Special tokens file saved in fifth_miscov19-covid-twitter-bert-v2/special_tokens_map.json


('fifth_miscov19-covid-twitter-bert-v2/tokenizer_config.json',
 'fifth_miscov19-covid-twitter-bert-v2/special_tokens_map.json',
 'fifth_miscov19-covid-twitter-bert-v2/vocab.txt',
 'fifth_miscov19-covid-twitter-bert-v2/added_tokens.json',
 'fifth_miscov19-covid-twitter-bert-v2/tokenizer.json')

In [22]:
def get_prediction(text):
    # apply preprocessing to text
    inputs = text # clean_text(text)
    # prepare our text into tokenized sequence
    inputs = tokenizer(inputs, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

In [23]:
# Example #1
text = "DP Dough is the best restaurant in College Town"
print(get_prediction(text))

irrelevant


In [24]:
# Example #2
text2 = "Vaccines cause autism"
print(get_prediction(text2))

misinformation


In [25]:
# Example #3
text3 = "Vaccinations prevent over 90% of Covid infections! #Science"
print(get_prediction(text3))

legitimate


In [26]:
# Example #4
text4 = "Vaccines will end the pandemic"
print(get_prediction(text4))

misinformation


In [27]:
# Example #5
text5 = "scientists say kaitlyn will prevent covid"
print(get_prediction(text5))

misinformation


In [28]:
# Example #6
text6 = "Biden says vaccines prevent over 90% of Covid infections!"
print(get_prediction(text6))

misinformation


In [29]:
# Example #7
text7 = "Biden says vaccines cause autism!"
print(get_prediction(text7))

misinformation


In [30]:
# Example #8
text8 = "In Portugal, with 89% of the total population fully vaccinated, almost 90% of UCI Covid patients are unvaccinated"
print(get_prediction(text8))

misinformation


In [31]:
# Example #9
text9 = "President Trump has covid"
print(get_prediction(text9))

irrelevant


In [32]:
# Example #10
text10 = "Vaccines don't stop you from getting covid."
print(get_prediction(text10))

misinformation


In [33]:
# Example #11
text11 = "Vaccinations stop you from getting covid."
print(get_prediction(text11))

misinformation
