In [1]:
# Import key libraries and packages
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from datasets import load_dataset, load_metric
import huggingface_hub
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
import tqdm as notebook_tqdm
import jupyterlab
import ipywidgets

In [2]:
# Login to HF hub
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\selas\.huggingface\token
Login successful


In [3]:
# Disabe Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

In [4]:
# Load the datasets
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [6]:
train_df.dropna(inplace=True)

train_df.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [7]:
train_df['label'].value_counts()

 0.0    4908
 1.0    4053
-1.0    1038
Name: label, dtype: int64

# Model 2
**Fine-tuning the RoBERTa model**

In [8]:
# Split the train data into train, eval
train, eval = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

In [9]:
# Instantiating tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

In [10]:
# Save split data subsets
train.to_csv("train_sub1.csv", index=False)
eval.to_csv("eval_sub1.csv", index=False)

In [11]:
# Load the subsetted data

data = load_dataset('csv', data_files={'train': 'train_sub.csv','eval': 'eval_sub.csv'}, encoding = "ISO-8859-1")

Using custom data configuration default-96c282efaa4881a1
Found cached dataset csv (C:/Users/selas/.cache/huggingface/datasets/csv/default-96c282efaa4881a1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Define helper functions
## Function to transform labels
def transform_labels(label):

    label = label['label']
    num = 0
    if label == -1: #'Negative'
        num = 0
    elif label == 0: #'Neutral'
        num = 1
    elif label == 1: #'Positive'
        num = 2

    return {'labels': num}

## Function to tokenize data
def tokenize_data(example):
    return tokenizer(example['safe_text'], padding='max_length',truncation=True, max_length = 256)

# Tokenize the tweets
dataset = data.map(tokenize_data, batched=True)

# Transform	labels and limit the columns
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7999 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [13]:
# Define training arguments
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_Roberta_model", 
    num_train_epochs=3,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch"
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
# Load the pretrained model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

In [15]:
# Define evaluation metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Instantiate the training and evaluation sets
train_dataset = dataset["train"].shuffle(seed=15) 
eval_dataset = dataset["eval"].shuffle(seed=15)

#converting training data to PyTorch tensors to speed up training and adding padding:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Instantiate the trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,compute_metrics=compute_metrics)
trainer.train()

  metric = load_metric("accuracy")
***** Running training *****
  Num examples = 7999
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 124647939


  0%|          | 0/3000 [00:00<?, ?it/s]

{'loss': 0.7161, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}
{'loss': 0.6243, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


  0%|          | 0/250 [00:02<?, ?it/s]

Saving model checkpoint to covid_tweets_sentiment_analysis_Roberta_model\checkpoint-1000


{'eval_loss': 0.5383804440498352, 'eval_accuracy': 0.796, 'eval_runtime': 1231.4554, 'eval_samples_per_second': 1.624, 'eval_steps_per_second': 0.203, 'epoch': 1.0}


Configuration saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-1000\config.json
Model weights saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-1000\pytorch_model.bin


{'loss': 0.4516, 'learning_rate': 2.5e-05, 'epoch': 1.5}
{'loss': 0.4653, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to covid_tweets_sentiment_analysis_Roberta_model\checkpoint-2000
Configuration saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-2000\config.json


{'eval_loss': 0.6189253926277161, 'eval_accuracy': 0.7965, 'eval_runtime': 1234.3592, 'eval_samples_per_second': 1.62, 'eval_steps_per_second': 0.203, 'epoch': 2.0}


Model weights saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-2000\pytorch_model.bin


{'loss': 0.2969, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.3047, 'learning_rate': 0.0, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to covid_tweets_sentiment_analysis_Roberta_model\checkpoint-3000
Configuration saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-3000\config.json


{'eval_loss': 0.8657640218734741, 'eval_accuracy': 0.802, 'eval_runtime': 1256.4109, 'eval_samples_per_second': 1.592, 'eval_steps_per_second': 0.199, 'epoch': 3.0}


Model weights saved in covid_tweets_sentiment_analysis_Roberta_model\checkpoint-3000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from covid_tweets_sentiment_analysis_Roberta_model\checkpoint-1000 (score: 0.5383804440498352).


{'train_runtime': 43769.1618, 'train_samples_per_second': 0.548, 'train_steps_per_second': 0.069, 'train_loss': 0.47649632771809897, 'epoch': 3.0}


TrainOutput(global_step=3000, training_loss=0.47649632771809897, metrics={'train_runtime': 43769.1618, 'train_samples_per_second': 0.548, 'train_steps_per_second': 0.069, 'train_loss': 0.47649632771809897, 'epoch': 3.0})

In [16]:
# Reinstantiate the trainer for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the final evaluation 
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5383804440498352,
 'eval_accuracy': 0.796,
 'eval_runtime': 1218.417,
 'eval_samples_per_second': 1.641,
 'eval_steps_per_second': 0.205}

In [17]:
# Push model and tokenizer to HF Hub
model.push_to_hub("Eva-Gaga/covid-tweet-sentiment-analysis-roberta_model")
tokenizer.push_to_hub("Eva-Gaga/covid-tweet-sentiment-analysis-roberta_model")

Configuration saved in C:\Users\selas\AppData\Local\Temp\tmp80auqz57\config.json
Model weights saved in C:\Users\selas\AppData\Local\Temp\tmp80auqz57\pytorch_model.bin
Uploading the following files to Eva-Gaga/covid-tweet-sentiment-analysis-roberta_model: config.json,pytorch_model.bin
tokenizer config file saved in C:\Users\selas\AppData\Local\Temp\tmp8ejpzghx\tokenizer_config.json
Special tokens file saved in C:\Users\selas\AppData\Local\Temp\tmp8ejpzghx\special_tokens_map.json
Uploading the following files to Eva-Gaga/covid-tweet-sentiment-analysis-roberta_model: merges.txt,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json


CommitInfo(commit_url='https://huggingface.co/Eva-Gaga/covid-tweet-sentiment-analysis-roberta_model/commit/cc70ed62a4f74441a67e5dbfc13baf72b5374f0a', commit_message='Upload tokenizer', commit_description='', oid='cc70ed62a4f74441a67e5dbfc13baf72b5374f0a', pr_url=None, pr_revision=None, pr_num=None)