In [1]:
# Import key libraries and packages
import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from datasets import load_dataset, load_metric
import huggingface_hub
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, Trainer
import tqdm as notebook_tqdm
import jupyterlab
import ipywidgets




In [2]:
# Log in to the Hugging Face Hub.
# The stored credentials are needed later in the notebook to push the model and tokenizer.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (manager-core).
Your token has been saved to C:\Users\selas\.huggingface\token
Login successful


In [3]:
# Disable Weights & Biases logging.
# NOTE: the warning printed when TrainingArguments is created says this environment
# variable is deprecated and will be removed in v5; `report_to` is the suggested replacement.
os.environ["WANDB_DISABLED"] = "true"

In [4]:
# Load the datasets (paths are relative to the notebook's working directory)
train_df = pd.read_csv("Train.csv")
# NOTE(review): test_df is not used by any of the cells visible here — confirm it is needed
test_df = pd.read_csv("Test.csv")

In [5]:
# Preview the first rows of the training data
train_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [6]:
# Inspect column dtypes and non-null counts
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [7]:
# Count the missing values in each column
train_df.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [8]:
# Drop every row that has a missing value in any column.
# The result is re-bound instead of using inplace=True, so the frame object that
# earlier cells displayed is not mutated behind the reader's back.
train_df = train_df.dropna()

# Confirm that no missing values remain
train_df.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [9]:
# Check how many tweets fall into each label value (class balance)
train_df['label'].value_counts()

 0.0    4908
 1.0    4053
-1.0    1038
Name: label, dtype: int64

# Model 1

**Fine-tuning a DistilBERT model**

In [10]:
# Split the train data into train, eval
# 80/20 split with a fixed random_state; stratify keeps the label proportions the same in both subsets.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the notebook;
# consider renaming it together with the cells that use it.
train, eval = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['label'])

In [11]:
# Instantiate the tokenizer that belongs to the DistilBERT checkpoint.
# `num_labels` is a classification-head setting, not a tokenizer option, so it is
# not passed here; it is supplied where the model itself is loaded.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [12]:
# Save split data subsets
# They are written to CSV so the next cell can reload them with load_dataset;
# index=False keeps the pandas index out of the files.
train.to_csv("train_sub.csv", index=False)
eval.to_csv("eval_sub.csv", index=False)

In [13]:
# Load the subsetted data
# The CSV files were written by DataFrame.to_csv without an explicit encoding, i.e. as UTF-8,
# so they must be read back as UTF-8; decoding them as ISO-8859-1 would garble any
# non-ASCII characters in the tweets.
data = load_dataset('csv', data_files={'train': 'train_sub.csv', 'eval': 'eval_sub.csv'}, encoding="utf-8")

Using custom data configuration default-96c282efaa4881a1


Downloading and preparing dataset csv/default to C:/Users/selas/.cache/huggingface/datasets/csv/default-96c282efaa4881a1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/selas/.cache/huggingface/datasets/csv/default-96c282efaa4881a1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Define helper functions
## Function to transform labels
def transform_labels(label):
    """Convert a raw sentiment label into a zero-based class id.

    Reads ``label['label']`` and returns ``{'labels': class_id}`` using the
    mapping -1 (negative) -> 0, 0 (neutral) -> 1, 1 (positive) -> 2.
    Any other value falls back to class id 0.
    """
    raw_value = label['label']

    # (raw label, class id) pairs, compared in order with plain equality
    encoding = ((-1, 0), (0, 1), (1, 2))
    for raw, class_id in encoding:
        if raw_value == raw:
            return {'labels': class_id}

    # Unrecognised values keep the default class id
    return {'labels': 0}

## Function to tokenize data
def tokenize_data(example):
    """Tokenize the ``safe_text`` field with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 256 tokens.
    Returns whatever the tokenizer call returns.
    """
    tweets = example['safe_text']
    encoded = tokenizer(
        tweets,
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    return encoded

# Tokenize the tweets
# batched=True hands tokenize_data a batch of examples per call instead of one example at a time
dataset = data.map(tokenize_data, batched=True)

# Transform labels and limit the columns
# The raw CSV columns are dropped so only the tokenizer outputs and the new `labels` column remain
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7999 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [15]:
# Define training arguments
training_args = TrainingArguments(
    "distilbert_covid_tweets_sentiment_analysis_model",  # output directory for checkpoints
    num_train_epochs=3,
    # Reload the best checkpoint once training finishes; this requires the
    # evaluation and save strategies to match, hence both are "steps".
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    # Turn off external logging integrations explicitly; the WANDB_DISABLED
    # environment variable is reported as deprecated by transformers.
    report_to="none",
    )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
# Load the pretrained model
# num_labels=3 gives the classification head one output per sentiment class
# (negative / neutral / positive, as encoded by transform_labels).
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [17]:
# Define evaluation metrics
# NOTE(review): the stray `metric = load_metric("accuracy")` line in this cell's output looks like
# the tail of a warning; `load_metric` is presumably deprecated in the installed `datasets`
# release in favour of the `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the notebook-level ``metric``.

    ``eval_pred`` is unpacked into (logits, labels); the predicted class is the
    index of the largest logit along the last axis. Returns the result of
    ``metric.compute``.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

  metric = load_metric("accuracy")


In [18]:
# Instantiate the training and evaluation sets
# Both splits are shuffled with a fixed seed so their order is the same on every run
train_dataset = dataset["train"].shuffle(seed=15) 
eval_dataset = dataset["eval"].shuffle(seed=15)

In [19]:
# Collate examples into padded PyTorch tensor batches using the tokenizer's padding rules
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Instantiate the trainer
# The tokenizer and data collator are passed explicitly so the collator defined above is
# actually used, matching the Trainer that is built for the final evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; the returned TrainOutput is displayed as the cell result
trainer.train()

***** Running training *****
  Num examples = 7999
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 66955779


  0%|          | 0/3000 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.7415, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-500
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-500\config.json


{'eval_loss': 0.6378083229064941, 'eval_accuracy': 0.7485, 'eval_runtime': 643.3697, 'eval_samples_per_second': 3.109, 'eval_steps_per_second': 0.389, 'epoch': 0.5}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.6519, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1000
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1000\config.json


{'eval_loss': 0.5900400280952454, 'eval_accuracy': 0.7655, 'eval_runtime': 628.5123, 'eval_samples_per_second': 3.182, 'eval_steps_per_second': 0.398, 'epoch': 1.0}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.493, 'learning_rate': 2.5e-05, 'epoch': 1.5}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1500
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1500\config.json


{'eval_loss': 0.647053599357605, 'eval_accuracy': 0.7795, 'eval_runtime': 630.8436, 'eval_samples_per_second': 3.17, 'eval_steps_per_second': 0.396, 'epoch': 1.5}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.4987, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2000
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2000\config.json


{'eval_loss': 0.5789723992347717, 'eval_accuracy': 0.775, 'eval_runtime': 615.85, 'eval_samples_per_second': 3.248, 'eval_steps_per_second': 0.406, 'epoch': 2.0}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.2863, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2500
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2500\config.json


{'eval_loss': 0.8027164936065674, 'eval_accuracy': 0.771, 'eval_runtime': 617.388, 'eval_samples_per_second': 3.239, 'eval_steps_per_second': 0.405, 'epoch': 2.5}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


{'loss': 0.2906, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Saving model checkpoint to distilbert_covid_tweets_sentiment_analysis_model\checkpoint-3000
Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-3000\config.json


{'eval_loss': 0.799372673034668, 'eval_accuracy': 0.7795, 'eval_runtime': 616.4444, 'eval_samples_per_second': 3.244, 'eval_steps_per_second': 0.406, 'epoch': 3.0}


Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\checkpoint-3000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from distilbert_covid_tweets_sentiment_analysis_model\checkpoint-2000 (score: 0.5789723992347717).


{'train_runtime': 23195.3047, 'train_samples_per_second': 1.035, 'train_steps_per_second': 0.129, 'train_loss': 0.4936471150716146, 'epoch': 3.0}


TrainOutput(global_step=3000, training_loss=0.4936471150716146, metrics={'train_runtime': 23195.3047, 'train_samples_per_second': 1.035, 'train_steps_per_second': 0.129, 'train_loss': 0.4936471150716146, 'epoch': 3.0})

In [20]:
# Reinstantiate the trainer for evaluation
# `model` is the object the previous trainer fine-tuned; the training log reports that the
# best checkpoint (checkpoint-2000) was loaded at the end of training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the final evaluation on eval_dataset; the metrics dict is displayed as the cell result
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/250 [00:01<?, ?it/s]

{'eval_loss': 0.5789723992347717,
 'eval_accuracy': 0.775,
 'eval_runtime': 696.576,
 'eval_samples_per_second': 2.871,
 'eval_steps_per_second': 0.359}

In [21]:
# Push model and tokenizer to HF Hub
# Both are uploaded to the same repository id, so the model and its tokenizer live side by side
model.push_to_hub("Eva-Gaga/distilbert_covid_tweets_sentiment_analysis_model")
tokenizer.push_to_hub("Eva-Gaga/distilbert_covid_tweets_sentiment_analysis_model")

Configuration saved in distilbert_covid_tweets_sentiment_analysis_model\config.json
Model weights saved in distilbert_covid_tweets_sentiment_analysis_model\pytorch_model.bin
Uploading the following files to Eva-Gaga/distilbert_covid_tweets_sentiment_analysis_model: config.json,pytorch_model.bin
tokenizer config file saved in distilbert_covid_tweets_sentiment_analysis_model\tokenizer_config.json
Special tokens file saved in distilbert_covid_tweets_sentiment_analysis_model\special_tokens_map.json
Uploading the following files to Eva-Gaga/distilbert_covid_tweets_sentiment_analysis_model: special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.txt


CommitInfo(commit_url='https://huggingface.co/Eva-Gaga/distilbert_covid_tweets_sentiment_analysis_model/commit/057c796c9171023b680baecc2fab8597373dfdee', commit_message='Upload tokenizer', commit_description='', oid='057c796c9171023b680baecc2fab8597373dfdee', pr_url=None, pr_revision=None, pr_num=None)