# 1. Activate GPU and Install Dependencies

In [35]:
# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

True

In [36]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.


#2. Preprocess data

In [40]:
# Load data
from datasets import load_dataset
tweets = load_dataset("tweet_eval", "sentiment")



  0%|          | 0/3 [00:00<?, ?it/s]

In [43]:
# Create a smaller training dataset for faster training times

# red_train_dataset = tweets["train"].shuffle(seed=21).select([i for i in list(range(20000))])
# red_test_dataset =  tweets["test"].shuffle(seed=21).select([i for i in list(range(20000))])
# print(red_train_dataset[0])
# print(red_test_dataset[0])



train_dataset = tweets["train"].shuffle(seed=21)
test_dataset =  tweets["test"].shuffle(seed=21)




In [44]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [45]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test =  test_dataset.map(preprocess_function, batched=True)

  0%|          | 0/46 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [46]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [47]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/

In [48]:
# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

metric = load_metric('precision')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average = 'weighted')


# metric = load_metric('precision')
# metric.compute(predictions=[0,1,2,3,4,4,4,4], references=[2,2,2,3,4,1,1,4], average="weighted")


In [49]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [50]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "tweet_model_sentiment_andersab"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
/content/tweet_model_sentiment_andersab is already a clone of https://huggingface.co/andersab/tweet_model_sentiment_andersab. Make sure you pull the latest changes with `repo.git_pull()`.


In [51]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 45615
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 57020
  Number of trainable parameters = 66955779
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.7709
1000,0.6905
1500,0.6533
2000,0.64
2500,0.6397
3000,0.5945
3500,0.501
4000,0.5017
4500,0.4984
5000,0.5146


Saving model checkpoint to tweet_model_sentiment_andersab/checkpoint-2851
Configuration saved in tweet_model_sentiment_andersab/checkpoint-2851/config.json
Model weights saved in tweet_model_sentiment_andersab/checkpoint-2851/pytorch_model.bin
tokenizer config file saved in tweet_model_sentiment_andersab/checkpoint-2851/tokenizer_config.json
Special tokens file saved in tweet_model_sentiment_andersab/checkpoint-2851/special_tokens_map.json
tokenizer config file saved in tweet_model_sentiment_andersab/tokenizer_config.json
Special tokens file saved in tweet_model_sentiment_andersab/special_tokens_map.json
Several commits (4) will be pushed upstream.
Saving model checkpoint to tweet_model_sentiment_andersab/checkpoint-5702
Configuration saved in tweet_model_sentiment_andersab/checkpoint-5702/config.json
Model weights saved in tweet_model_sentiment_andersab/checkpoint-5702/pytorch_model.bin
tokenizer config file saved in tweet_model_sentiment_andersab/checkpoint-5702/tokenizer_config.json

Step,Training Loss
500,0.7709
1000,0.6905
1500,0.6533
2000,0.64
2500,0.6397
3000,0.5945
3500,0.501
4000,0.5017
4500,0.4984
5000,0.5146


Saving model checkpoint to tweet_model_sentiment_andersab/checkpoint-28510
Configuration saved in tweet_model_sentiment_andersab/checkpoint-28510/config.json
Model weights saved in tweet_model_sentiment_andersab/checkpoint-28510/pytorch_model.bin
tokenizer config file saved in tweet_model_sentiment_andersab/checkpoint-28510/tokenizer_config.json
Special tokens file saved in tweet_model_sentiment_andersab/checkpoint-28510/special_tokens_map.json
Saving model checkpoint to tweet_model_sentiment_andersab/checkpoint-31361
Configuration saved in tweet_model_sentiment_andersab/checkpoint-31361/config.json
Model weights saved in tweet_model_sentiment_andersab/checkpoint-31361/pytorch_model.bin
tokenizer config file saved in tweet_model_sentiment_andersab/checkpoint-31361/tokenizer_config.json
Special tokens file saved in tweet_model_sentiment_andersab/checkpoint-31361/special_tokens_map.json
Saving model checkpoint to tweet_model_sentiment_andersab/checkpoint-34212
Configuration saved in twee

TrainOutput(global_step=57020, training_loss=0.12956915100409813, metrics={'train_runtime': 4903.6308, 'train_samples_per_second': 186.046, 'train_steps_per_second': 11.628, 'total_flos': 1.0832649507777096e+16, 'train_loss': 0.12956915100409813, 'epoch': 20.0})

In [52]:
# Compute the evaluation metrics
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 12284
  Batch size = 16


{'eval_loss': 3.321037769317627,
 'eval_precision': 0.6793053334123681,
 'eval_runtime': 15.5758,
 'eval_samples_per_second': 788.658,
 'eval_steps_per_second': 49.307,
 'epoch': 20.0}

# 4. Analyzing new data with the model

In [53]:
# Upload the model to the Hub
trainer.push_to_hub()

Saving model checkpoint to tweet_model_sentiment_andersab
Configuration saved in tweet_model_sentiment_andersab/config.json
Model weights saved in tweet_model_sentiment_andersab/pytorch_model.bin
tokenizer config file saved in tweet_model_sentiment_andersab/tokenizer_config.json
Special tokens file saved in tweet_model_sentiment_andersab/special_tokens_map.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/255M [00:00<?, ?B/s]

Upload file runs/Nov11_19-22-48_7d63fe580273/events.out.tfevents.1668199494.7d63fe580273.78.13: 100%|#########…

Upload file runs/Nov11_19-22-48_7d63fe580273/events.out.tfevents.1668194574.7d63fe580273.78.11:  15%|#5       …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/andersab/tweet_model_sentiment_andersab
   4e9daf3..fda2d8f  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/andersab/tweet_model_sentiment_andersab
   4e9daf3..fda2d8f  main -> main

To https://huggingface.co/andersab/tweet_model_sentiment_andersab
   fda2d8f..d81cfda  main -> main

   fda2d8f..d81cfda  main -> main



'https://huggingface.co/andersab/tweet_model_sentiment_andersab/commit/fda2d8f8de31d1da7266786dc5454eb7096b6d94'

In [55]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="andersab/tweet_model_sentiment_andersab")



loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--andersab--tweet_model_sentiment_andersab/snapshots/d81cfda4d14bcd9d85213edb3c7d15a056e1fb0a/config.json
Model config DistilBertConfig {
  "_name_or_path": "andersab/tweet_model_sentiment_andersab",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size":

Downloading:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--andersab--tweet_model_sentiment_andersab/snapshots/d81cfda4d14bcd9d85213edb3c7d15a056e1fb0a/config.json
Model config DistilBertConfig {
  "_name_or_path": "andersab/tweet_model_sentiment_andersab",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "vocab_size":

[{'label': 'LABEL_2', 'score': 0.9999991655349731},
 {'label': 'LABEL_0', 'score': 0.9999868869781494}]

In [73]:
r0 = sentiment_model(["The Ukraine is celebrating the liberation of #Kherson. But I am afraid that the Kherson civilian population is going to be relentless bombed out of retaliation if the Russian army regroups. 😕😑 #KhersonFree",
                 "Horrible footage(Nov 10)Ukraine paratroops fast destroy tanks filled to Russia troop in post kherson",
                 "we will see what we need",
                 ])

In [90]:
import pandas as pd
result = r0
print(result)

df = pd.DataFrame.from_dict(result)#, orient='index')

map_result = {'LABEL_0':'Negative',
              'LABEL_1':'Neutral',
              'LABEL_2':'Positive'}
df['label'] = df['label'].map(map_result) 



[{'label': 'LABEL_0', 'score': 0.9998194575309753}, {'label': 'LABEL_0', 'score': 0.9999868869781494}, {'label': 'LABEL_1', 'score': 0.9999700784683228}]


Unnamed: 0,label,score
0,Negative,0.999819
1,Negative,0.999987
2,Neutral,0.99997
