In [None]:
!nvidia-smi

Fri Nov 10 13:29:46 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Libraries & setup

In [None]:
# Supporting packages, installed before the Hugging Face stack (see the note below).
# NOTE(review): no versions are pinned, so a re-run may resolve different releases.
! pip install tqdm boto3 requests regex sentencepiece sacremoses
# Author's note: the packages must be installed in this order, otherwise
# "torch.hub.load" does not work (presumably a version conflict - not verified).
! pip install accelerate -U
! pip install transformers[torch] huggingface_hub datasets
# `evaluate` provides the `load` function used by the metrics function.
! pip install evaluate
# Weights & Biases client used for the hyperparameter sweep.
!pip install wandb

Collecting boto3
  Downloading boto3-1.28.83-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.32.0,>=1.31.83 (from boto3)
  Downloading botocore-1.31.83-py3-none-any.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m116.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collectin

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
# from datasets import load_metric # load_metric is deprecated
from evaluate import load

import torch

from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification

import wandb

# Download data

In [None]:
! pip install -q kaggle

In [None]:
# Upload the "kaggle.json" file
files.upload()

In [None]:
# Make directory named kaggle and copy kaggle.json file there
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

# remove the file from wd
! rm kaggle.json

# Change the permissions of the file.
! chmod 600 ~/.kaggle/kaggle.json

# Check of the file is in the folder
!ls -a ~/.kaggle/
# ! ls ../content

.  ..  kaggle.json


In [None]:
# Download Data: !kaggle competitions download -c 'name-of-competition'
! kaggle competitions download -c "nlp-getting-started"

Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 136MB/s]


In [None]:
# Create a directory named train,
! mkdir data

# unzip train data there,
! unzip nlp-getting-started.zip -d data
! rm nlp-getting-started.zip

Archive:  nlp-getting-started.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


# Import data

In [None]:
df_train_val = pd.read_csv("./data/train.csv")
df_train_val.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Create train & validation datasets

In [None]:
# Inputs (tweet text) and binary targets taken from the labelled frame.
train_val_texts, train_val_labels = df_train_val["text"], df_train_val["target"]

# Shuffled 80/20 train/validation split with a fixed seed.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    train_val_texts, train_val_labels, test_size=0.2, random_state=42, shuffle=True
)

# Wrap each split as a Dataset with "text" and "label" columns.
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# Both splits grouped under the keys used by the training cells below.
raw_datasets = DatasetDict({"train": train_dataset, "val": val_dataset})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6090
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 1523
    })
})

# Metrics Function

In [None]:
# Metric objects are loaded once and reused, instead of calling `load` again
# on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
  """Return the `evaluate` metric called `name`, loading it on first use only."""
  if name not in _METRIC_CACHE:
    _METRIC_CACHE[name] = load(name)
  return _METRIC_CACHE[name]


def compute_metrics_fn(eval_preds):
  """Compute accuracy and macro-averaged F1 for one evaluation pass.

  Args:
    eval_preds: a pair `(logits, labels)`; the predicted class of each row is
      the argmax of `logits` over the last axis.

  Returns:
    A dict merging the results of the "accuracy" and "f1" metrics.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  metrics = dict()
  metrics.update(_get_metric("accuracy").compute(predictions=predictions, references=labels))
  # average="macro" gives the macro F1.
  metrics.update(_get_metric("f1").compute(predictions=predictions, references=labels, average="macro"))

  return metrics

# Hyperparameters search

In [None]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Hyperparameters explored by the sweep.
# The number of epochs is fixed; only the training batch size varies.
# NOTE(review): the original note here said "256 and 512 is too much!!", yet
# 256 is still one of the candidate values - confirm whether it should stay.
# A log-uniform learning-rate range (1e-5 to 1e-3) was tried and is disabled.
parameters_dict = dict(
    epochs=dict(value=5),
    batch_size=dict(values=[64, 128, 256]),
)

# Random search that maximizes the "eval/f1" metric.
sweep_config = dict(
    method="random",
    metric=dict(goal="maximize", name="eval/f1"),
    parameters=parameters_dict,
)

In [None]:
sweep_id = wandb.sweep(sweep_config, project='disaster_tweets')

Create sweep with ID: 70d5gx83
Sweep URL: https://wandb.ai/daniele-didino/disaster_tweets/sweeps/70d5gx83


## Model: bert-base-uncased

In [None]:
pretrained_model_id = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(example):
  """Tokenize the "text" entry of `example` with the module-level `tokenizer`, with truncation on."""
  texts = example["text"]
  return tokenizer(texts, truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


def model_init(trial):
  """Build a fresh two-label sequence classifier from `pretrained_model_id`.

  `trial` is accepted but not used.
  """
  return AutoModelForSequenceClassification.from_pretrained(pretrained_model_id, num_labels=2)


def train(config=None):
  """Run one W&B trial: fine-tune `pretrained_model_id` with the run's hyperparameters.

  Args:
    config: optional configuration forwarded to `wandb.init`. Inside the run,
      the hyperparameters are read back from `wandb.config`.
  """
  with wandb.init(config=config):
    # Hyperparameters of the active run.
    hparams = wandb.config

    # Training arguments, collected first and then passed on as keywords.
    training_kwargs = dict(
        output_dir="./results",  # output directory
        report_to="wandb",  # log to Weights & Biases
        run_name=pretrained_model_id,  # name of the W&B run
        num_train_epochs=hparams.epochs,  # number of training epochs
        per_device_train_batch_size=hparams.batch_size,  # training batch size per device
        per_device_eval_batch_size=16,  # evaluation batch size (if too high it runs out of memory)
        save_strategy="epoch",  # save at the end of each epoch
        evaluation_strategy="epoch",  # evaluate at the end of each epoch
        logging_strategy="epoch",  # log at the end of each epoch
        load_best_model_at_end=True,
        fp16=True,  # 16-bit (mixed) precision training instead of 32-bit
    )
    training_args = TrainingArguments(**training_kwargs)

    # The model is created through `model_init` rather than passed in directly.
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        data_collator=data_collator,
        compute_metrics=compute_metrics_fn,
    )
    trainer.train()

    # NOTE(review): `trainer` is still alive at this point, and empty_cache()
    # only releases unoccupied cached memory - confirm the call is worth keeping.
    torch.cuda.empty_cache()

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [None]:
wandb.agent(sweep_id, train, count=1) # count=20

## Model: vinai/bertweet-base

In [None]:
pretrained_model_id = "vinai/bertweet-base"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(example):
  """Run the module-level `tokenizer` over `example["text"]` with truncation enabled."""
  batch_texts = example["text"]
  encoded = tokenizer(batch_texts, truncation=True)
  return encoded


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


def model_init(trial):
  """Return a new sequence-classification model with two labels.

  The checkpoint is whatever `pretrained_model_id` names when this is called;
  the `trial` argument is ignored.
  """
  checkpoint = pretrained_model_id
  return AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


def train(config=None):
  """Fine-tune `pretrained_model_id` inside a single W&B run.

  Args:
    config: optional configuration handed to `wandb.init`. The epochs and
      training batch size actually used are read from `wandb.config`.
  """
  with wandb.init(config=config):
    # Hyperparameters of the active run.
    run_config = wandb.config

    # Save, evaluate and log on the same per-epoch schedule.
    per_epoch = "epoch"

    training_args = TrainingArguments(
        output_dir="./results",  # output directory
        report_to="wandb",  # log to Weights & Biases
        run_name=pretrained_model_id,  # name of the W&B run
        num_train_epochs=run_config.epochs,
        per_device_train_batch_size=run_config.batch_size,
        per_device_eval_batch_size=16,  # kept small: if too high it runs out of memory
        save_strategy=per_epoch,
        evaluation_strategy=per_epoch,
        logging_strategy=per_epoch,
        load_best_model_at_end=True,
        fp16=True,  # 16-bit (mixed) precision training instead of 32-bit
    )

    # The model is created through `model_init` rather than passed in directly.
    trainer_kwargs = dict(
        model_init=model_init,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        data_collator=data_collator,
        compute_metrics=compute_metrics_fn,
    )
    trainer = Trainer(**trainer_kwargs)
    trainer.train()

    # NOTE(review): `trainer` is still alive at this point, and empty_cache()
    # only releases unoccupied cached memory - confirm the call is worth keeping.
    torch.cuda.empty_cache()

Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [None]:
wandb.agent(sweep_id, train, count=1) # count=20

# Refit best model

In [None]:
# Hyperparameters for the final fit.
BATCH_SIZE = 64
BATH_SIZE = BATCH_SIZE  # legacy misspelled name, kept so existing references keep working
EPOCHS = 1

pretrained_model_id = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(example):
  """Tokenize the "text" entry of `example` with `tokenizer`, with truncation on."""
  return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_id,
    num_labels=2
)

# set training arguments
training_args = TrainingArguments(
    output_dir="./results", # output directory
    # NOTE(review): `report_to` is not set in this cell, so whether this name
    # reaches W&B depends on the library default - confirm.
    run_name=pretrained_model_id, # name of the W&B run
    num_train_epochs=EPOCHS, # number of training epochs
    per_device_train_batch_size=BATCH_SIZE, # batch size per device during training
    per_device_eval_batch_size=16, # batch size for evaluation (if too high it runs out of memory)
    save_strategy="epoch", # save is done at the end of each epoch
    evaluation_strategy="epoch", # evaluation is done at the end of each epoch
    logging_strategy="epoch", # logging is done at the end of each epoch
    load_best_model_at_end=True,
    # Select the best checkpoint on the metric the sweep maximizes (F1, reported
    # by compute_metrics_fn under the "f1" key) rather than on the default loss.
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True, # Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
)

# define training loop
trainer = Trainer(
    model=model, # the instantiated Transformers model to be trained
    args=training_args, # training arguments
    train_dataset=tokenized_datasets["train"], # training dataset
    eval_dataset=tokenized_datasets["val"], # evaluation dataset
    data_collator=data_collator,
    compute_metrics=compute_metrics_fn,
)

# start training loop
trainer.train()

# Clear memory
torch.cuda.empty_cache()

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4562,0.390549,0.831254,0.825438


In [None]:
trainer.evaluate()

{'eval_loss': 0.390548974275589,
 'eval_accuracy': 0.8312541037426132,
 'eval_f1': 0.8254378981779422,
 'eval_runtime': 3.0462,
 'eval_samples_per_second': 499.963,
 'eval_steps_per_second': 31.514,
 'epoch': 1.0}

# Test set

In [None]:
df_test = pd.read_csv("./data/test.csv")
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
def classify_text(text):
  """Predict the class index for one text, or for each text in a sequence.

  Uses the module-level `tokenizer` and `model`. Inference runs in eval mode
  with gradients disabled, on whatever device the model lives on.

  Args:
    text: a single string, or an iterable of strings.

  Returns:
    An int class index when `text` is a string; otherwise a list with one
    class index per input text (an empty list for empty input).
  """
  single = isinstance(text, str)
  texts = text if single else list(text)
  if not single and not texts:
    return []

  # pt = pytorch; inputs follow the model's device instead of assuming "cuda".
  inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(model.device)

  # Switch dropout off for the prediction, then restore the previous mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():  # no autograd graph is needed for inference
      logits = model(**inputs).logits
  finally:
    model.train(was_training)

  # Softmax is monotonic, so the argmax of the logits is the argmax of the
  # probabilities; dim=-1 keeps one prediction per row.
  predicted = torch.argmax(logits, dim=-1).tolist()
  return predicted[0] if single else predicted

In [None]:
classify_text(df_test["text"][4])

1

In [None]:
df_test["text"][4]

'Typhoon Soudelor kills 28 in China and Taiwan'

In [None]:
y_pred_test = df_test["text"].map(classify_text)

In [None]:
y_pred_test.mean()

0.37511492491572174

In [None]:
# Submission table: the test ids next to the predicted class cast to int.
submission_columns = {"id": df_test.id, "target": y_pred_test.astype(int)}
df_submission = pd.DataFrame(submission_columns)

df_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
df_submission.to_csv("./data/test_submission.csv", index=False)