In [None]:
# Mount Google Drive in the Colab VM; the data/model paths used by this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single root for every file this notebook reads or writes; change it here
# to relocate the whole project instead of editing three separate literals.
CLEF_ROOT = "/content/drive/MyDrive/clef_data"

train_path = f"{CLEF_ROOT}/data/clean_train.tsv"  # training split (tab-separated)
test_path = f"{CLEF_ROOT}/data/1A_EnTest.tsv"     # held-out split (tab-separated)
# NOTE(review): model_path is not referenced by the visible cells; presumably
# the fine-tuned model is meant to be saved here -- confirm.
model_path = f"{CLEF_ROOT}/models/1A_en"

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from collections import Counter
import random
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_curve, auc

In [None]:
# Pretrained checkpoint and the token cap later passed to the tokenizer.
model_name = "roberta-large"
max_length = 64

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The sequence-classification head is not part of the checkpoint and is
# randomly initialised on load. The notebook's set_seed(1) cell only runs
# after this one, so seed torch here (same value) to make that
# initialisation repeatable on a fresh kernel.
torch.manual_seed(1)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Use the GPU when Colab provides one, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('DEVICE: ' + device)
model = model.to(device)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'clas

DEVICE: cuda


In [None]:
# column_n is not read by the visible cells; remove_c names the metadata
# columns that are dropped before training.
column_n = ['topic', 'tweet_id', 'tweet_url', 'text', 'label']
remove_c = ['topic', 'tweet_id', 'tweet_url']

# Raw TSV headers -> the column names the rest of the notebook expects.
_header_map = {'class_label': 'label', 'tweet_text': 'text'}

df_train = pd.read_csv(train_path, sep='\t').rename(columns=_header_map)
df_test = pd.read_csv(test_path, sep='\t').rename(columns=_header_map)

In [None]:
def remove_unused_c(df,column_n=remove_c):
    """Return ``df`` without the columns listed in ``column_n``.

    ``column_n`` defaults to the module-level ``remove_c`` list. The input
    frame is not modified; ``DataFrame.drop`` produces a new frame.
    """
    return df.drop(columns=column_n)

def clean_dataset(df):
    """Apply frame-level cleaning to ``df`` and return the result.

    Currently this only strips the unused metadata columns via
    ``remove_unused_c`` with its default column list.
    """
    return remove_unused_c(df)

def clean_text(text):
    """Normalise one tweet: mask URLs and collapse whitespace runs.

    Args:
        text: The raw tweet. Any value is accepted; it is converted with
            ``str()`` first.

    Returns:
        The text with every ``http...`` URL replaced by the token ``@link``
        and every run of two or more whitespace characters replaced by a
        single space.
    """
    # str.replace() treats its first argument as literal text, so the regex
    # patterns never matched anything; re.sub() applies them as patterns.
    text = re.sub(r'http[\w:\/\.]+', '@link', str(text))
    text = re.sub(r'\s\s+', ' ', text)
    return text

def nltk_preprocess(text):
    """Preprocess one tweet's text; currently just delegates to ``clean_text``."""
    return clean_text(text)

In [None]:
# Training split: drop metadata columns, then normalise the tweet text.
df_train = clean_dataset(df_train)
df_train["text"] = df_train["text"].apply(nltk_preprocess)

# Held-out split gets the identical treatment.
df_test = clean_dataset(df_test)
df_test["text"] = df_test["text"].apply(nltk_preprocess)

# Plain Python lists for the tokenizer / dataset wrapper. The "valid_*"
# names are filled from df_test, i.e. the test file doubles as the
# validation set.
train_texts, train_labels = df_train["text"].tolist(), df_train["label"].tolist()
valid_texts, valid_labels = df_test["text"].tolist(), df_test["label"].tolist()

In [None]:
def set_seed(seed: int):
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random`` and NumPy unconditionally, torch (CPU and all
    CUDA devices) when torch is available, and TensorFlow when it is
    available.

    Args:
        seed: The integer seed to apply to each generator.
    """
    np.random.seed(seed)
    random.seed(seed)

    if is_tf_available():
        # Imported lazily so the notebook does not require TensorFlow.
        import tensorflow as tf
        tf.random.set_seed(seed)

    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


set_seed(1)

In [None]:
class TwitterDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the tensor has shape (1,).
        example["labels"] = torch.tensor([self.labels[idx]])
        return example

# Both splits are tokenised with the same truncation/padding settings.
_tokenize_kwargs = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts, **_tokenize_kwargs)
valid_encodings = tokenizer(valid_texts, **_tokenize_kwargs)

# Wrap encodings + labels so the Trainer can index them.
valid_dataset = TwitterDataset(valid_encodings, valid_labels)
train_dataset = TwitterDataset(train_encodings, train_labels)

In [None]:
def compute_metrics(pred):
    """Compute evaluation metrics for the Trainer.

    Args:
        pred: Object exposing ``predictions`` (per-class scores with the
            class axis last; two classes are assumed because the model is
            built with ``num_labels=2``) and ``label_ids`` (the gold labels).

    Returns:
        dict with ``accuracy``, ``f1`` and ``auc``. A classification report
        is also printed as a side effect.
    """
    # The dataset wraps each label in a one-element list, so label_ids comes
    # back as a column; flatten it before handing it to scikit-learn.
    labels = np.asarray(pred.label_ids).ravel()
    logits = np.asarray(pred.predictions)
    preds = logits.argmax(-1)

    acc = accuracy_score(labels, preds)
    # NOTE(review): F1 is measured for class 0 and also drives checkpoint
    # selection (metric_for_best_model="f1") -- confirm class 0 is the class
    # of interest.
    f1 = f1_score(labels, preds, pos_label=0)

    # roc_curve expects scores that grow with confidence in `pos_label`.
    # The previous code passed the hard argmax (large == class 1) together
    # with pos_label=0, which inverted the curve and reported 1 - AUC.
    # Use the class-0 margin instead; it is continuous and correctly oriented.
    class0_scores = logits[:, 0] - logits[:, 1]
    fpr, tpr, _ = roc_curve(labels, class0_scores, pos_label=0)
    auc_value = auc(fpr, tpr)

    print(classification_report(labels, preds))
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc_value,
        }

training_args = TrainingArguments(
    # Where checkpoints and logs go, and how the run is labelled.
    output_dir='trainer/results',
    logging_dir='trainer/logs',
    run_name='roberta-classification',
    logging_steps=8,
    disable_tqdm=False,
    # Optimisation schedule.
    num_train_epochs=15,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 -> total train batch of 32 (matches the training log)
    per_device_eval_batch_size=32,
    learning_rate=3e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    # Trainer re-seeds every RNG from this argument when it is constructed,
    # which silently replaced the notebook's earlier set_seed(1) with the
    # library default; pass the intended seed explicitly.
    seed=1,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" as reported by compute_metrics.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# NOTE(review): valid_dataset is built from the test file, so the
# best-checkpoint choice above is made on test data; scores reported on that
# same split are optimistic. Consider holding out part of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning. Because load_best_model_at_end=True, the checkpoint with
# the best "f1" is reloaded into the model when training finishes.
trainer.train()


***** Running training *****
  Num examples = 2317
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 1080


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
0,0.4842,0.582645,0.562718,0.616794,0.304041
1,0.3526,0.397933,0.820557,0.88984,0.319406
2,0.2598,0.42011,0.818815,0.87907,0.218692
3,0.2222,0.595322,0.810105,0.88165,0.315138
4,0.1867,0.638606,0.827526,0.888639,0.246102
5,0.0894,1.195076,0.837979,0.900747,0.297161
6,0.0376,1.086046,0.832753,0.89016,0.21796
7,0.0054,1.358549,0.825784,0.888641,0.260988
8,0.0005,1.361072,0.843206,0.901099,0.252504
9,0.0137,1.55672,0.832753,0.896774,0.295027


***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.96      0.45      0.62       445
           1       0.33      0.94      0.49       129

    accuracy                           0.56       574
   macro avg       0.65      0.70      0.55       574
weighted avg       0.82      0.56      0.59       574



Saving model checkpoint to trainer/results/checkpoint-72
Configuration saved in trainer/results/checkpoint-72/config.json
Model weights saved in trainer/results/checkpoint-72/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.85      0.93      0.89       445
           1       0.65      0.43      0.52       129

    accuracy                           0.82       574
   macro avg       0.75      0.68      0.70       574
weighted avg       0.81      0.82      0.81       574



Saving model checkpoint to trainer/results/checkpoint-144
Configuration saved in trainer/results/checkpoint-144/config.json
Model weights saved in trainer/results/checkpoint-144/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.91      0.85      0.88       445
           1       0.58      0.71      0.64       129

    accuracy                           0.82       574
   macro avg       0.74      0.78      0.76       574
weighted avg       0.84      0.82      0.83       574



Saving model checkpoint to trainer/results/checkpoint-216
Configuration saved in trainer/results/checkpoint-216/config.json
Model weights saved in trainer/results/checkpoint-216/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.85      0.91      0.88       445
           1       0.60      0.46      0.52       129

    accuracy                           0.81       574
   macro avg       0.73      0.68      0.70       574
weighted avg       0.80      0.81      0.80       574



Saving model checkpoint to trainer/results/checkpoint-288
Configuration saved in trainer/results/checkpoint-288/config.json
Model weights saved in trainer/results/checkpoint-288/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       445
           1       0.62      0.62      0.62       129

    accuracy                           0.83       574
   macro avg       0.75      0.75      0.75       574
weighted avg       0.83      0.83      0.83       574



Saving model checkpoint to trainer/results/checkpoint-360
Configuration saved in trainer/results/checkpoint-360/config.json
Model weights saved in trainer/results/checkpoint-360/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.86      0.95      0.90       445
           1       0.72      0.46      0.56       129

    accuracy                           0.84       574
   macro avg       0.79      0.70      0.73       574
weighted avg       0.83      0.84      0.82       574



Saving model checkpoint to trainer/results/checkpoint-432
Configuration saved in trainer/results/checkpoint-432/config.json
Model weights saved in trainer/results/checkpoint-432/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.91      0.87      0.89       445
           1       0.61      0.69      0.65       129

    accuracy                           0.83       574
   macro avg       0.76      0.78      0.77       574
weighted avg       0.84      0.83      0.84       574



Saving model checkpoint to trainer/results/checkpoint-504
Configuration saved in trainer/results/checkpoint-504/config.json
Model weights saved in trainer/results/checkpoint-504/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.88      0.90      0.89       445
           1       0.62      0.58      0.60       129

    accuracy                           0.83       574
   macro avg       0.75      0.74      0.74       574
weighted avg       0.82      0.83      0.82       574



Saving model checkpoint to trainer/results/checkpoint-576
Configuration saved in trainer/results/checkpoint-576/config.json
Model weights saved in trainer/results/checkpoint-576/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.88      0.92      0.90       445
           1       0.68      0.57      0.62       129

    accuracy                           0.84       574
   macro avg       0.78      0.75      0.76       574
weighted avg       0.84      0.84      0.84       574



Saving model checkpoint to trainer/results/checkpoint-648
Configuration saved in trainer/results/checkpoint-648/config.json
Model weights saved in trainer/results/checkpoint-648/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.86      0.94      0.90       445
           1       0.69      0.47      0.56       129

    accuracy                           0.83       574
   macro avg       0.77      0.70      0.73       574
weighted avg       0.82      0.83      0.82       574



Saving model checkpoint to trainer/results/checkpoint-720
Configuration saved in trainer/results/checkpoint-720/config.json
Model weights saved in trainer/results/checkpoint-720/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.88      0.90      0.89       445
           1       0.62      0.57      0.60       129

    accuracy                           0.83       574
   macro avg       0.75      0.74      0.74       574
weighted avg       0.82      0.83      0.82       574



Saving model checkpoint to trainer/results/checkpoint-792
Configuration saved in trainer/results/checkpoint-792/config.json
Model weights saved in trainer/results/checkpoint-792/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       445
           1       0.63      0.63      0.63       129

    accuracy                           0.83       574
   macro avg       0.76      0.76      0.76       574
weighted avg       0.83      0.83      0.83       574



Saving model checkpoint to trainer/results/checkpoint-864
Configuration saved in trainer/results/checkpoint-864/config.json
Model weights saved in trainer/results/checkpoint-864/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       445
           1       0.63      0.64      0.63       129

    accuracy                           0.83       574
   macro avg       0.76      0.76      0.76       574
weighted avg       0.83      0.83      0.83       574



Saving model checkpoint to trainer/results/checkpoint-936
Configuration saved in trainer/results/checkpoint-936/config.json
Model weights saved in trainer/results/checkpoint-936/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       445
           1       0.63      0.64      0.63       129

    accuracy                           0.83       574
   macro avg       0.76      0.76      0.76       574
weighted avg       0.83      0.83      0.83       574



Saving model checkpoint to trainer/results/checkpoint-1008
Configuration saved in trainer/results/checkpoint-1008/config.json
Model weights saved in trainer/results/checkpoint-1008/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 574
  Batch size = 32


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       445
           1       0.63      0.63      0.63       129

    accuracy                           0.83       574
   macro avg       0.76      0.76      0.76       574
weighted avg       0.83      0.83      0.83       574



Saving model checkpoint to trainer/results/checkpoint-1080
Configuration saved in trainer/results/checkpoint-1080/config.json
Model weights saved in trainer/results/checkpoint-1080/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from trainer/results/checkpoint-648 (score: 0.901098901098901).


TrainOutput(global_step=1080, training_loss=0.10593944074832877, metrics={'train_runtime': 1775.3495, 'train_samples_per_second': 19.576, 'train_steps_per_second': 0.608, 'total_flos': 4047144928092672.0, 'train_loss': 0.10593944074832877, 'epoch': 14.99})