In [1]:
!pip install "ray[tune]" transformers==4.28.0 datasets scipy scikit-learn
!pip install torch torchvision torchaudio
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, AutoConfig, EarlyStoppingCallback, IntervalStrategy
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [3]:
# Mount Google Drive (Colab only) and switch the working directory to the
# project folder, so that the relative paths used by later cells
# ('datasets/...', 'bert_checkpoint', 'bert.csv') resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): hard-coded, user-specific Drive path — adjust when running elsewhere.
%cd /content/drive/My Drive/fer/apt/projekt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/fer/apt/projekt


In [4]:
# Read the level-A train and test splits (paths are relative to the working directory).
test_dataset = pd.read_csv('datasets/test_level_a.csv')
train_dataset = pd.read_csv('datasets/train_level_a.csv')

# Notebook-wide helpers reused by the encoding and evaluation cells.
le = LabelEncoder()
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Class balance of both splits; the label column is named 'subtask_a' in the
# training file and 'label' in the test file.
print(train_dataset['subtask_a'].value_counts(), test_dataset['label'].value_counts())
test_dataset

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64 NOT    620
OFF    240
Name: label, dtype: int64


Unnamed: 0,id,tweet,label
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF
1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT
3,13876,#Watching #Boomer getting the news that she is...,NOT
4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF
...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,OFF
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT
857,67018,3 people just unfollowed me for talking about ...,OFF
858,50665,#WednesdayWisdom Antifa calls the right fascis...,NOT


In [5]:
# Hold out 10% of the training data for validation.
# - random_state fixes the shuffle so the split (and every metric computed on
#   the validation set) is the same on each run.
# - stratify keeps the NOT/OFF ratio identical in both parts.
# NOTE: this cell rebinds `train_dataset`; re-running it without re-loading the
# CSV first would split the already reduced training set again.
train_dataset, validation_dataset = train_test_split(
    train_dataset,
    test_size=0.1,
    random_state=42,
    stratify=train_dataset["subtask_a"],
)
train_dataset

Unnamed: 0,id,tweet,subtask_a
5506,59580,@USER But you are Dreg of society!!,NOT
6826,46263,@USER He is a lying stack of shit too.,OFF
10448,81591,@USER Things are definitely fucked up when you...,OFF
7071,85278,@USER This C is back? Liberals are shortsighte...,OFF
971,75263,@USER @USER It’s laughable that you parade Chi...,NOT
...,...,...,...
1441,70789,Mark Judge doesn't want to lie for KAVANAUGH. ...,NOT
9318,25109,@USER Which side has #Antifa and the #Handmaid...,NOT
3578,87779,@USER And the Liberals will supply them with t...,NOT
7559,42212,@USER yet another “good guy with a gun....”🤦‍...,OFF


In [6]:
# Learn the label vocabulary from the training labels.
le.fit(train_dataset["subtask_a"])

# Map every class name known to the encoder to its integer code.
le_name_mapping = {
    class_name: class_id
    for class_name, class_id in zip(le.classes_, le.transform(le.classes_))
}
le_name_mapping

{'NOT': 0, 'OFF': 1}

In [7]:
def _tokenize_with_labels(examples, label_column):
    """Tokenize a batch of tweets and attach integer labels.

    Shared implementation behind ``encode`` and ``encode2``, which differ only
    in the name of the column that holds the string labels.

    Args:
        examples: batch passed in by ``Dataset.map(..., batched=True)``; must
            provide a ``"tweet"`` column and the column named by
            ``label_column``.
        label_column: name of the column holding the string labels.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry produced by the
        notebook-level label encoder ``le``.
    """
    outputs = tokenizer(
        examples["tweet"],
        return_tensors="pt",
        padding='max_length',
        # Without truncation a tweet longer than the maximum length would stay
        # longer than the padded rows, so the batch could not be stacked into
        # one tensor (and would exceed what the model accepts).
        truncation=True,
    )
    outputs["labels"] = le.transform(examples[label_column])
    return outputs


def encode(examples):
    """Encode a batch whose labels are stored in the ``subtask_a`` column."""
    return _tokenize_with_labels(examples, "subtask_a")


def encode2(examples):
    """Encode a batch whose labels are stored in the ``label`` column."""
    return _tokenize_with_labels(examples, "label")

# Tokenize the three splits. The raw string-label column is dropped from each
# split because the encoders already added the integer "labels" column.
# The test file uses the 'label' column, hence encode2 for that split.
encoded_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_dataset).map(
        encode, batched=True, remove_columns=['subtask_a']
    ),
    "validation": Dataset.from_pandas(validation_dataset).map(
        encode, batched=True, remove_columns=['subtask_a']
    ),
    "test": Dataset.from_pandas(test_dataset).map(
        encode2, batched=True, remove_columns=['label']
    ),
})
encoded_dataset

Map:   0%|          | 0/11916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1324 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tweet', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11916
    })
    validation: Dataset({
        features: ['id', 'tweet', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1324
    })
    test: Dataset({
        features: ['id', 'tweet', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 860
    })
})

In [8]:
def model_init():
    """Create a fresh ``bert-base-cased`` sequence-classification model.

    Passed to ``Trainer(model_init=...)`` so that a new model can be built
    whenever the trainer needs one. The label names and ids come from the
    notebook-level ``le_name_mapping`` and the number of labels from ``le``.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    checkpoint = 'bert-base-cased'
    # int() turns the encoder's integer codes into plain Python ints
    # (no string round-trip needed).
    label2id = {name: int(code) for name, code in le_name_mapping.items()}
    id2label = {code: name for name, code in label2id.items()}
    config = AutoConfig.from_pretrained(
        checkpoint,
        id2label=id2label,
        label2id=label2id,
        num_labels=len(le.classes_),
    )
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

In [9]:
# NOTE(review): compute_metrics below works on its own local values and does
# not read this module-level dict; it looks unused — confirm before removing.
predictions = {}


def compute_metrics(eval_preds):
    """Compute the macro-averaged F1 score for an evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        A dict with the single key ``"f1-score"`` holding the macro-average
        F1 taken from ``classification_report``.
    """
    logits, labels = eval_preds
    y_true = labels.flatten()
    y_pred = np.argmax(logits, axis=-1).flatten()
    report = classification_report(
        y_true=y_true,
        y_pred=y_pred,
        output_dict=True,
        zero_division=0,
    )
    macro_f1 = report["macro avg"]["f1-score"]
    return {"f1-score": macro_f1}

In [10]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best 'f1-score' (the key returned by compute_metrics)
# when training ends.
training_args = TrainingArguments(
    output_dir="bert_checkpoint",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_ratio=0.1,
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    metric_for_best_model='f1-score',
    do_train=True,
    do_predict=True,
)

# model_init (instead of a ready model) lets the trainer build a fresh model
# itself; early stopping uses a patience of 1 evaluation.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
def my_hp_space_ray(trial):
    """Return the Ray Tune search space for the hyperparameter search.

    Args:
        trial: accepted for the ``hp_space`` callback signature; not used here.

    Returns:
        A dict with a uniform ``learning_rate`` range between 1e-5 and 5e-5
        and a ``seed`` chosen from 1..40.
    """
    # Local import: ray is only imported when this function is called.
    from ray import tune

    search_space = {}
    search_space["learning_rate"] = tune.uniform(1e-5, 5e-5)
    search_space["seed"] = tune.choice(range(1, 41))
    return search_space

# Run 3 Ray Tune trials over the search space defined by my_hp_space_ray and
# keep the run with the highest objective.
best_run = trainer.hyperparameter_search(
    hp_space=my_hp_space_ray,
    backend="ray",
    n_trials=3,
    direction="maximize",
)
best_run

In [15]:
# List each hyperparameter of the best run as "<name> <value>".
for hp_name, hp_value in best_run.hyperparameters.items():
    print(hp_name, hp_value)

learning_rate 2.49816047538945e-05


In [11]:
# The values found by the hyperparameter search are not applied here (that
# would mean copying every item of best_run.hyperparameters onto trainer.args);
# a fixed learning rate is set by hand instead.
trainer.args.learning_rate = 3e-5
trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,F1-score
1,0.5338,0.432443,0.77974
2,0.42,0.453319,0.769574


TrainOutput(global_step=1490, training_loss=0.4423331548703597, metrics={'train_runtime': 2307.6704, 'train_samples_per_second': 25.818, 'train_steps_per_second': 1.614, 'total_flos': 6270462671339520.0, 'train_loss': 0.4423331548703597, 'epoch': 2.0})

In [12]:
# Despite its name, `logits` holds the whole prediction output of the trainer:
# the raw scores are in .predictions and the reference ids in .label_ids.
logits = trainer.predict(encoded_dataset['test'])

# Predicted class id per tweet, mapped back to its string label.
predictions = le.inverse_transform(np.argmax(logits.predictions, axis=-1).flatten())

# Reference labels, first as ids and then as strings.
labels = logits.label_ids.flatten()
true_labels = le.inverse_transform(labels)

report = classification_report(
    y_pred=predictions,
    y_true=true_labels,
    zero_division=0,
    output_dict=True,
)
report

{'NOT': {'precision': 0.8818635607321131,
  'recall': 0.8548387096774194,
  'f1-score': 0.8681408681408681,
  'support': 620},
 'OFF': {'precision': 0.6525096525096525,
  'recall': 0.7041666666666667,
  'f1-score': 0.6773547094188377,
  'support': 240},
 'accuracy': 0.8127906976744186,
 'macro avg': {'precision': 0.7671866066208828,
  'recall': 0.779502688172043,
  'f1-score': 0.7727477887798528,
  'support': 860},
 'weighted avg': {'precision': 0.8178578189025892,
  'recall': 0.8127906976744186,
  'f1-score': 0.8148982191951852,
  'support': 860}}

In [17]:
# One row per test tweet: its id and text next to the true and predicted label.
name_dict = dict(
    id=encoded_dataset["test"]['id'],
    tweet=encoded_dataset["test"]['tweet'],
    true_label=true_labels,
    bert_label=predictions,
)

df = pd.DataFrame(data=name_dict)
df

Unnamed: 0,id,tweet,true_label,bert_label
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF,OFF
1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT,OFF
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT,NOT
3,13876,#Watching #Boomer getting the news that she is...,NOT,NOT
4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF,NOT
...,...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,OFF,OFF
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT,NOT
857,67018,3 people just unfollowed me for talking about ...,OFF,OFF
858,50665,#WednesdayWisdom Antifa calls the right fascis...,NOT,OFF


In [18]:
# Write the per-tweet true/predicted labels to bert.csv (relative to the
# current working directory), leaving out the DataFrame index column.
df.to_csv('bert.csv', index=False)