In [1]:
# Install the libraries this notebook uses; only transformers is version-pinned.
!pip install "ray[tune]" transformers==4.28.0 datasets scipy scikit-learn
!pip install torch torchvision torchaudio
# Upgrade accelerate after the installs above.
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, AutoConfig, EarlyStoppingCallback, IntervalStrategy
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [3]:
# Mount Google Drive into the Colab filesystem and switch to the project folder.
# The relative 'datasets/...' inputs and the CSV written at the end of the
# notebook are resolved against this working directory.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/fer/apt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/fer/apt


In [4]:
# Raw data: the training file carries its label in 'subtask_a', the test file
# in 'label' (see the value_counts printed below).
train_dataset = pd.read_csv('datasets/train_level_a.csv')
test_dataset = pd.read_csv('datasets/test_level_a.csv')

# Shared objects used by the later cells: the GPT tokenizer and an (unfitted)
# encoder that will map the string labels to integer ids.
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
le = LabelEncoder()

# Class balance of both files, followed by a preview of the test frame.
print(train_dataset['subtask_a'].value_counts(), test_dataset['label'].value_counts())
test_dataset

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64 NOT    620
OFF    240
Name: label, dtype: int64


Unnamed: 0,id,tweet,label
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF
1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT
3,13876,#Watching #Boomer getting the news that she is...,NOT
4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF
...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,OFF
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT
857,67018,3 people just unfollowed me for talking about ...,OFF
858,50665,#WednesdayWisdom Antifa calls the right fascis...,NOT


In [5]:
# Hold out 10% of the labelled training data for validation.
# - random_state fixes the shuffle so the validation set (and every metric
#   computed on it) is the same on each run.
# - stratify keeps the NOT/OFF ratio equal in both parts; the classes are
#   imbalanced (roughly 2:1 in the counts printed above).
# NOTE: this rebinds `train_dataset`, so re-running the cell without reloading
# the CSV would split the already reduced frame again.
train_dataset, validation_dataset = train_test_split(
    train_dataset,
    test_size=0.1,
    random_state=42,
    stratify=train_dataset["subtask_a"],
)
train_dataset

Unnamed: 0,id,tweet,subtask_a
6553,34473,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT
5948,60909,@USER @USER Okay Blane. Now get back to your b...,NOT
8664,96189,@USER I guess people won’t be voting conservat...,NOT
12736,47662,@USER Guess he forgot to walk the liberal plank!,OFF
7646,76865,@USER @USER That wench is on something. She m...,OFF
...,...,...,...
645,17441,@USER @USER Kisses!😘😘,NOT
8900,83204,@USER @USER @USER @USER He is great! Awesome s...,NOT
6333,49020,#Tyranny #Totalitarianism #Leftism JUST WHEN Y...,NOT
11848,54561,@USER 2/3 You were on TV claiming to be a jour...,NOT


In [6]:
# Learn the label vocabulary from the training split only, then expose it as a
# plain {label string: integer id} mapping for the model configuration.
le.fit(train_dataset["subtask_a"])
le_name_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
le_name_mapping

{'NOT': 0, 'OFF': 1}

In [7]:
def _encode_with_labels(examples, label_column):
    """Tokenize a batch of tweets and attach integer class labels.

    Args:
        examples: batch mapping as passed by ``Dataset.map(..., batched=True)``;
            must contain a ``"tweet"`` column and the column named by
            ``label_column``.
        label_column: name of the column holding the string labels.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry produced by the
        notebook-level label encoder ``le``.
    """
    outputs = tokenizer(
        examples["tweet"],
        return_tensors="pt",
        padding='max_length',
        # Without truncation a text longer than the tokenizer's maximum length
        # cannot be packed into the fixed-length tensor requested above.
        truncation=True,
    )
    outputs["labels"] = le.transform(examples[label_column])
    return outputs


def encode(examples):
    """Encode a batch whose labels live in the ``subtask_a`` column (train/validation)."""
    return _encode_with_labels(examples, "subtask_a")


def encode2(examples):
    """Encode a batch whose labels live in the ``label`` column (test file)."""
    return _encode_with_labels(examples, "label")

# Padding requires a pad token; register '[PAD]' when the tokenizer has none.
# This must happen before the map() calls below, which pad every example.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# Tokenize each split. The string label column is dropped after encoding
# because the encode functions add an integer 'labels' column instead.
encoded_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_dataset).map(
        encode, batched=True, remove_columns=['subtask_a']
    ),
    "validation": Dataset.from_pandas(validation_dataset).map(
        encode, batched=True, remove_columns=['subtask_a']
    ),
    "test": Dataset.from_pandas(test_dataset).map(
        encode2, batched=True, remove_columns=['label']
    ),
})
encoded_dataset

Using pad_token, but it is not set yet.


Map:   0%|          | 0/11916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1324 [00:00<?, ? examples/s]

Map:   0%|          | 0/860 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tweet', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11916
    })
    validation: Dataset({
        features: ['id', 'tweet', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1324
    })
    test: Dataset({
        features: ['id', 'tweet', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 860
    })
})

In [8]:
def model_init():
    """Create a fresh ``openai-gpt`` sequence classifier for the encoded labels.

    Reads the notebook-level ``le``, ``le_name_mapping`` and ``tokenizer``.

    Returns:
        A newly loaded model whose config carries the label mappings, whose
        embedding matrix matches the tokenizer size, and whose pad token id is
        the tokenizer's.
    """
    # Plain Python ints for the ids (the encoder yields numpy integers).
    label2id = {name: int(code) for name, code in le_name_mapping.items()}
    id2label = {code: name for name, code in label2id.items()}

    config = AutoConfig.from_pretrained(
        'openai-gpt',
        id2label=id2label,
        label2id=label2id,
        num_labels=len(le.classes_),
    )
    model = AutoModelForSequenceClassification.from_pretrained('openai-gpt', config=config)

    # The tokenizer may have been extended with a pad token, so keep the
    # embedding table and the configured pad id in sync with it.
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    return model


In [9]:
def compute_metrics(eval_preds):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_preds: a pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        ``{"f1-score": <macro-average F1>}``.
    """
    scores, gold = eval_preds
    predicted = np.argmax(scores, axis=-1).flatten()
    report = classification_report(
        y_true=gold.flatten(),
        y_pred=predicted,
        output_dict=True,
        zero_division=0,
    )
    macro_f1 = report["macro avg"]["f1-score"]
    return {"f1-score": macro_f1}

In [18]:
# Evaluation and checkpointing both run once per epoch so that the checkpoint
# with the best 'f1-score' (the key returned by compute_metrics) can be
# reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="gpt_checkpoint",
    overwrite_output_dir=True,
    do_train=True,
    do_predict=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    warmup_ratio=0.1,
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    metric_for_best_model='f1-score',
)

# model_init (rather than a model instance) lets the Trainer build a fresh
# model for every run, which hyperparameter_search relies on.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def my_hp_space_ray(trial):
    """Return the Ray Tune search space used for hyperparameter search.

    Args:
        trial: accepted for the ``hp_space`` callback signature; not used.

    Returns:
        A dict sampling ``learning_rate`` uniformly from [1e-5, 5e-5] and
        ``seed`` from the integers 1..40.
    """
    from ray import tune

    search_space = dict(
        learning_rate=tune.uniform(1e-5, 5e-5),
        seed=tune.choice(range(1, 41)),
    )
    return search_space

# Run three Ray Tune trials over the space defined by my_hp_space_ray and keep
# the run with the highest objective.
best_run = trainer.hyperparameter_search(
    hp_space=my_hp_space_ray,
    backend="ray",
    n_trials=3,
    direction="maximize",
)
best_run

In [14]:
# One "name value" line per hyperparameter of the best run.
for name, value in best_run.hyperparameters.items():
    print(name, value)

learning_rate 3.394633936788147e-05


In [None]:
# The search result could be applied wholesale with:
#     for n, v in best_run.hyperparameters.items():
#         setattr(trainer.args, n, v)
# This cell instead trains with a fixed learning rate, which does not depend
# on the search cell having been run in the current session.
trainer.args.learning_rate = 3e-5
trainer.train()

Some weights of OpenAIGPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1-score
1,0.5205,0.615069,0.764903
2,0.4782,0.714385,0.76962


In [21]:
# Despite its name, `logits` holds the whole prediction result: the raw scores
# are in .predictions and the gold label ids in .label_ids.
logits = trainer.predict(test_dataset=encoded_dataset["test"])

# Gold labels, converted back to their string names.
labels = logits.label_ids.flatten()
true_labels = le.inverse_transform(labels)

# Predicted class = argmax over the score axis, also converted back to names.
predictions = le.inverse_transform(
    np.argmax(logits.predictions, axis=-1).flatten()
)

report = classification_report(
    y_true=true_labels,
    y_pred=predictions,
    zero_division=0,
    output_dict=True,
)
report

{'NOT': {'precision': 0.8826291079812206,
  'recall': 0.9096774193548387,
  'f1-score': 0.8959491660047657,
  'support': 620},
 'OFF': {'precision': 0.746606334841629,
  'recall': 0.6875,
  'f1-score': 0.7158351409978307,
  'support': 240},
 'accuracy': 0.8476744186046512,
 'macro avg': {'precision': 0.8146177214114247,
  'recall': 0.7985887096774194,
  'f1-score': 0.8058921535012982,
  'support': 860},
 'weighted avg': {'precision': 0.8446692643143577,
  'recall': 0.8476744186046512,
  'f1-score': 0.8456847869330629,
  'support': 860}}

In [22]:
# Per-tweet results table: gold label next to the model's prediction.
# NOTE(review): the prediction column is called 'bert_label' although the model
# is openai-gpt; kept as is because the exported CSV uses this header.
name_dict = dict(
    id=encoded_dataset["test"]['id'],
    tweet=encoded_dataset["test"]['tweet'],
    true_label=true_labels,
    bert_label=predictions,
)

df = pd.DataFrame(data=name_dict)
df

Unnamed: 0,id,tweet,true_label,bert_label
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF,OFF
1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT,NOT
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT,NOT
3,13876,#Watching #Boomer getting the news that she is...,NOT,NOT
4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF,NOT
...,...,...,...,...
855,73439,#DespicableDems lie again about rifles. Dem Di...,OFF,NOT
856,25657,#MeetTheSpeakers 🙌 @USER will present in our e...,NOT,NOT
857,67018,3 people just unfollowed me for talking about ...,OFF,OFF
858,50665,#WednesdayWisdom Antifa calls the right fascis...,NOT,NOT


In [23]:
# Write the results table to the current working directory, without the index.
df.to_csv(path_or_buf='gpt.csv', index=False)