In [None]:
# !pip install transformers
# !pip install keras_preprocessing
!pip install transformers datasets evaluate accelerate
# !pip install emoji

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Populism


# Log in to the Hugging Face Hub. The training cells further down are
# configured with push_to_hub=True and call trainer.push_to_hub().
from huggingface_hub import notebook_login
notebook_login()
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertForSequenceClassification
import evaluate
import numpy as np

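# NOTE: do not store access tokens in this notebook. Enter the token when
# notebook_login() prompts for it, and revoke any token that was ever saved here.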

In [None]:
##################################################################################################
# If you would like to fine-tune a pre-trained language model
##################################################################################################

# Prepare the dataset.
# df_train: DataFrame of training data (or the entire dataset, if it has not yet been split into train/validation)
# df_val: DataFrame of validation data, if the data is already split into train/validation
# need_split: boolean indicating whether df_train still needs to be split into train/validation
# random_seed: seed for the train/validation split, for reproducibility

def prep_data(df_train, df_val=None, need_split=True, random_seed=42,
              label_col="peoplecentric", text_col="Message", test_size=0.2):
  """Build a train/validation ``DatasetDict`` from labelled DataFrames.

  Args:
    df_train: DataFrame with the training rows, or the whole labelled set
      when ``need_split`` is True.
    df_val: DataFrame with the validation rows. Required when ``need_split``
      is False; replaced by the held-out part when ``need_split`` is True.
    need_split: if True, ``df_train`` is divided into train/validation parts
      with ``train_test_split``.
    random_seed: ``random_state`` handed to ``train_test_split``.
    label_col: name of the column that holds the class labels.
    text_col: name of the column that holds the input text.
    test_size: share of ``df_train`` held out for validation when
      ``need_split`` is True.

  Returns:
    A ``DatasetDict`` with the splits "train" and "val"; each split has the
    columns "label" and "text".

  Raises:
    ValueError: if ``need_split`` is False and ``df_val`` is not supplied.
  """
  if need_split:
    df_train, df_val = train_test_split(
        df_train, random_state=random_seed, test_size=test_size)  # returns 2 dataframes
  elif df_val is None:
    # Fail early with a clear message instead of a TypeError on None below.
    raise ValueError("df_val must be provided when need_split is False")

  def _to_dataset(frame):
    # Dataset.from_dict takes a column-name -> values mapping, so a plain dict
    # of lists is enough; no intermediate DataFrame is needed.
    return Dataset.from_dict({
        "label": list(frame[label_col]),
        "text": list(frame[text_col]),
    })

  return DatasetDict({"train": _to_dataset(df_train), "val": _to_dataset(df_val)})

In [None]:
# Read the labelled training set, cast the target column to int, and keep only
# the rows flagged antielite == 1 before building the train/val datasets.
df = (
    pd.read_csv('./merged_labeled_data/Germanic_train.csv', index_col=0)
    .astype({'peoplecentric': int})
    .loc[lambda frame: frame['antielite'] == 1]
)
data = prep_data(df, need_split=True)
data

In [None]:
# Checkpoint to fine-tune. Other checkpoints noted previously in this cell:
#   "./off_the_shelf_models/HateBERT", "bert-base-uncased",
#   "google-bert/bert-base-multilingual-cased", "distilbert/distilbert-base-uncased"
# NOTE(review): "bigscience/bloom" appears to be the full-size BLOOM checkpoint;
# confirm it fits the target hardware or whether a smaller variant was intended.
model_dir = "bigscience/bloom"
tokenizer = AutoTokenizer.from_pretrained(model_dir) # can be changed to any model hosted on Huggingface
# The collator is handed to the Trainer below together with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with ``truncation=True`` and no other options;
    its result is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply preprocess_function to every split of `data`; batched=True hands the
# function batches of examples rather than one example at a time.
tokenized_data = data.map(preprocess_function, batched=True)
# Quick manual check of the tokenizer on a single example:
# tokenizer(my_dataset_dict['test'][0]["text"], truncation=True)

In [None]:
# Load the checkpoint with a sequence-classification head.
# num_labels=2 is the binary setting; increase it for multi-class tasks.
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)

In [None]:
# import evaluate
# accuracy = evaluate.load("accuracy")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def compute_metrics(eval_pred):
    """Compute accuracy and positive-class F1 for one evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the classes on the last axis. If it is a
            tuple (a model returning extra outputs next to the logits), its
            first element is taken as the scores.

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``; the F1 value uses
        ``average='binary'``, i.e. it is reported for label 1.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Keep only the logits when extra outputs come along with them.
        predictions = predictions[0]
    # Classes live on the last axis; for the usual 2-D case this equals axis=1.
    predicted_classes = np.argmax(predictions, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1_score(labels, predicted_classes, average='binary'),  # report 1's f1
    }


# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Local checkpoint directory for this run.
    output_dir="germanic_peoplecentric_bloom",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and save on the same schedule (once per epoch).
    eval_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" is decided by
    # the library default rather than by the f1 computed in compute_metrics;
    # confirm that this is intended.
    load_best_model_at_end=True,
    # Requires being logged in to the Hugging Face Hub.
    push_to_hub=True,
)


# Wire the model, data splits, collator and metric function into a Trainer.
# `processing_class` is the current keyword for passing the tokenizer; the
# older `tokenizer=` keyword is deprecated in recent transformers releases
# (this keyword needs transformers >= 4.46).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning, then push the result to the Hugging Face Hub.
# Never paste API keys or tokens into notebook cells or comments.
trainer.train()
trainer.push_to_hub()