## Initialize

In [None]:
!pip install sentencepiece transformers transformers[sentencepiece] pandas datasets evaluate tensorflow torch huggingface_hub

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab VM; the project folder used by this notebook
# lives under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project folder on the mounted Drive. The trailing slash matters: file names are
# appended to this string with `+` when the CSV is loaded.
main_dir = '/content/gdrive/MyDrive/fallacy_classifier/'

In [None]:
# Interactive Hugging Face Hub login (prompts for an access token).
# Presumably required by the push_to_hub calls later in the notebook.
!huggingface-cli login

## Data

In [None]:
import pandas as pd
from datasets import Dataset
import numpy as np
import evaluate
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

In [None]:
# Load the raw dataset from the project folder and preview the first rows.
csv_path = main_dir + 'edu_all.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Keep only the model inputs: discard the bookkeeping columns, then rename the
# remaining ones to the `label` / `text` names used by the rest of the notebook.
df = (
    df
    .drop(columns=['original_url', 'old_label', 'explanations', 'rationale'])
    .rename(columns={'updated_label': 'label', 'source_article': 'text'})
)
df.head()

In [None]:
# Convert the frame to a Hugging Face Dataset and turn the `label` column into an
# integer-encoded class column; keep the class names for the id <-> label maps.
dataset = Dataset.from_pandas(df).class_encode_column('label')
class_names = dataset.features['label'].names
class_names

In [None]:
# Hold out 20% of the rows, stratified by class so both splits keep the same label
# distribution; the fixed seed makes the split repeatable.
dataset = dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)

In [None]:
# Display the label feature of the training split to check the class encoding.
dataset['train'].features['label']

In [None]:
# Tokenizer of the same checkpoint that is fine-tuned in the Train section.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the sequence classifier.

    Args:
        examples: A batch as passed by `Dataset.map(..., batched=True)`; only its
            "text" entry is read.

    Returns:
        The tokenizer's encoding of the batch, with over-long inputs truncated.

    Sequences are deliberately left unpadded here. Padding at this stage would
    stretch every example to the longest text of the whole map batch; padding is
    instead applied per training batch by the DataCollatorWithPadding.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize every split in batches; the bare expression displays a summary of the result.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

In [None]:
# Pads each batch to its own longest sequence. The collator returns PyTorch tensors
# by default; these batches feed a TensorFlow dataset, so ask for NumPy arrays instead.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="np")

In [None]:
# Two-way mapping between integer class ids and class names, passed to the model config.
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}
id2label

## Train

In [None]:
from transformers import create_optimizer
from transformers import TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback
import tensorflow as tf
from keras.callbacks import TensorBoard, EarlyStopping

In [None]:
# Pretrained DistilBERT encoder with a fresh classification head, one output per
# fallacy class. The id <-> label maps are stored in the model config so that
# predictions can be reported by class name.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(class_names),  # was `class_labels`, which is never defined (NameError)
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training hyperparameters. The learning-rate schedule needs the total number of
# optimizer steps, derived from the training-set size, batch size and epoch count.
batch_size = 16
num_epochs = 8
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
# No `loss` is passed: Transformers TF models fall back to their built-in task loss.
# Accuracy must be compiled in explicitly, otherwise Keras never logs `val_accuracy`
# and the EarlyStopping callback that monitors it has nothing to act on.
model.compile(optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Wrap the tokenized splits as tf.data pipelines. Both use the shared `batch_size`
# so the batches stay consistent with the step count computed for the LR schedule.
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# The held-out split is used for validation; it is not shuffled.
tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
# Stop once validation accuracy has failed to improve by at least 0.01 for 2 epochs.
# NOTE: `val_accuracy` is only logged when the model is compiled with an accuracy metric.
early_stopping_callback = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.01,
    patience=2,
)

# Write per-batch TensorBoard logs (graph, weight histograms and images) under ./logs.
tensorboard_callback = TensorBoard(
    log_dir='logs',
    histogram_freq=1,
    write_graph=True,
    write_images=True,
    update_freq='batch',
)

callbacks = [early_stopping_callback, tensorboard_callback]

In [None]:
# Fine-tune for at most `num_epochs` epochs, validating on the held-out split;
# the callbacks handle early stopping and TensorBoard logging.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

In [None]:
# Publish the fine-tuned TensorFlow weights.
model.push_to_hub("q3fer/distilbert-base-fallacy-classification", commit_message="Upload Tensorflow model")
# Publish the tokenizer to the same repo: the Inference section loads the tokenizer
# from this repo id, which fails on a fresh run if only the model was uploaded.
tokenizer.push_to_hub("q3fer/distilbert-base-fallacy-classification", commit_message="Upload tokenizer")

## Inference

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

In [None]:
# Reload the published checkpoint as a PyTorch model (converting the TensorFlow
# weights on load) and upload the PyTorch weights to the same Hub repo.
hub_repo_id = "q3fer/distilbert-base-fallacy-classification"

tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)
model = AutoModelForSequenceClassification.from_pretrained(hub_repo_id, from_tf=True)

model.push_to_hub(hub_repo_id, commit_message="Upload Pytorch model")

In [None]:
# Example sentence to classify, encoded as PyTorch tensors for the model call below.
text = "We know that the earth is flat because it looks and feels flat."
inputs = tokenizer(text, return_tensors='pt')

In [None]:
# Score the example and list every class from most to least probable.
with torch.no_grad():
    outputs = model(**inputs)
    # Logits of the first (only) sequence in the batch, normalised to probabilities.
    scores = torch.softmax(outputs.logits[0], dim=0)

    # Asking top-k for all classes yields the class ids sorted by descending score.
    _, ranking = torch.topk(scores, k=scores.shape[0])
    ranking = ranking.tolist()

results = [
    f"{position}) {model.config.id2label[class_id]} {scores[class_id]:.4f}"
    for position, class_id in enumerate(ranking, start=1)
]
print('\n'.join(results))

In [None]:
# Same example through the high-level pipeline API, loading model and tokenizer
# from the Hub repo by id.
text = "We know that the earth is flat because it looks and feels flat."
model_path = "q3fer/distilbert-base-fallacy-classification"
pipe = pipeline("text-classification", model=model_path, tokenizer=model_path)
pipe(text)

## Evaluation

In [None]:
from evaluate import evaluator

# Evaluate the published model on the held-out split with the text-classification evaluator.
task_evaluator = evaluator("text-classification")

eval_results = task_evaluator.compute(
    model_or_pipeline="q3fer/distilbert-base-fallacy-classification",
    data=dataset["test"],
    # label2id maps class names to the integer ids used in the dataset's label column.
    label_mapping=label2id
)

eval_results