**Parameter Check**


In [None]:
!pip install datasets
!pip install accelerate -U

import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
from datasets import load_metric
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig, AdamW, get_linear_schedule_with_warmup

In [None]:
from google.colab import auth
from google.colab import drive

# The interactive Google authentication call is commented out; only the Drive mount runs.
#auth.authenticate_user()
# Mount Google Drive at /content/drive. Later cells use relative paths of the form
# 'drive/MyDrive/...', so they presumably rely on the working directory being
# /content — TODO confirm.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Column names applied to the tab-separated split files, which are read without a header row.
colnames = ["ID", "text", "label"]

# Integer class id for each label string, numbered in the order listed here.
mapping_label = {
    name: class_id
    for class_id, name in enumerate(("false", "true", "unverified", "non-rumor"))
}

In [None]:
def _read_split(path):
    """Read one tab-separated split file and return a frame of text and integer labels.

    The file is parsed without a header row using ``colnames``, the ``ID``
    column is dropped, and the label strings are replaced by their integer ids
    from ``mapping_label`` before the column is cast to ``int``.
    """
    frame = pd.read_csv(path, sep='\t', names=colnames, header=None)
    frame = frame.drop(columns='ID')
    frame['label'] = frame['label'].replace(mapping_label).astype(int)
    return frame


train_df = _read_split('drive/MyDrive/1516/1516_cleaned.train')
test_df = _read_split('drive/MyDrive/1516/1516_cleaned.test')

In [None]:
# Dev split: read the TSV, drop the ID column and turn label strings into integer ids.
# NOTE(review): this file is named '1516_cleaned1.dev' while the train/test files are
# '1516_cleaned.*' — confirm the extra '1' is intentional.
dev_df = (
    pd.read_csv('drive/MyDrive/1516/1516_cleaned1.dev', sep='\t', names=colnames, header=None)
    .drop(columns='ID')
    .assign(label=lambda frame: frame['label'].replace(mapping_label).astype(int))
)
# Wrap the frame in a datasets.Dataset so it can be tokenized and handed to the Trainer.
dev_dataset = Dataset.from_pandas(dev_df)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to the tokenizer's maximum length.
    ``tokenizer`` is looked up when the function is called, so it only has to
    exist by the time the datasets are mapped.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding="max_length")

In [None]:
# Wrap the train and test DataFrames in datasets.Dataset objects (no splitting happens here).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [None]:

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else 0.0


def compute_metrics(eval_pred, positive_label=1):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    The predicted class of each sample is the argmax of its logits.

    * ``accuracy`` is the share of *all* samples predicted correctly, whatever
      the number of classes.
    * ``TP``/``FP``/``FN``/``TN`` are one-vs-rest counts for ``positive_label``:
      every sample whose true (or predicted) class is not ``positive_label``
      counts as negative. ``precision``, ``recall`` and ``f1score`` are derived
      from these counts.

    For two-class data (labels 0/1) this is exactly the usual binary
    confusion-matrix arithmetic. With more classes (``mapping_label`` defines
    four) no sample is left out of the totals, which would happen if only the
    top-left 2x2 corner of the multi-class confusion matrix were read.

    Args:
        eval_pred: Iterable unpacking to ``(logits, labels)``; ``logits`` has
            the class scores on its last axis.
        positive_label: Class id treated as the positive class. Defaults to 1.

    Returns:
        dict with keys ``accuracy``, ``f1score``, ``recall``, ``precision``
        (built-in floats) and ``TP``, ``TN``, ``FP``, ``FN`` (built-in ints).
        A ratio whose denominator is zero is reported as 0.0.
    """
    logits, labels = eval_pred
    predictions = np.asarray(np.argmax(logits, axis=-1)).ravel()
    labels = np.asarray(labels).ravel()

    total = int(labels.size)
    correct = int(np.count_nonzero(predictions == labels))

    # One-vs-rest masks for the positive class.
    actual_pos = labels == positive_label
    predicted_pos = predictions == positive_label

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FP = int(np.count_nonzero(~actual_pos & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & ~predicted_pos))
    # Everything that is neither truly nor predicted positive.
    TN = total - TP - FP - FN

    return {
        "accuracy": _safe_ratio(correct, total),
        "f1score": _safe_ratio(2 * TP, 2 * TP + FP + FN),
        "recall": _safe_ratio(TP, TP + FN),
        "precision": _safe_ratio(TP, TP + FP),
        "TP": TP,
        "TN": TN,
        "FP": FP,
        "FN": FN,
    }

In [None]:
# Load the tokenizer for 'distilbert-base-uncased'. tokenize_function reads this
# module-level name when it is called.
# NOTE(review): assumes the fine-tuned checkpoint evaluated below was trained with this
# same vocabulary — confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:

# Tokenize every split in batches; each name is rebound to its tokenized Dataset.
train_dataset, test_dataset, dev_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset, dev_dataset)
)

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/231 [00:00<?, ? examples/s]

In [None]:
# Load the sequence-classification model from a saved checkpoint directory on Drive
# (presumably the state at training step 4680, judging by the folder name — TODO confirm).
new_model = DistilBertForSequenceClassification.from_pretrained('drive/MyDrive/DistilBert_checkpoint/checkpoint-4680')


In [None]:
# Build a Trainer around the loaded checkpoint. No TrainingArguments are passed, so
# the library defaults apply. Evaluation uses the dev split and is scored by
# compute_metrics.
trainer = Trainer(
    model=new_model,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Evaluate on the Trainer's eval_dataset (the dev split) and display the returned metric dict.
trainer.evaluate()

{'eval_loss': 1.6467111110687256,
 'eval_accuracy': 0.9,
 'eval_f1score': 0.9056603773584906,
 'eval_recall': 0.9056603773584906,
 'eval_precision': 0.9056603773584906,
 'eval_TP': 48,
 'eval_TN': 42,
 'eval_FP': 5,
 'eval_FN': 5,
 'eval_runtime': 4.4088,
 'eval_samples_per_second': 52.395,
 'eval_steps_per_second': 6.578}