**Training Phase**


In [None]:
# Install necessary libraries
!pip install transformers[torch]
!pip install datasets
!pip install accelerate -U

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
import shutil
import os

import numpy as np
from sklearn.metrics import mean_squared_error
from datasets import load_metric
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig, AdamW, get_linear_schedule_with_warmup

In [None]:
from google.colab import auth
from google.colab import drive

# Mount Google Drive at /content/drive; the data and checkpoint paths used in later
# cells point inside this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)  # one output per entry in mapping_label

colnames = ['ID', 'text', 'label']
mapping_label = {'false': 0, 'true': 1, 'unverified': 2, 'non-rumor': 3}


def load_split(path):
    """Load one tab-separated split as a frame with `text` and integer `label` columns.

    Args:
        path: Location of a headerless TSV file whose columns are named by `colnames`.

    Returns:
        A DataFrame without the `ID` column, with `label` converted to the
        integer ids defined in `mapping_label`.

    Raises:
        ValueError: If any label is neither a key of `mapping_label` nor
            already one of its integer ids (this includes missing labels).
    """
    frame = pd.read_csv(path, sep='\t', names=colnames, header=None).drop('ID', axis=1)
    # Translate label names to ids. Values that are not known names pass through
    # unchanged so the check below can report them instead of silently keeping them.
    labels = frame['label'].map(lambda raw: mapping_label.get(raw, raw))
    invalid = ~labels.isin(list(mapping_label.values()))
    if invalid.any():
        raise ValueError(f"Unexpected labels in {path}: {labels[invalid].unique().tolist()}")
    frame['label'] = labels.astype(int)
    return frame


# Both splits go through the same loader so they cannot drift apart.
train_df = load_split('drive/MyDrive/1516/1516_cleaned.train')
test_df = load_split('drive/MyDrive/1516/1516_cleaned.test')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Batch tokenizer used with Dataset.map
def tokenize_function(examples):
    """Tokenize the `text` field of a batch with the module-level `tokenizer`.

    Sequences are truncated and padded with padding="max_length", so every
    example comes back with the same length.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Wrap each DataFrame in a `datasets.Dataset` and tokenize it in batches.
# (No splitting happens here: the train/test split comes from the two input files.)
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

In [None]:
# Training arguments: evaluate and checkpoint once per epoch, and reload the best
# checkpoint when training finishes (load_best_model_at_end=True).
# NOTE: this does NOT enable early stopping — all 25 epochs run. Early stopping would
# additionally need an EarlyStoppingCallback passed to the Trainer.
# NOTE(review): no metric_for_best_model is set, so "best" is presumably decided by the
# library default (evaluation loss) — confirm against the installed transformers version.
training_args = TrainingArguments(
    output_dir='./results_distilbert',
    num_train_epochs=25,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_distilbert',
    logging_steps=10,
    # eval_steps was dropped: it only applies to step-based evaluation and had no
    # effect with the per-epoch strategy below.
    load_best_model_at_end=True,
    evaluation_strategy="epoch",  # must match save_strategy for load_best_model_at_end
    save_strategy="epoch",
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are reduced with an
            argmax over the last axis to obtain predicted class ids.

    Returns:
        dict: ``{"accuracy": float, "f1score": float}``, where ``f1score`` is the
        support-weighted average of the per-class F1 scores.
    """
    # Imported here so the cell is self-contained; sklearn.metrics is already a
    # dependency of this notebook.
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # scikit-learn is called directly rather than through the deprecated
    # datasets.load_metric, which re-loaded both metrics on every evaluation pass.
    return {
        "accuracy": float(accuracy_score(labels, predictions)),
        "f1score": float(f1_score(labels, predictions, average="weighted")),
    }

In [None]:
# Build the Trainer from the model, training arguments, tokenized datasets and metric
# function defined in the earlier cells.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so checkpoint selection appears to be driven by
# test-set performance — consider a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# NOTE(review): `optimizer` and `scheduler` are created here but never handed to
# `trainer` (it was built without an `optimizers=` argument), so as written they do not
# appear to influence trainer.train(). Either pass them to the Trainer or drop this cell.
optimizer = AdamW(model.parameters(), lr=5e-5)
# Round up: a trailing partial batch still counts as a step. 1557 examples at batch
# size 4 give 390 steps per epoch, which matches the checkpoint-4680 (= 12 * 390)
# loaded later. Assumes default dataloader settings (no drop_last, no gradient
# accumulation) — TODO confirm.
steps_per_epoch = -(-len(train_dataset) // training_args.per_device_train_batch_size)
num_training_steps = steps_per_epoch * training_args.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=training_args.warmup_steps,
                                            num_training_steps=num_training_steps)



In [None]:
# Run fine-tuning with the Trainer configured above; checkpoints are written under
# the output_dir set in training_args ('./results_distilbert').
trainer.train()


In [None]:
# Persist the trainer's current model to Drive under <model_dir>/distilbert.
model_dir = '/content/drive/MyDrive/test/'
saved_model_path = os.path.join(model_dir, 'distilbert')
trainer.save_model(saved_model_path)

In [None]:
# Back up every training checkpoint to Drive.
# dirs_exist_ok=True lets the cell be re-run after an earlier copy instead of failing
# with FileExistsError; files already present at the destination are overwritten.
shutil.copytree('results_distilbert', 'drive/MyDrive/DistilBert(tw16)', dirs_exist_ok=True)

**Inference Test**


In [None]:
# Load the fine-tuned classifier from a saved checkpoint, plus the base tokenizer.
# NOTE(review): this checkpoint directory differs from the one the training section
# copies results to ('drive/MyDrive/DistilBert(tw16)') — confirm the path is current.
checkpoint_path = 'drive/MyDrive/DistilBert_checkpoint/checkpoint-4680'
base_tokenizer_name = 'distilbert-base-uncased'

new_model = DistilBertForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = DistilBertTokenizer.from_pretrained(base_tokenizer_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# First sample input: a single-sentence batch used to smoke-test the classifier.
test = ["This is to just test the rumor efficiency of the model"]

In [None]:
# Second sample input: a single-sentence batch with a news-style claim.
test1 =["paul walker died at the age of 40 saturday. our hearts go out to his family and friends"]

In [None]:
# Inspect the input ids and attention mask the tokenizer produces for the first sample.
tokenizer(test)

{'input_ids': [[101, 2023, 2003, 2000, 2074, 3231, 1996, 19075, 8122, 1997, 1996, 2944, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# Inspect the input ids and attention mask the tokenizer produces for the second sample.
tokenizer(test1)

{'input_ids': [[101, 2703, 5232, 2351, 2012, 1996, 2287, 1997, 2871, 5095, 1012, 2256, 8072, 2175, 2041, 2000, 2010, 2155, 1998, 2814, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
import torch

In [None]:
# Forward pass on the first sample with gradient tracking disabled.
# Despite its name, `logits` receives the whole model output object; the name is kept
# because later cells read it.
encoded_test = tokenizer(test, return_tensors="pt")
with torch.no_grad():
    logits = new_model(**encoded_test)

In [None]:
# Forward pass on the second sample (test1) with gradient tracking disabled.
# This overwrites the `logits` value produced for the first sample.
encoded_test1 = tokenizer(test1, return_tensors="pt")
with torch.no_grad():
    logits = new_model(**encoded_test1)

In [None]:
# Display the model output held in `logits` (the full output object, not just the
# logits tensor); it reflects whichever inference cell ran most recently.
logits

SequenceClassifierOutput(loss=None, logits=tensor([[-3.5655,  8.8932, -3.4322, -2.9430]]), hidden_states=None, attentions=None)

In [None]:
# Class id -> label name, listed in id order (0..3) for decoding predictions.
mappinglabel = dict(enumerate(('false', 'true', 'unverified', 'non-rumor')))

In [None]:
# Predicted label for the first sample (`test`).
# The forward pass is run here rather than read from the shared `logits` variable,
# which the test1 inference cell overwrites; this keeps the result tied to `test`
# under a top-to-bottom run.
with torch.no_grad():
    test_output = new_model(**tokenizer(test, return_tensors="pt"))
mappinglabel[int(test_output.logits.argmax(dim=-1)[0])]

'false'

In [None]:
# Predicted label for the second sample (`test1`).
# The forward pass is run here so the result does not depend on which inference cell
# last assigned the shared `logits` variable.
with torch.no_grad():
    test1_output = new_model(**tokenizer(test1, return_tensors="pt"))
mappinglabel[int(test1_output.logits.argmax(dim=-1)[0])]

'true'