In [None]:
from google.colab import drive
# Mount Google Drive so the dataset and checkpoints under /content/drive are reachable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
! nvidia-smi
! pip install transformers -q

Mon Mar  7 15:15:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import re, string

def lower(text):
  """Return ``text`` converted to lower case."""
  lowered = text.lower()
  return lowered

def remove_urls(text):
    """Strip URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. Surrounding whitespace is
    left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_nonascii(sent):
  """Return ``sent`` with every non-ASCII character dropped."""
  ascii_only = (ch for ch in sent if ch.isascii())
  return "".join(ascii_only)

def remove_punctuations(text):
  """Delete every character that is neither a word character nor whitespace.

  Because the pattern is built on ``\\w``, underscores are kept.
  """
  non_word_non_space = re.compile(r'[^\w\s]')
  return non_word_non_space.sub('', text)

def remove_num(text):
  """Return ``text`` without any digit characters (as judged by ``str.isdigit``)."""
  kept = []
  for ch in text:
    if ch.isdigit():
      continue
    kept.append(ch)
  return "".join(kept)

def remove_mul_space(text):
  """Collapse every run of whitespace into one space and trim both ends."""
  tokens = text.split()
  return " ".join(tokens)

def clean(text):
  """Normalise one raw article string.

  Applies, in order: URL removal, non-ASCII character removal and whitespace
  collapsing. The helpers ``lower``, ``remove_punctuations`` and ``remove_num``
  exist but are not applied here, so case, punctuation and digits are kept.
  """
  for step in (remove_urls, remove_nonascii, remove_mul_space):
    text = step(text)
  return text

In [None]:
# Import the WELFake dataset used for fine-tuning.
import pandas as pd

welfake_csv_path = r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/WELFake/WELFake.csv"
df = pd.read_csv(welfake_csv_path)

In [None]:
# Merge headline and body into one text field; missing values become empty strings.
title_part = df['title'].fillna('')
body_part = df['text'].fillna('')
df['total_text'] = title_part + " " + body_part

# Keep only the model input and the target, then drop rows whose combined text repeats.
model_columns = ['total_text', 'label']
df = df[model_columns].copy(deep=True).drop_duplicates(subset=["total_text"])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63678 entries, 0 to 72132
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   total_text  63678 non-null  object
 1   label       63678 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [None]:
# Run the text-cleaning pipeline over every combined article string.
cleaned_text = df['total_text'].apply(clean)
df['total_text'] = cleaned_text

In [None]:
texts = df['total_text'].tolist()
labels = df['label'].tolist()

from sklearn.model_selection import train_test_split

RANDOM_SEED = 42

# Fraction of the data kept out of training; it is halved below into equally
# sized validation and test sets. The recorded run in this notebook trained on
# 54126 examples and evaluated on 4776 (out of 63678 rows), i.e. a 15% hold-out.
# The previous `test_size=.8` left only 20% of the rows for training.
HOLDOUT_FRACTION = 0.15

# Both splits are stratified on the label so class proportions are preserved.
train_texts, other_texts, train_labels, other_labels = train_test_split(
    texts, labels, test_size=HOLDOUT_FRACTION, stratify=labels, random_state=RANDOM_SEED
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    other_texts, other_labels, test_size=.5, stratify=other_labels, random_state=RANDOM_SEED
)

# Sanity check: the three splits must partition the data set exactly.
assert len(train_texts) + len(val_texts) + len(test_texts) == len(texts)

In [None]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer published under the `distilbert-base-uncased` checkpoint name.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize each split with identical settings (truncation and padding enabled).
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values are indexable per example and
    ``labels`` is an indexable sequence; the dataset length is ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset(train_encodings, train_labels),
    Dataset(val_encodings, val_labels),
    Dataset(test_encodings, test_labels),
)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np

def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    ``p`` is unpacked into ``(predictions, labels)``. The predicted class is
    the argmax of the predictions along axis 1. Returns a dict with the keys
    ``accuracy``, ``precision``, ``recall`` and ``f1``, each computed with the
    scikit-learn scorer's default settings.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "precision": precision_score(y_true=labels, y_pred=predicted),
        "recall": recall_score(y_true=labels, y_pred=predicted),
        "f1": f1_score(y_true=labels, y_pred=predicted),
    }

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Where checkpoints and logs for this run are written.
results_dir = r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2"
logs_dir = r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/logs_improved_v2"

# Training configuration: evaluate every 250 steps, log every 10 steps, and
# reload the best checkpoint once training ends.
args = TrainingArguments(
    output_dir=results_dir,
    logging_dir=logs_dir,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=250,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    seed=0,
    load_best_model_at_end=True,
)

# Start from the pretrained checkpoint; the classification head is newly initialized.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Early stopping with a patience of 10.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# NOTE: a previous, fully commented-out configuration wrote to the
# `results_improved` / `logs_improved` directories and used warmup_steps=500 and
# weight_decay=0.01, with no evaluation strategy, metrics or early stopping.

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [None]:
%%time
# trainer.train(resume_from_checkpoint=r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results/checkpoint-7500")
trainer.train(resume_from_checkpoint = True)
# trainer.train()

Loading model from /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-5500).
***** Running training *****
  Num examples = 54126
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 16915
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 5500
  Will skip the first 1 epochs then the first 2117 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/2117 [00:00<?, ?it/s]

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
5750,0.0438,0.040028,0.992881,0.997203,0.987079,0.992115
6000,0.0097,0.034728,0.993928,0.994907,0.991694,0.993298
6250,0.0167,0.034934,0.993719,0.991264,0.994924,0.993091
6500,0.0014,0.020997,0.994975,0.996755,0.992155,0.99445
6750,0.0331,0.022548,0.995812,0.995843,0.994924,0.995383
7000,0.0452,0.03237,0.9933,0.989454,0.995847,0.99264


***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
Saving model checkpoint to /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-6000
Configuration saved in /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-6000/config.json
Model weights saved in /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-6000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
Saving model checkpoint to /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-6500
Configuration saved in /content/drive/Shareddr

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
5750,0.0438,0.040028,0.992881,0.997203,0.987079,0.992115
6000,0.0097,0.034728,0.993928,0.994907,0.991694,0.993298
6250,0.0167,0.034934,0.993719,0.991264,0.994924,0.993091
6500,0.0014,0.020997,0.994975,0.996755,0.992155,0.99445
6750,0.0331,0.022548,0.995812,0.995843,0.994924,0.995383
7000,0.0452,0.03237,0.9933,0.989454,0.995847,0.99264
7250,0.0322,0.034093,0.992253,0.985415,0.997693,0.991516
7500,0.0001,0.024624,0.994556,0.991732,0.996308,0.994015
7750,0.0001,0.036275,0.994137,0.996288,0.990771,0.993522


***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64
Saving model checkpoint to /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-7500
Configuration saved in /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-7500/config.json
Model weights saved in /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-7500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/results_improved_v2/checkpoint-6500 (score: 0.02099708653986454).


CPU times: user 1h 19min 33s, sys: 20.5 s, total: 1h 19min 54s
Wall time: 1h 20min 27s


TrainOutput(global_step=7750, training_loss=0.004141122956713286, metrics={'train_runtime': 4809.1596, 'train_samples_per_second': 56.274, 'train_steps_per_second': 3.517, 'total_flos': 1.6425427563749376e+16, 'train_loss': 0.004141122956713286, 'epoch': 2.29})

In [None]:
# Evaluate the current model on the trainer's evaluation dataset and display the metrics.
eval_metrics = trainer.evaluate()
eval_metrics

***** Running Evaluation *****
  Num examples = 4776
  Batch size = 64


{'epoch': 2.29,
 'eval_accuracy': 0.9949748743718593,
 'eval_f1': 0.9944495837187789,
 'eval_loss': 0.02099708653986454,
 'eval_precision': 0.9967547519703291,
 'eval_recall': 0.9921550530687586,
 'eval_runtime': 160.9609,
 'eval_samples_per_second': 29.672,
 'eval_steps_per_second': 0.466}

In [None]:
# Persist the fine-tuned model to the shared drive.
final_model_dir = r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved V3"
trainer.save_model(final_model_dir)

In [None]:
import numpy as np

# Run the model over the held-out test split.
predictions = trainer.predict(test_dataset)

# Turn the raw per-class scores into predicted class ids (argmax over the last axis).
test_scores = predictions.predictions
preds = np.argmax(test_scores, axis=-1)

***** Running Prediction *****
  Num examples = 4776
  Batch size = 64


In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split.
test_report = classification_report(test_labels, preds)
print(test_report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2610
           1       0.99      0.98      0.99      2166

    accuracy                           0.99      4776
   macro avg       0.99      0.99      0.99      4776
weighted avg       0.99      0.99      0.99      4776

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2610
           1       0.99      0.98      0.99      2166

    accuracy                           0.99      4776
   macro avg       0.99      0.99      0.99      4776
weighted avg       0.99      0.99      0.99      4776

