In [2]:
!pip install torch torchvision transformers



In [3]:
import pandas as pd
import torch

In [4]:
df = pd.read_csv('test_data.csv')

In [17]:
df.head()

Unnamed: 0,text,labels
0,london reuters lawmakers from britain s ru...,1
1,on friday more than half the nation will mour...,0
2,nothing says thanksgiving or family like a...,0
3,century wire asks will this be the beginning ...,0
4,florence italy reuters prime minister the...,1


In [6]:
df.shape

(4488, 2)

In [None]:
df.columns

# Tokenize Test-dataset

In [9]:
from transformers import DistilBertTokenizer

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("./model/fake_news_detection_model")

In [11]:
# Tokenization function
"""this function, tokenizes the text column of the dataset and returns a dictionary with tokenized values."""

def tokenize_data(data, max_len=128):
    encodings = tokenizer(
        data['text'].tolist(),
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    labels = torch.tensor(data['labels'].tolist(), dtype=torch.long)
    return encodings, labels

In [12]:
from torch.utils.data import Dataset

""" This class takes pre-tokenized data (encodings) and labels, then returns individual samples """

class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # Tokenized input_ids & attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return {
            'input_ids': self.encodings['input_ids'][index],
            'attention_mask': self.encodings['attention_mask'][index],
            'labels': self.labels[index]
        }

In [13]:
# Tokenize tes_data
test_encodings, test_labels = tokenize_data(df, max_len=128)

In [29]:
print(test_labels)

tensor([1, 0, 0,  ..., 0, 1, 0])


# Create Dataset Objects

In [15]:
test_dataset = FakeNewsDataset(test_encodings, test_labels)

In [30]:
print(test_dataset)

<__main__.FakeNewsDataset object at 0x7de4d190bbd0>


In [23]:
test_dataset.labels

tensor([1, 0, 0,  ..., 0, 1, 0])

# Load the fine-tuned model

In [24]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("./drive/MyDrive/news-model")


In [26]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [27]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Evaluation

In [33]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=None,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.evaluate()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


{'eval_loss': 0.006683704908937216,
 'eval_model_preparation_time': 0.0025,
 'eval_accuracy': 0.9986631016042781,
 'eval_runtime': 734.3411,
 'eval_samples_per_second': 6.112,
 'eval_steps_per_second': 0.764}

# Test the model on small subset of unseen data with trainer.predict

In [36]:
data = pd.read_csv('manual_testing.csv')

In [37]:
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,class
0,23471,Seven Iranians freed in the prisoner swap have...,"21st Century Wire says This week, the historic...",Middle-east,"January 20, 2016",0
1,23472,#Hashtag Hell & The Fake Left,By Dady Chery and Gilbert MercierAll writers ...,Middle-east,"January 19, 2016",0
2,23473,Astroturfing: Journalist Reveals Brainwashing ...,Vic Bishop Waking TimesOur reality is carefull...,Middle-east,"January 19, 2016",0
3,23474,The New American Century: An Era of Fraud,Paul Craig RobertsIn the last years of the 20t...,Middle-east,"January 19, 2016",0
4,23475,Hillary Clinton: ‘Israel First’ (and no peace ...,Robert Fantina CounterpunchAlthough the United...,Middle-east,"January 18, 2016",0


In [38]:
data.drop(['title', 'subject', 'date'], axis=1, inplace=True)

In [39]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,23471,"21st Century Wire says This week, the historic...",0
1,23472,By Dady Chery and Gilbert MercierAll writers ...,0
2,23473,Vic Bishop Waking TimesOur reality is carefull...,0
3,23474,Paul Craig RobertsIn the last years of the 20t...,0
4,23475,Robert Fantina CounterpunchAlthough the United...,0


In [40]:
data.reset_index(inplace=True)

In [42]:
data.drop(['index'], axis=1, inplace=True)

In [48]:
data.head()

Unnamed: 0,text,class
0,"21st Century Wire says This week, the historic...",0
1,By Dady Chery and Gilbert MercierAll writers ...,0
2,Vic Bishop Waking TimesOur reality is carefull...,0
3,Paul Craig RobertsIn the last years of the 20t...,0
4,Robert Fantina CounterpunchAlthough the United...,0


In [49]:
data.shape

(20, 2)

In [50]:
data.rename(columns={'class': 'labels'}, inplace=True)

In [51]:
data.head(10)

Unnamed: 0,text,labels
0,"21st Century Wire says This week, the historic...",0
1,By Dady Chery and Gilbert MercierAll writers ...,0
2,Vic Bishop Waking TimesOur reality is carefull...,0
3,Paul Craig RobertsIn the last years of the 20t...,0
4,Robert Fantina CounterpunchAlthough the United...,0
5,21st Century Wire says As 21WIRE reported earl...,0
6,21st Century Wire says It s a familiar theme. ...,0
7,Patrick Henningsen 21st Century WireRemember ...,0
8,21st Century Wire says Al Jazeera America will...,0
9,21st Century Wire says As 21WIRE predicted in ...,0


In [53]:
data.dropna(inplace=True)

In [54]:
data.shape

(20, 2)

In [55]:
data_encodings, data_labels = tokenize_data(data, max_len=128)

In [56]:
pred_dataset = FakeNewsDataset(data_encodings, data_labels)

In [57]:
pred_dataset.labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [58]:
from sklearn.metrics import classification_report
import numpy as np

predictions = trainer.predict(pred_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
true_labels = pred_dataset.labels

print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

