In [1]:
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  import pkg_resources
  _torch_pytree._register_pytree_node(


In [2]:
import accelerate
print(accelerate.__version__)

0.20.3


In [3]:
df = pd.read_csv("../Data/preprocessed/fakenews_preprocessed.csv")

In [4]:
texts = df["text"].fillna("").astype(str).tolist()
labels = df["real"].astype(int).tolist()

In [5]:
df["real"].value_counts()

real
1    35619
0    26507
Name: count, dtype: int64

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
max_len = 128



In [8]:
train_enc = tokenizer(
    train_texts,
    padding="max_length",
    truncation=True,
    max_length=max_len
)

In [9]:
val_enc = tokenizer(
    val_texts,
    padding="max_length",
    truncation=True,
    max_length=max_len
)

In [10]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels  # list of ints

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Each field in encodings is a list/array of token ids per example
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [11]:
train_dataset = FakeNewsDataset(train_enc, train_labels)
val_dataset   = FakeNewsDataset(val_enc,   val_labels)

In [12]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc  = accuracy_score(labels, preds)
    prec = precision_score(labels, preds)
    rec  = recall_score(labels, preds)
    f1   = f1_score(labels, preds)

    return {
        "accuracy":  acc,
        "precision": prec,
        "recall":    rec,
        "f1":        f1
    }

In [14]:
training_args = TrainingArguments(
    output_dir="./distilbert_fake_news_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    logging_dir="./logs",

    # CPU fixes
    no_cuda=True,
    fp16=False,
    bf16=False,
    torch_compile=False,
)



In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [16]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0002,0.012607,0.997103,0.997613,0.997333,0.997473
2,0.0001,0.01465,0.997103,0.996915,0.998035,0.997475




TrainOutput(global_step=12426, training_loss=0.02105777116429658, metrics={'train_runtime': 20476.4314, 'train_samples_per_second': 4.854, 'train_steps_per_second': 0.607, 'total_flos': 3291814856601600.0, 'train_loss': 0.02105777116429658, 'epoch': 2.0})

In [17]:
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 0.014650153927505016, 'eval_accuracy': 0.9971028488652824, 'eval_precision': 0.9969153112731352, 'eval_recall': 0.998034811903425, 'eval_f1': 0.9974747474747475, 'eval_runtime': 653.7447, 'eval_samples_per_second': 19.007, 'eval_steps_per_second': 2.377, 'epoch': 2.0}


In [18]:
trainer.save_model("distilbert_fake_news")
tokenizer.save_pretrained("distilbert_tokenizer_fake_news")

('distilbert_tokenizer_fake_news\\tokenizer_config.json',
 'distilbert_tokenizer_fake_news\\special_tokens_map.json',
 'distilbert_tokenizer_fake_news\\vocab.txt',
 'distilbert_tokenizer_fake_news\\added_tokens.json',
 'distilbert_tokenizer_fake_news\\tokenizer.json')

In [19]:
text1 = "The Associated Press and reams of other media outlets reported that JD Vance said ‚Äúschool shootings are a ‚Äòfact of life‚Äô.In fact, Vance said that ‚Äúpsychos‚Äù who ‚Äúwant to make headlines‚Äù are a ‚Äúfact of life‚Äù‚Äînot ‚Äúschool shootings.‚Äù He then said, ‚ÄúWe have got to bolster security at our schools."

In [20]:
text2 = "CNN‚Äôs Jake Tapper reported that Donald Trump said ‚Äúthat as commander in chief, he will contemplate using the United States military or National Guard to go after his political opponents, including Democrats‚Äù like ‚ÄúAdam Schiff. In fact, Trump was answering a question about ‚Äúagitators‚Äù who would sow ‚Äúchaos on election day,‚Äù like the ‚ÄúAfghan refugee charged with plotting a U.S. election day massacre.‚Äù He was not talking about Americans who ‚Äúdon‚Äôt support him‚Äù but ‚Äúsick people, radical-left lunatics,‚Äù who‚Äôve rioted, committed arson, and murdered people. Furthermore, he was talking about the 2024 election while the military is not under his command."

In [21]:
labels = df["real"].astype(int).tolist()
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [22]:
df["real"].value_counts()

real
1    35619
0    26507
Name: count, dtype: int64

In [26]:
set(labels)

{0, 1}

In [23]:
LABELS = ["Fake", "Real"]

In [24]:
def predict_text(text):
    model.eval()
    inputs = tokenizer(
        [text],
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)[0]

    pred_idx = probs.argmax().item()
    confidence = probs[pred_idx].item()

    return LABELS[pred_idx], confidence

In [25]:
label1, conf1 = predict_text(text1)
label2, conf2 = predict_text(text2)

print("Text 1 Prediction:", label1, f"(confidence {conf1:.4f})")
print("Text 2 Prediction:", label2, f"(confidence {conf2:.4f})")


Text 1 Prediction: Real (confidence 1.0000)
Text 2 Prediction: Real (confidence 1.0000)


In [27]:
df["real"].value_counts(normalize=True)

real
1    0.573335
0    0.426665
Name: proportion, dtype: float64

In [28]:
df.sample(10)[["text","real"]]

Unnamed: 0,text,real
9902,,0
39684,Venezuela s vice president said on Wednesday t...,1
35970,"Argentina will not increase defense spending, ...",1
61922,President Barack Obama has started to intervie...,1
49331,"Donald Trump, at the second presidential debat...",0
18365,(Reuters) - Highlights of the day for U.S. Pre...,1
41225,Apple Inc (AAPL.O) Chief Executive Tim Cook‚Äôs ...,1
52319,Despite recent assertions from certain Democra...,0
26805,TOKYO (Reuters) - A Japanese court sentenced a...,1
56362,Somebody buy that cop a beer Antifa protesters...,0


In [29]:
df.index.is_monotonic_increasing

True

In [30]:
df[df["real"] == 0].head(5)

Unnamed: 0,title,text,subject,real,clean_title,clean_text,text_len,avg_word_len,num_sents,polarity,...,flesch_grade,gunning_fog,smog,ari,coleman_liau,flesch_interpretation,punct_count,word_count,punct_ratio,entity_count
0,Donald Trump Sends Out Embarrassing New Year‚Äô...,Donald Trump just couldn t wish all Americans ...,News,0,donald trump sends out embarrassing new ear s...,donald trump just couldn t wish all americans ...,503,4.337972,1,-0.014387,...,8.726552,10.363443,11.812371,9.679789,9.062195,Standard - 8th-9th grade,121,495,0.244444,28
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,0,drunk bragging trump staffer started russian ...,house intelligence committee chairman devin nu...,309,4.951456,1,0.033536,...,10.942923,13.07286,12.745085,11.595993,11.947541,Difficult - College,39,305,0.127869,24
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,0,sheriff david clarke becomes an internet joke...,on frida it was revealed that former milwaukee...,598,4.658863,1,-0.020018,...,8.61687,10.647496,11.645159,10.957494,10.928843,Standard - 8th-9th grade,148,580,0.255172,37
3,Trump Is So Obsessed He Even Has Obama‚Äôs Name...,"On Christmas day, Donald Trump announced that ...",News,0,trump is so obsessed he even has obama s name...,on christmas da donald trump announced that he...,458,4.665939,1,-0.026923,...,9.966244,12.498177,13.023867,11.788649,11.285261,Fairly Difficult - 10th-12th grade,118,444,0.265766,26
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,0,pope francis just called out donald trump dur...,pope francis used his annual christmas da mess...,425,4.322353,1,-0.06981,...,9.76619,12.209524,11.50563,10.522929,8.655714,Standard - 8th-9th grade,40,420,0.095238,24


In [31]:
df[df.duplicated("text", keep=False)].head(10)

Unnamed: 0,title,text,subject,real,clean_title,clean_text,text_len,avg_word_len,num_sents,polarity,...,flesch_grade,gunning_fog,smog,ari,coleman_liau,flesch_interpretation,punct_count,word_count,punct_ratio,entity_count
0,Donald Trump Sends Out Embarrassing New Year‚Äô...,Donald Trump just couldn t wish all Americans ...,News,0,donald trump sends out embarrassing new ear s...,donald trump just couldn t wish all americans ...,503,4.337972,1,-0.014387,...,8.726552,10.363443,11.812371,9.679789,9.062195,Standard - 8th-9th grade,121,495,0.244444,28
3,Trump Is So Obsessed He Even Has Obama‚Äôs Name...,"On Christmas day, Donald Trump announced that ...",News,0,trump is so obsessed he even has obama s name...,on christmas da donald trump announced that he...,458,4.665939,1,-0.026923,...,9.966244,12.498177,13.023867,11.788649,11.285261,Fairly Difficult - 10th-12th grade,118,444,0.265766,26
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,0,racist alabama cops brutali e black bo while ...,the number of cases of cops brutali ing and ki...,316,4.297468,1,-0.010088,...,8.99641,11.646154,11.208143,9.771635,8.466026,Standard - 8th-9th grade,34,312,0.108974,7
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,0,fresh off the golf course trump lashes out at...,donald trump spent a good portion of his da at...,355,4.802817,1,-0.005357,...,10.575604,13.092344,12.759959,11.945147,11.157386,Fairly Difficult - 10th-12th grade,69,352,0.196023,25
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,0,trump said some insanel racist stuff inside t...,in the wake of et another court decision that ...,379,4.659631,1,0.034151,...,14.111809,16.356465,14.985894,16.067948,10.834574,Difficult - College,46,376,0.12234,18
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,0,watch brandnew protrump ad features so much a...,just when ou might have thought we d get a bre...,295,4.355932,1,0.148958,...,8.836211,11.065076,11.442367,9.197814,9.067832,Standard - 8th-9th grade,36,286,0.125874,9
10,"Papa John‚Äôs Founder Retires, Figures Out Raci...","A centerpiece of Donald Trump s campaign, and ...",News,0,papa john s founder retires figures out racis...,a centerpiece of donald trump s campaign and n...,371,4.218329,1,-0.01498,...,10.308876,11.96676,11.578366,11.373506,8.76648,Standard - 8th-9th grade,45,358,0.125698,15
11,WATCH: Paul Ryan Just Told Us He Doesn‚Äôt Care...,Republicans are working overtime trying to sel...,News,0,watch paul r an just told us he doesn t care ...,republicans are working overtime tr ing to sel...,286,4.465035,1,0.140865,...,12.02682,14.762438,13.463847,14.304805,10.881319,Fairly Difficult - 10th-12th grade,40,273,0.14652,9
14,Heiress To Disney Empire Knows GOP Scammed Us...,Abigail Disney is an heiress with brass ovarie...,News,0,heiress to disne empire knows gop scammed us ...,abigail disne is an heiress with brass ovaries...,510,4.219608,1,0.105234,...,7.523491,9.980606,10.161005,7.595098,7.8244,Standard - 8th-9th grade,72,500,0.144,16
15,Tone Deaf Trump: Congrats Rep. Scalise On Los...,Donald Trump just signed the GOP tax scam into...,News,0,tone deaf trump congrats rep scalise on losin...,donald trump just signed the gop tax scam into...,375,4.402667,1,0.094041,...,9.673293,12.427642,11.855464,10.88935,9.146883,Standard - 8th-9th grade,50,369,0.135501,12


In [32]:
df.sample(5)[["text","real"]]


Unnamed: 0,text,real
53340,Democratic U.S. senators tried to force a vote...,1
15502,HOUSTON (Reuters) - State and local government...,1
29901,VIENNA (Reuters) - The head of the United Nati...,1
21421,(Reuters) - Billionaire investor Warren Buffet...,1
57231,Dr Can Erimtan 21st Century WireDid Donald J. ...,0
