In [None]:
# Remove the listed ML packages without prompting (-y); the next cell installs
# pinned versions of torch, torchvision, transformers and accelerate.
!pip uninstall -y torch torchvision torchaudio transformers accelerate fastai timm


In [None]:
# Install the training stack. torch, torchvision, transformers and accelerate
# are pinned to exact versions; scikit-learn and pandas are left unpinned.
!pip install \
torch==2.3.1 \
torchvision==0.18.1 \
transformers==4.40.2 \
accelerate==0.29.3 \
scikit-learn pandas


In [None]:
# Remove sentence-transformers without prompting; it is not imported by any
# cell shown in this notebook.
!pip uninstall -y sentence-transformers


In [None]:
# Remove any existing peft installation without prompting; a pinned version is
# installed again two cells further down.
!pip uninstall -y peft


In [None]:
import importlib.util

# Look the package up without importing it; a missing package yields None.
peft_spec = importlib.util.find_spec("peft")
print("peft installed?", peft_spec is not None)


In [None]:
# Install peft pinned to 0.9.0. NOTE(review): no visible cell imports peft —
# confirm whether this dependency is still needed.
!pip install peft==0.9.0


In [None]:
import torch
from transformers import Trainer, TrainingArguments

# Sanity check after the reinstall: show which torch version is active.
print("Torch:", torch.__version__)
# Reaching this line means the Trainer/TrainingArguments import above succeeded.
print("Trainer import OK ✅")


In [None]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files
# Prompt for file uploads. The next cell expects liar_dataset.zip, Fake.csv.zip
# and True.csv.zip to be present in the working directory.
files.upload()


In [None]:
!unzip liar_dataset.zip
!unzip Fake.csv.zip
!unzip True.csv.zip
!ls


In [None]:
import os

# Report, one line per file, whether each expected dataset file is present.
for dataset_file in ("train.tsv", "Fake.csv", "True.csv"):
    print(f"{dataset_file} exists:", os.path.exists(dataset_file))


In [None]:
import pandas as pd

# Read the LIAR training split. The TSV has no header row, so columns are
# addressed by integer position.
liar_raw = pd.read_csv("train.tsv", sep="\t", header=None)

# LIAR verdicts that are collapsed into the "fake" class (0); every other
# value in the label column is treated as "real" (1).
fake_labels = ["false", "pants-fire", "barely-true"]

# column 2 = statement, column 1 = label
# Build a fresh frame rather than assigning into a column subset of the raw
# frame: that avoids pandas' SettingWithCopyWarning and leaves liar_raw
# untouched, so the cell can be re-run safely. isin() replaces the per-row
# lambda with a vectorised membership test that yields the same 0/1 values.
liar = pd.DataFrame({
    "text": liar_raw[2],
    "label": (~liar_raw[1].isin(fake_labels)).astype("int64"),
})

liar.head()


In [None]:
def _read_labeled(csv_path, label_value):
    """Read a CSV, tag every row with label_value, and keep text + label."""
    frame = pd.read_csv(csv_path)
    frame["label"] = label_value
    return frame[["text", "label"]]


# 0 marks rows from Fake.csv, 1 marks rows from True.csv.
fake = _read_labeled("Fake.csv", 0)
real = _read_labeled("True.csv", 1)

fake.head(), real.head()


In [None]:
# Stack the three sources, drop rows containing any missing value, shuffle
# with a fixed seed, and renumber the index from zero.
df = (
    pd.concat([liar, fake, real], ignore_index=True)
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Class balance of the combined data.
df["label"].value_counts()


In [None]:
import re

_URL_PATTERN = re.compile(r"http\S+")
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z ]")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw value into lower-case, letters-only text.

    The value is coerced to ``str`` and lower-cased; every run starting with
    ``http`` up to the next whitespace is removed; each character that is not
    an ASCII letter or a space is replaced by a space; finally whitespace runs
    are collapsed to one space and the ends are trimmed.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The normalised text (possibly empty).
    """
    lowered = str(text).lower()
    without_urls = _URL_PATTERN.sub("", lowered)
    letters_only = _NON_LETTER_PATTERN.sub(" ", without_urls)
    return _WHITESPACE_PATTERN.sub(" ", letters_only).strip()

# Run clean_text over every row of the text column, then preview five random rows.
df["text"] = df["text"].map(clean_text)
df.sample(5)


In [None]:
# Row count of the combined, cleaned frame.
print("Total samples:", len(df))
# Plain-text preview of the first rows.
print(df.head())


In [None]:
from sklearn.model_selection import train_test_split

# First split: 80% for training, 20% held out, stratified on the label.
features, targets = df["text"], df["label"]
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    features, targets, test_size=0.2, random_state=42, stratify=targets
)

# Second split: halve the held-out part into validation and test sets,
# again stratified on the label.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

print(len(train_texts), len(val_texts), len(test_texts))


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _encode(texts):
    """Tokenise texts with truncation, padding and a 256-token cap."""
    return tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=256
    )


# The same tokenisation settings are used for all three splits.
train_enc = _encode(train_texts)
val_enc = _encode(val_texts)
test_enc = _encode(test_texts)


In [None]:
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    ``encodings`` must expose ``.items()`` and each of its values must be
    indexable per example; ``labels`` must provide ``.tolist()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Keep the labels as a plain Python list for positional lookup.
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under "labels".
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a NewsDataset.
train_ds = NewsDataset(train_enc, train_labels)
val_ds   = NewsDataset(val_enc, val_labels)
test_ds  = NewsDataset(test_enc, test_labels)


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the bert-base-uncased checkpoint for sequence classification with two
# output labels (0 = fake, 1 = real, matching the label columns built earlier).
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)


In [None]:
from transformers import TrainingArguments

# Training configuration handed to the Trainer in the next cell.
training_args = TrainingArguments(
    output_dir="./model",
    # Evaluation and checkpointing use the same "epoch" schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    # "f1" is one of the keys returned by compute_metrics (defined in the
    # next cell); it is the metric used to pick the best checkpoint.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)


In [None]:
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    The predicted class of each row is the index of its largest logit
    along axis 1.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=1)
    accuracy = accuracy_score(labels, predicted)
    f1 = f1_score(labels, predicted)
    return {"accuracy": accuracy, "f1": f1}

# Assemble the Trainer: training uses train_ds, evaluation uses val_ds, and
# compute_metrics supplies the accuracy/F1 values.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

# Start fine-tuning.
trainer.train()


In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the held-out test split.
preds = trainer.predict(test_ds)
# Predicted class = index of the largest value along axis 1 of the predictions.
y_pred = preds.predictions.argmax(axis=1)

# Per-class precision/recall/F1 against the true test labels.
print(classification_report(test_labels, y_pred))


In [None]:
# Save the model and tokenizer into the same directory; a later cell reloads
# both from "fake-news-bert".
model.save_pretrained("fake-news-bert")
tokenizer.save_pretrained("fake-news-bert")


In [None]:
# Recursively archive the saved model directory into fake-news-bert.zip.
!zip -r fake-news-bert.zip fake-news-bert


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload tokenizer and model from the directory saved above; this rebinds the
# global `tokenizer` and `model` names used by the prediction helpers.
tokenizer = AutoTokenizer.from_pretrained("fake-news-bert")
model = AutoModelForSequenceClassification.from_pretrained("fake-news-bert")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch the model to evaluation mode before running predictions.
model.eval()


In [None]:
import torch

def predict_news(text):
    """Classify a single piece of text with the fine-tuned model.

    The text is first passed through ``clean_text``, the same normalisation
    that was applied to the training data before tokenisation, so inference
    inputs look like what the model saw during fine-tuning.

    Args:
        text: The headline or article to classify (a single string).

    Returns:
        tuple: ``(label, confidence)``. ``label`` is ``"REAL"`` when the
        most probable class index is 1 and ``"FAKE"`` otherwise;
        ``confidence`` is that class's softmax probability rounded to four
        decimal places.
    """
    # The training rows were cleaned (lower-cased, URLs and non-letters
    # stripped) before tokenisation; feeding raw text here would create a
    # train/inference mismatch.
    cleaned = clean_text(text)

    inputs = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    ).to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)

    idx = probs.argmax(dim=1).item()
    label = "REAL" if idx == 1 else "FAKE"
    confidence = round(probs[0][idx].item(), 4)

    return label, confidence


def predict_news_final(text, threshold=0.90):
    """Return the model's verdict only when it is confident enough.

    Args:
        text: The headline or article to classify.
        threshold: Minimum confidence required to report the model's label.
            Defaults to 0.90, the cut-off this notebook has always used.

    Returns:
        tuple: ``(label, confidence)`` from ``predict_news`` when
        ``confidence >= threshold``; otherwise ``("UNCERTAIN", confidence)``.
    """
    label, confidence = predict_news(text)

    # Every confidence below the threshold maps to "UNCERTAIN", so a single
    # comparison covers what used to be two identical fall-through branches.
    if confidence >= threshold:
        return label, confidence

    return "UNCERTAIN", confidence


def predict_news_safe(text, threshold=0.90):
    """Guarded wrapper around ``predict_news_final`` for degenerate input.

    Later cells call ``predict_news_safe`` (including with ``""`` and
    punctuation-only strings), so it is defined here next to the function it
    wraps. NOTE(review): the guard below is a minimal interpretation of
    "safe" — confirm it matches the intended behaviour.

    Args:
        text: The headline or article to classify.
        threshold: Passed through to ``predict_news_final``.

    Returns:
        tuple: ``("UNCERTAIN", 0.0)`` when ``text`` is not a string or
        contains no alphabetic character; otherwise the result of
        ``predict_news_final(text, threshold)``.
    """
    # Input without a single letter carries nothing for the model to judge,
    # so skip the model call instead of reporting a meaningless score.
    if not isinstance(text, str) or not any(ch.isalpha() for ch in text):
        return "UNCERTAIN", 0.0

    return predict_news_final(text, threshold)





In [None]:
# Hand-written statements used to spot-check the thresholded predictor.
cases = [
    "Drinking bleach cures all diseases.",
    "NASA confirms water on Mars.",
    "President announces new economic policy today.",
    "Aliens are living among us according to secret documents.",
    """
    According to the World Health Organization, vaccines are safe
    and effective in preventing infectious diseases.
    """
]

# Print the first 80 characters of each statement followed by its prediction.
separator = "-" * 50
for statement in cases:
    print(statement.strip()[:80])
    print("→", predict_news_final(statement))
    print(separator)


In [None]:
# Same statements as the previous cell, this time run through predict_news_safe,
# which must already be defined in the session.
test_cases = [
    "Drinking bleach cures all diseases.",
    "NASA confirms water on Mars.",
    "President announces new economic policy today.",
    "Aliens are living among us according to secret documents.",
    """
    According to the World Health Organization, vaccines are safe
    and effective in preventing infectious diseases.
    """
]

separator = "-" * 50
for case_no, statement in enumerate(test_cases, start=1):
    label, confidence = predict_news_safe(statement)
    print(f"Case {case_no}:")
    # Show at most the first 100 characters of the statement.
    print(statement.strip()[:100], "...")
    print("→", label, confidence)
    print(separator)


In [None]:
# Spot-check a very short input; relies on predict_news_safe being defined
# earlier in the session.
print(predict_news_safe("Breaking news"))


In [None]:
# Edge cases: an empty string and a string with no letters at all. Relies on
# predict_news_safe being defined earlier in the session.
print(predict_news_safe(""))
print(predict_news_safe("!!! ??? ###"))


In [None]:
# A multi-sentence, article-style passage to contrast with the one-line
# headlines used in the earlier cells.
long_text = """
The government today announced a comprehensive policy reform aimed at
improving economic stability, reducing inflation, and encouraging
foreign investment. Officials stated that the policy will be reviewed
annually and adjusted based on economic indicators.
"""

# Relies on predict_news_safe being defined earlier in the session.
print(predict_news_safe(long_text))


In [None]:
# Named scenarios mapped to the statement to classify. Note that this rebinds
# `cases`, which an earlier cell used for a plain list.
cases = {
    "Fake medical claim": "Drinking bleach cures all diseases.",
    "Short ambiguous headline": "NASA confirms water on Mars.",
    "Real scientific report": """
        NASA scientists confirmed the presence of water molecules
        on the surface of Mars using data from the Mars Reconnaissance Orbiter.
    """
}

# One summary line per scenario; predict_news_safe must already be defined.
for scenario, statement in cases.items():
    verdict, score = predict_news_safe(statement)
    print(f"{scenario}: {verdict} ({score})")
