In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Authenticate with the Hugging Face Hub using the token stored under the
# Colab secret named 'WRITE_TOKEN' (the token itself never appears in the notebook).
from huggingface_hub import login
from google.colab import userdata
login(userdata.get('WRITE_TOKEN'))

In [None]:
# Read each source file and tag every row with its class as a string label.
true_df = pd.read_csv('True.csv').assign(Label='True')

# NOTE: with lineterminator='\n' the last header field of Fake.csv keeps a
# trailing carriage return, which is why the preview below shows a separate
# 'date\r' column next to 'date'.
fake_df = pd.read_csv('Fake.csv', lineterminator='\n').assign(Label='False')

# Stack both frames under a fresh index, then shuffle the rows reproducibly.
df = pd.concat([true_df, fake_df], ignore_index=True).sample(frac=1, random_state=42)
df.head()

Unnamed: 0,title,text,subject,date,Label,date\r
22216,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,,False,"July 21, 2017\r"
27917,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,,False,"May 7, 2016\r"
25007,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,,False,"December 3, 2016\r"
1377,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",True,
32476,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,,False,"Apr 25, 2017\r"


In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 22216 to 15795
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     21417 non-null  object
 4   Label    44898 non-null  object
 5   date    23481 non-null  object
dtypes: object(6)
memory usage: 3.4+ MB


In [None]:
# Number of rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

np.int64(209)

In [None]:
# drop_duplicates() returns a new frame; unless `df` is re-bound the duplicate
# rows counted above stay in the data and can end up on both sides of the
# train/test split. Re-running this cell is harmless.
df = df.drop_duplicates()
df.shape

Unnamed: 0,title,text,subject,date,Label,date\r
22216,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,,False,"July 21, 2017\r"
27917,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,,False,"May 7, 2016\r"
25007,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,,False,"December 3, 2016\r"
1377,California AG pledges to defend birth control ...,SAN FRANCISCO (Reuters) - California Attorney ...,politicsNews,"October 6, 2017",True,
32476,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,,False,"Apr 25, 2017\r"
...,...,...,...,...,...,...
6265,New York protesters camp out at Goldman Sachs ...,NEW YORK (Reuters) - Dozens of protesters gath...,politicsNews,"January 18, 2017",True,
44732,Boiler Room #62 – Fatal Illusions,Tune in to the Alternate Current Radio Network...,Middle-east,,False,"June 29, 2016\r"
38158,ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...,I m convinced the Freedom From Religion group...,Government News,,False,"Feb 27, 2016\r"
860,Republican tax plan would deal financial hit t...,WASHINGTON (Reuters) - The Republican tax plan...,politicsNews,"November 2, 2017",True,


In [None]:
# Count titles that repeat an earlier title; the large count shown below means
# the dataset contains many repeated entries.
df['title'].duplicated().sum()

np.int64(6169)

In [None]:
# Fetch the NLTK resources needed by the text-cleaning step.
for resource in ('punkt', 'stopwords', 'wordnet', "punkt_tab"):
    nltk.download(resource)

# Module-level helpers read by clean_text: the lemmatizer instance and a set
# of English stopwords for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a piece of raw news text into a space-joined token string.

    Steps, in order: lowercase, strip anything starting with ``http`` up to
    the next whitespace, replace non-word characters with spaces, collapse
    whitespace, remove digit runs, tokenize with ``nltk.word_tokenize``,
    drop English stopwords and lemmatize each remaining token.

    Args:
        text: The raw string to clean. Non-string values (for example the
            NaN/None that pandas uses for an empty cell) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string.
    """
    # Missing cells are not strings and have no .lower(); map them to empty
    # text so a single blank row cannot abort a whole DataFrame.apply().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'http\S+', '', text)   # URLs
    text = re.sub(r'\W', ' ', text)       # punctuation / symbols -> space
    text = re.sub(r'\s+', ' ', text)      # collapse whitespace
    text = re.sub(r'\d+', '', text)       # digits

    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Normalise both free-text columns with clean_text: article body first, then headline.
for column in ('text', 'title'):
    df[column] = df[column].map(clean_text)
df.head()

Unnamed: 0,title,text,subject,date,Label,date\r
22216,breaking gop chairman grassley enough demand t...,donald trump white house chaos trying cover ru...,News,,False,"July 21, 2017\r"
27917,failed gop candidate remembered hilarious mock...,donald trump presumptive gop nominee time reme...,News,,False,"May 7, 2016\r"
25007,mike penny new dc neighbor hilariously trollin...,mike penny huge homophobe support ex gay conve...,News,,False,"December 3, 2016\r"
1377,california ag pledge defend birth control insu...,san francisco reuters california attorney gene...,politicsNews,"October 6, 2017",True,
32476,az rancher living u mexico border destroy nanc...,twisted reasoning come pelosi day especially p...,politics,,False,"Apr 25, 2017\r"


In [None]:
# Keep only the model inputs (title, text) and the target (Label).
# Re-binding `df` with errors='ignore' instead of dropping in place keeps the
# cell re-runnable: a second run no longer raises KeyError for columns that
# are already gone.
df = df.drop(columns=['subject', 'date', 'date\r'], errors='ignore')
df.head()

Unnamed: 0,title,text,Label
22216,breaking gop chairman grassley enough demand t...,donald trump white house chaos trying cover ru...,False
27917,failed gop candidate remembered hilarious mock...,donald trump presumptive gop nominee time reme...,False
25007,mike penny new dc neighbor hilariously trollin...,mike penny huge homophobe support ex gay conve...,False
1377,california ag pledge defend birth control insu...,san francisco reuters california attorney gene...,True
32476,az rancher living u mexico border destroy nanc...,twisted reasoning come pelosi day especially p...,False


In [None]:
# Encode the string labels as integers: 'True' -> 1, 'False' -> 0.
# Values that are not keys of the mapping (i.e. labels that are already 0/1)
# pass through unchanged, so re-running this cell does not turn every label
# into NaN the way a plain .map(dict) would. astype(int) fails loudly if an
# unexpected or missing label is present.
label_to_id = {'True': 1, 'False': 0}
df['Label'] = df['Label'].map(lambda value: label_to_id.get(value, value)).astype(int)
df.head()

Unnamed: 0,title,text,Label
22216,breaking gop chairman grassley enough demand t...,donald trump white house chaos trying cover ru...,0
27917,failed gop candidate remembered hilarious mock...,donald trump presumptive gop nominee time reme...,0
25007,mike penny new dc neighbor hilariously trollin...,mike penny huge homophobe support ex gay conve...,0
1377,california ag pledge defend birth control insu...,san francisco reuters california attorney gene...,1
32476,az rancher living u mexico border destroy nanc...,twisted reasoning come pelosi day especially p...,0


In [None]:
# Target vector and feature frame (every column except the target).
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the feature/target shapes of each split.
for split_name, features, targets in (("Training", X_train, y_train), ("Testing", X_test, y_test)):
    print(f"{split_name} set shape:", features.shape, targets.shape)

Training set shape: (35918, 2) (35918,)
Testing set shape: (8980, 2) (8980,)


In [None]:
# Re-attach the target column to each feature split (aligned on the shared
# row index) and write both splits to CSV without the index column.
train_df = X_train.join(y_train)
test_df = X_test.join(y_test)

for frame, path in ((train_df, 'final_data_train.csv'), (test_df, 'final_data_test.csv')):
    frame.to_csv(path, index=False)

In [None]:
# Count missing values per column in the processed frame.
df.isnull().sum()

Unnamed: 0,0
title,0
text,0
Label,0


## Load the dataset for LLM fine-tuning

In [None]:
# Log in to the Hugging Face Hub with the Colab secret named 'HF_TOKEN'.
# NOTE(review): an earlier cell already logs in with 'WRITE_TOKEN' — confirm
# whether both logins are needed.
from huggingface_hub import login
from google.colab import userdata


login(userdata.get('HF_TOKEN'))

In [None]:
from datasets import load_dataset

# Read the two CSV files written earlier into "train" and "test" splits.
csv_files = {"train": "final_data_train.csv", "test": "final_data_test.csv"}
dataset = load_dataset("csv", data_files=csv_files)

# Carve 10% of the training split off as a validation split (fixed seed), so
# the dataset ends up with train, validation and test splits.
train_valid = dataset["train"].train_test_split(test_size=0.1, seed=42)
dataset["train"], dataset["validation"] = train_valid["train"], train_valid["test"]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Process the input text that goes into the llm
def preprocess(example):
    """Build the single text field that is fed to the model.

    Args:
        example: A mapping with "title" and "text" entries. It is updated in
            place with an "input_text" entry.

    Returns:
        The same mapping, with "input_text" set to
        "Title: <title> Content: <text>".
    """
    def _as_text(value):
        # Empty CSV cells come back as None (or NaN, which is != itself);
        # str() would otherwise inject the literal word "None"/"nan" into
        # the prompt.
        if value is None or value != value:
            return ""
        return str(value)

    example["input_text"] = "Title: " + _as_text(example["title"]) + " Content: " + _as_text(example["text"])
    return example

# Add the "input_text" column to every split by running preprocess on each example.
dataset = dataset.map(preprocess)

Map:   0%|          | 0/32326 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

Map:   0%|          | 0/3592 [00:00<?, ? examples/s]

In [None]:
# Sanity check: show the first training example, including the new input_text field.
print(dataset['train'][0])

{'title': 'left loses third quarter economic estimate explodes', 'text': 'news forecast third quarter gdp set expand bad news democrat trying take president trump ignored mainstream medium focused statue protest great news trump economic team brilliant successful businessmen even greater news american maga reuters reported u economy track expand percent annualized pace third quarter inventory investment contributing percentage point growth atlanta federal reserve gdp forecast model showed thursday last friday government said first reading gross domestic product second quarter percent growth pace point atlanta fed final estimate pro growth president finally president trump unleashed beast pro growth less regulation within industry america allowed economy nearly double gdp within president first year amazing coal industry back along steel industry ripping away unnecessary regulation key growth trump know get america back work help industry left protest pretty much everything trump win pr

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later.
MODEL = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize(example):
    """Tokenize the "input_text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=512)
    return tokenizer(example["input_text"], **encode_kwargs)


# Tokenize every split, passing batches of examples to `tokenize`.
tokenized_ds = dataset.map(tokenize, batched=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/32326 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

Map:   0%|          | 0/3592 [00:00<?, ? examples/s]

In [None]:
# Rename the target column to "labels" and drop the raw text columns,
# leaving only the tokenized inputs and the target.
tokenized_ds = (
    tokenized_ds
    .rename_column("Label", "labels")
    .remove_columns(["title", "text", "input_text"])
)

# Return the remaining columns as torch tensors when indexing the dataset.
tokenized_ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Sanity check: show the first tokenized training example (tensors).
print(tokenized_ds["train"][0])

{'labels': tensor(0), 'input_ids': tensor([    0, 46525,    35,   314, 13585,   371,   297,   776,  3278, 41354,
        12803,    35,   340,  1914,   371,   297,   821, 43923,   278,  3003,
         1099,   340, 26232,   667,   185,   394, 20125,  8266,  7302,  4761,
         2061,  9577,  2790,   372,   340, 20125,   776,   165,  6967,  1800,
        20658,   190,  2388,   340, 38187,   260,  9931,   102,   769, 13188,
          431,  1717,   866,  1349,  3003,   135,  1013,  1538,  2877,   371,
          297,  7834,   915,  8216,  3164,   477,   434,    23,   462, 11485,
          752,  6114,   821, 43923,  1914,  1421,   969,  3553, 46806,    94,
         6664, 21746,   168,    26,    78,  2600,  4200,  1897,  1152,   200,
          297,   135,   434,  2877,   477,    23,   462, 11485,  9789,   507,
         3278,  1759,   434,   394,  1747,   394, 20125, 19128, 22067,  1759,
          434,   540,  5746,   624,   539, 38187,   102,  1220,   866,   823,
         1457,   821, 43923, 

In [None]:
# Confirm each split now has only labels, input_ids and attention_mask.
tokenized_ds.column_names

{'train': ['labels', 'input_ids', 'attention_mask'],
 'test': ['labels', 'input_ids', 'attention_mask'],
 'validation': ['labels', 'input_ids', 'attention_mask']}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Class-id <-> class-name mappings stored in the model config.
id2label = {0: "Fake", 1: "True"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained checkpoint with a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Phase

In [None]:
# Hyper-parameters for a single fine-tuning epoch; checkpoints go to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    num_train_epochs=1,
    weight_decay=0.01,
    # Disable third-party experiment trackers. With the default setting the
    # Trainer picks up the installed wandb integration and blocks on an
    # interactive API-key prompt, so the notebook cannot run unattended.
    report_to="none",
)

In [None]:
# Compute the accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is
            the argmax of ``predictions`` over the last axis.

    Returns:
        Dict with "accuracy", "f1", "precision" and "recall".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(references, predicted) for name, scorer in scorers.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Wire together the model, hyper-parameters, data splits and metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahmed15-habeb[0m ([33mahmed15-habeb-itsharks[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.703
20,0.4653
30,0.2819
40,0.1437
50,0.2769
60,0.2098
70,0.1747
80,0.0879
90,0.0752
100,0.0084


TrainOutput(global_step=2021, training_loss=0.026491347679195597, metrics={'train_runtime': 3240.8617, 'train_samples_per_second': 9.975, 'train_steps_per_second': 0.624, 'total_flos': 8505327975567360.0, 'train_loss': 0.026491347679195597, 'epoch': 1.0})

In [None]:
# Upload the fine-tuned model and its tokenizer to the Hugging Face Hub.
# (Local alternative: trainer.model.save_pretrained("RoBERTa-base_fake_news_classifier"))
repo_id = "AhmedYusri/RoBERTa-base_fake_news_classifier"
token = userdata.get('WRITE_TOKEN')  # write-scoped token read from the Colab secrets
model.push_to_hub(repo_id, token=token)  # model weights and config
tokenizer.push_to_hub(repo_id, token=token)  # tokenizer files; its CommitInfo is the cell output

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ssifier/model.safetensors:   0%|          |  550kB /  499MB            

CommitInfo(commit_url='https://huggingface.co/AhmedYusri/RoBERTa-base_fake_news_classifier/commit/4b0de81677cd2e88f1c78950601e601207cf831a', commit_message='Upload tokenizer', commit_description='', oid='4b0de81677cd2e88f1c78950601e601207cf831a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AhmedYusri/RoBERTa-base_fake_news_classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='AhmedYusri/RoBERTa-base_fake_news_classifier'), pr_revision=None, pr_num=None)

In [None]:
# Score the fine-tuned model on the test split; the returned dict holds the
# loss, the compute_metrics values (prefixed with "eval_") and runtime figures.
results = trainer.evaluate(tokenized_ds["test"])
print(results)

{'eval_loss': 0.009322077967226505, 'eval_accuracy': 0.9981069042316258, 'eval_f1': 0.9980271556226065, 'eval_precision': 0.9986065954482118, 'eval_recall': 0.9974483878450475, 'eval_runtime': 263.9292, 'eval_samples_per_second': 34.024, 'eval_steps_per_second': 2.129, 'epoch': 1.0}


In [None]:
# Display the evaluation metrics dict as the cell output.
results

## Create a simple UI using `Gradio`

In [None]:
import gradio as gr

In [None]:
def predict_fake_news(title, text):
    """Classify one news item as "Fake" or "True" with the fine-tuned model.

    Args:
        title: Headline text.
        text: Article body text.

    Returns:
        "Fake" when the predicted class id is 0, otherwise "True".
    """
    import torch  # local import: this cell is defined before torch is imported

    # Trainer.predict expects a dataset of examples, not a dict of batched
    # tensors, so run the underlying model directly for a single request.
    net = trainer.model
    net.eval()  # disable dropout so repeated calls give the same answer
    device = next(net.parameters()).device

    # Mirror the training pipeline: the model was fine-tuned on clean_text()
    # output wrapped in the "Title: ... Content: ..." template.
    prompt = f"Title: {clean_text(title)} Content: {clean_text(text)}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Predict
    with torch.no_grad():
        logits = net(**inputs).logits

    # Reduce the (1, num_labels) logits to a plain Python int before comparing.
    y_pred = int(logits.argmax(dim=-1).item())
    if y_pred == 0:
        return "Fake"
    else:
        return "True"

In [None]:
# Two-textbox form (title, content) wired to predict_fake_news; the predicted
# label is shown in a plain textbox.
interface = gr.Interface(
    fn=predict_fake_news,
    inputs=[
        gr.Textbox(label="News Title", placeholder="Enter News title...", lines=5),
        gr.Textbox(label="News content", placeholder="Enter News content...", lines=8),
    ],
    outputs=gr.Textbox(label="Predicted Results"),
    # Fixed the user-facing typo "detction".
    title="Fake News detection using RoBERTa-base LLM...",
)

In [None]:
# Start the Gradio app with debug mode enabled.
interface.launch(debug=True)

In [None]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Class id -> display name used for the keys of the returned probabilities.
id2label = {0: "Fake", 1: "True"}

def predict(title, content):
    """Return the model's class probabilities for one news item.

    Args:
        title: Headline text.
        content: Article body text.

    Returns:
        Dict mapping "Fake" and "True" to their softmax probabilities (floats).
    """
    # Use the same cleaning and the same "Title: ... Content: ..." template
    # (single space separator) that the training data went through, so
    # inference inputs match what the model was fine-tuned on.
    text = f"Title: {clean_text(title)} Content: {clean_text(content)}"

    # pick device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # disable dropout so the probabilities are deterministic

    # tokenize and send to device (single sequence, so no padding is needed)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits  # [batch_size, num_labels]
        probs = torch.softmax(logits, dim=-1)[0].cpu().tolist()

    return {id2label[idx]: float(prob) for idx, prob in enumerate(probs)}

# Build the Gradio UI: two text inputs, a label output and a button that
# calls predict with the current title and content.
with gr.Blocks() as demo:
    gr.Markdown("## 📰 Fake News Detector")
    title = gr.Textbox(label="News Title")
    content = gr.Textbox(label="News Content", lines=6)
    # gr.Label receives the dict returned by predict.
    output = gr.Label(label="Result")
    btn = gr.Button("Check News")
    btn.click(predict, inputs=[title, content], outputs=output)

In [None]:
# Start the Blocks app with debug mode enabled.
demo.launch(debug=True)