<a href="https://colab.research.google.com/github/Atia6/Fake_News_Detection/blob/main/Fake_News_Detection_Fine_tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies into the running environment.
# transformers, peft and accelerate are installed from their GitHub repositories;
# -U upgrades any copy that is already present.
# NOTE(review): none of these installs is version-pinned, so a later run can
# resolve to different library versions than the run whose outputs are saved below.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets


# einops plus langchain and its companion packages.
# The last line omits -q, so its install log is printed in full.
!pip install -q  einops langchain
!pip install langchain-community langchain-core

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset




In [None]:
import pandas as pd
import string
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used below lives under it.
drive.mount('/content/drive')


In [None]:
# Location of the dataset on the mounted Google Drive.
csv_path = '/content/drive/MyDrive/data/Fake News Detection Fine-tuning/data.csv'

# Parse the CSV at csv_path into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Display the column names of the loaded frame.
df.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [None]:
# Build one input string per article from its headline and body.
#
# A missing Headline or Body is NaN in pandas. NaN + str stays NaN, and calling
# str() on it would turn the whole article into the literal text "nan",
# discarding whichever part was actually present. Replace missing parts with ""
# before concatenating so the remaining part survives.
headline_text = df['Headline'].fillna('').astype(str)
body_text = df['Body'].fillna('').astype(str)

# The concatenation always yields plain strings, so no list handling is needed.
# strip() removes the dangling separator left when one of the two parts is empty.
df['text'] = (headline_text + " " + body_text).str.strip()


In [None]:

import pandas as pd

# Keep only the model input ('text') and its target; the 'Label' column is
# exposed under the lowercase name 'label', which the later cells select.
df_combined = pd.DataFrame({
    'text': df['text'],
    'label': df['Label']
})

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42)

# Later cells add new columns to both frames. Take explicit copies so those
# assignments write to independent frames rather than to row selections of
# df_combined, which can raise pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()





In [None]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,text,label
2473,Rugby League World Cup: Kear names uncapped tr...,1
1338,"Execution Still Haunts Village, 50 Years After...",1
1613,Police take shot at actor on movie set Chat wi...,1
1610,Catalan leader under pressure to drop independ...,1
2600,Dunderhead of the Week Dunderhead of the Week\...,0


In [None]:
# Print the Python type of the object returned by train_df['text'].
print(type(train_df['text']))

<class 'pandas.core.series.Series'>


In [None]:
# Tokenization
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples, max_length=128):
    """Tokenize text with the notebook-level ``tokenizer``.

    Args:
        examples: Text handed straight to ``tokenizer``. When this function is
            used through ``Series.apply`` (as below) it receives one cell of
            the 'text' column per call.
        max_length: Length every encoding is truncated / padded to. Defaults
            to 128, the value that used to be hard-coded here.

    Returns:
        The object returned by ``tokenizer(...)``; later cells read its
        'input_ids', 'attention_mask' and 'token_type_ids' entries.
    """
    return tokenizer(examples, truncation=True, padding="max_length", max_length=max_length)


# Store the tokenizer output for each row next to its text, for both splits.
train_df['tokenized'] = train_df["text"].apply(tokenize_function)
test_df['tokenized'] = test_df["text"].apply(tokenize_function)





In [None]:
# Unpack each row's tokenizer output into one column per model input.
train_encodings = train_df['tokenized']
for field in ('input_ids', 'attention_mask'):
    train_df[field] = train_encodings.apply(lambda enc, key=field: enc[key])

# token_type_ids is looked up with .get so an encoding without it yields None.
train_df['token_type_ids'] = train_encodings.apply(lambda enc: enc.get('token_type_ids', None))

# Build the Hugging Face Dataset from the model inputs plus the target column.
train_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
train_dataset = Dataset.from_pandas(train_df[train_columns])


In [None]:
# Unpack each row's tokenizer output into one column per model input.
test_encodings = test_df['tokenized']
for field in ('input_ids', 'attention_mask'):
    test_df[field] = test_encodings.apply(lambda enc, key=field: enc[key])

# token_type_ids is looked up with .get so an encoding without it yields None.
test_df['token_type_ids'] = test_encodings.apply(lambda enc: enc.get('token_type_ids', None))

# Build the Hugging Face Dataset from the model inputs plus the target column.
test_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
test_dataset = Dataset.from_pandas(test_df[test_columns])


In [None]:
# Preview the first rows of train_df at this point in the notebook.
train_df.head()

Unnamed: 0,text,label,tokenized
2473,Rugby League World Cup: Kear names uncapped tr...,1,"[input_ids, token_type_ids, attention_mask]"
1338,"Execution Still Haunts Village, 50 Years After...",1,"[input_ids, token_type_ids, attention_mask]"
1613,Police take shot at actor on movie set Chat wi...,1,"[input_ids, token_type_ids, attention_mask]"
1610,Catalan leader under pressure to drop independ...,1,"[input_ids, token_type_ids, attention_mask]"
2600,Dunderhead of the Week Dunderhead of the Week\...,0,"[input_ids, token_type_ids, attention_mask]"


In [None]:
# Load the base checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# LoRA settings, collected in one place before building the config object.
lora_settings = dict(
    r=8,
    lora_alpha=32,
    # Modules to adapt: the query and value projections of self-attention.
    # Check the loaded model's layer names if a different checkpoint is used.
    target_modules=["attention.self.query", "attention.self.value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,  # sequence classification task
)
peft_config = LoraConfig(**lora_settings)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Apply PEFT to the model
# The wrapped model is bound back to the same name `model`.
# NOTE(review): because input and output share a name, re-running this cell
# passes the already-wrapped model to get_peft_model again; re-run the cell
# that loads the base model first.
model = get_peft_model(model, peft_config)

In [None]:
# Define TrainingArguments and Trainer
# Evaluation and checkpointing both happen once per epoch; with
# load_best_model_at_end=True the two strategies have to match.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True
)

# NOTE(review): eval_dataset is the held-out test split, so the "best"
# checkpoint is selected on the same data that is later reported as the test
# score. Consider carving a separate validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
)



  trainer = Trainer(


In [None]:
# Fine-Tune the Model
# Runs training with the configuration held by `trainer`; the call's return
# value is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.337974
2,No log,0.191068
3,0.362000,0.141899
4,0.362000,0.143098
5,0.119200,0.114458
6,0.119200,0.120757
7,0.119200,0.108029
8,0.077900,0.107968
9,0.077900,0.102834
10,0.059500,0.104824


TrainOutput(global_step=2010, training_loss=0.15421532282781839, metrics={'train_runtime': 575.5568, 'train_samples_per_second': 55.72, 'train_steps_per_second': 3.492, 'total_flos': 2116794378700800.0, 'train_loss': 0.15421532282781839, 'epoch': 10.0})

In [None]:
# Evaluate the model
# evaluate() is called without arguments, so no dataset is chosen in this cell;
# the returned metrics are kept in `results` and printed.
results = trainer.evaluate()

print(results)

{'eval_loss': 0.10283368080854416, 'eval_runtime': 6.063, 'eval_samples_per_second': 132.279, 'eval_steps_per_second': 8.412, 'epoch': 10.0}


In [None]:
# Save the model
# The model and the tokenizer files are written to the same
# ./finetuned_model directory.
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Step 6: Evaluate Performance
# Run inference on the test dataset and keep the raw per-class scores.
prediction_output = trainer.predict(test_dataset)
raw_pred = prediction_output.predictions

# The predicted class is the index of the highest score in each row.
predictions = raw_pred.argmax(axis=1)
labels = test_dataset["label"]

# Compute both summaries first, then print them under their headings.
report = classification_report(labels, predictions)
accuracy = accuracy_score(labels, predictions)

print("\nClassification Report:")
print(report)

print("\nAccuracy:")
print(accuracy)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       455
           1       0.97      0.98      0.97       347

    accuracy                           0.98       802
   macro avg       0.98      0.98      0.98       802
weighted avg       0.98      0.98      0.98       802


Accuracy:
0.9775561097256857
