In [None]:
# Import statements for the entire file
# If unable to upload any of these modules/libraries, simply run 'pip install {module_name}'

import os
import urllib.request
from os.path import isfile

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from spacy.lang.en import English
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

First, we will read in the datasets we have
- WELFake: This is a single file containing both true and false articles, distinguished by its `label` column
- Kaggle Fake News: Two separate files -- one containing true articles, the other containing false articles

In [None]:
# Download the WELFake dataset (skipped when the CSV is already present).
# The file is first written under a temporary name and only renamed once the
# transfer has finished, so an interrupted download cannot leave a truncated
# "WELFake_Dataset.csv" that later runs would mistake for a complete file.
if not isfile("WELFake_Dataset.csv"):
    url = "https://zenodo.org/records/4561253/files/WELFake_Dataset.csv"
    partial_path = "WELFake_Dataset.csv.part"
    print("Downloading WELFake_dataset.csv...")
    urllib.request.urlretrieve(url, filename=partial_path)
    os.replace(partial_path, "WELFake_Dataset.csv")
    print("Download complete\n")

# Download the FN detection datasets (skipped when the extracted folder exists).
# os.system returns the command's exit status; a non-zero value means the
# command failed, so stop here instead of reporting success.
if not os.path.isdir('News_dataset'):
    status = os.system('kaggle datasets download -d emineyetm/fake-news-detection-datasets')
    if status != 0:
        raise RuntimeError(
            "kaggle download failed (exit status {}); check that the kaggle CLI "
            "is installed and its API credentials are configured".format(status)
        )

    print("Decompressing the file ...")
    status = os.system('unzip fake-news-detection-datasets.zip')
    if status != 0:
        raise RuntimeError(
            "unzip of fake-news-detection-datasets.zip failed (exit status {})".format(status)
        )
    print("Download complete")

In [None]:
# Load Kaggle Fake News Data (Bozkus)
# Paths are relative to the working directory, matching the 'News_dataset'
# folder that the download cell checks for and the relative WELFake path
# below. (The previous absolute '/content/...' paths only resolved when the
# working directory happened to be /content.)

# Bozkus True articles
df_fn_true = pd.read_csv(os.path.join("News_dataset", "True.csv"))
print("Read {} sentences".format(df_fn_true.shape))

# Bozkus False Articles
df_fn_false = pd.read_csv(os.path.join("News_dataset", "Fake.csv"))
print("Read {} sentences".format(df_fn_false.shape))

# Load WELFake Kaggle Fake News
df_WELFAKE = pd.read_csv("WELFake_Dataset.csv")
print("Read {} sentences".format(df_WELFAKE.shape))

In [None]:
# Blank English spaCy pipeline; only its rule-based tokenizer is used below
nlp = English()


def tokenizer(s):
    """Split the string `s` with spaCy's English tokenizer and return the token texts as a list."""
    return [tok.text for tok in nlp.tokenizer(s)]


# Bag-of-words vectorizer that lowercases the text and delegates tokenization to `tokenizer`
vectorizer = CountVectorizer(tokenizer=tokenizer, lowercase=True)

# Report vocabulary size and document-term matrix shape for one dataset
def process_text_data(df, dataset_name):
    """Fit the module-level `vectorizer` on the non-null 'text' entries of `df`
    and print the resulting vocabulary size and matrix shape.

    Args:
        df: DataFrame with a 'text' column.
        dataset_name: Label used in the printed messages.

    Note:
        The shared `vectorizer` is re-fitted on every call, so after the call
        it holds the vocabulary of this dataset only. Nothing is returned.
    """
    # Keep only entries that are actually present (drops NaN/None)
    present_texts = [entry for entry in df['text'] if not pd.isna(entry)]

    # Learn the vocabulary and build the documents x features count matrix
    doc_term_matrix = vectorizer.fit_transform(present_texts)

    vocab_size = len(vectorizer.get_feature_names_out())
    print(f'Vocabulary Size for {dataset_name}: {vocab_size}')
    print(f'Shape of Transformed Array for {dataset_name}: {doc_term_matrix.shape}\n')

In [None]:
# Print vocabulary size and matrix shape for every dataset, in the same order as before
for _frame, _name in (
    (df_WELFAKE, "WELFake Dataset"),
    (df_fn_false, "False News Dataset"),
    (df_fn_true, "True News Dataset"),
):
    process_text_data(_frame, _name)

In [None]:
# Extract the text columns
# .copy() gives independent frames, so adding a 'label' column below neither
# touches the source DataFrames nor triggers SettingWithCopyWarning
true_matrix = df_fn_true[['text']].copy()
false_matrix = df_fn_false[['text']].copy()
welfake_matrix = df_WELFAKE[['text', 'label']]

# Add labels to each respective matrix (1 = true article, 0 = false article)
# NOTE(review): WELFake rows keep the 'label' values shipped in the CSV --
# confirm that its 0/1 encoding follows the same convention.
true_matrix.loc[:, 'label'] = 1
false_matrix.loc[:, 'label'] = 0

# Combine matrices and drop rows that have no text
combined_matrix = pd.concat([true_matrix, false_matrix, welfake_matrix])
combined_matrix = combined_matrix.dropna(subset=['text'])

# Shuffle the rows into random order
# frac=1 shuffles 100% of the DataFrame; drop=True keeps the old indices from
# being added back as a column. The shuffle is seeded so that Restart & Run All
# reproduces the same row order, and therefore the same split and fold contents.
combined_matrix = combined_matrix.sample(frac=1, random_state=6501).reset_index(drop=True)

# Split the dataset into a train/validation portion (80%) and a held-out test set (20%)
train_val_data, test_data = train_test_split(combined_matrix, test_size=0.2, random_state=6501)

# 5-fold cross-validation splitter for the train/validation portion
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Stratified ensures class balance in each fold

X = train_val_data['text']   # Train/validation features (raw article text)
y = train_val_data['label']  # Train/validation labels

print(X.shape)
print(X.head())

3.1.1: Logistic Regression Classifier Based on TF-IDF Vectors

In [None]:
# Part 1: Simple Classifier

# Initialize TF-IDF vectorizer (it tokenizes the raw text itself)
tfidf_vectorizer = TfidfVectorizer(max_features=30000)

# Fit on the train/validation text and transform it.
# NOTE: the vocabulary and IDF weights are learned from all of X, including the
# rows each fold later holds out, so the per-fold validation scores below are
# slightly optimistic. The test set is never seen by the vectorizer.
X_tfidf = tfidf_vectorizer.fit_transform(X)

# 5-fold stratified cross-validation splitter
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the logistic regression model
# max_iter is raised from the default of 100 so the solver is not cut off
# before it converges
logreg = LogisticRegression(max_iter=1000)

# Cross-validation with TF-IDF features
for fold, (train_index, val_index) in enumerate(kf.split(X_tfidf, y)):
    print(f"Fold {fold + 1}:")

    # Split the TF-IDF features
    X_train_fold, X_val_fold = X_tfidf[train_index], X_tfidf[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Train the model on the training fold (each fit starts from scratch)
    logreg.fit(X_train_fold, y_train_fold)

    # Training accuracy
    y_pred_train = logreg.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_pred_train)
    print(f"Train accuracy for fold {fold + 1}: {train_accuracy:.4f}")

    # Validation accuracy
    y_pred_val = logreg.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_pred_val)
    print(f"Val accuracy for fold {fold + 1}: {val_accuracy:.4f}")

# After the loop `logreg` holds the model from the last fold only, which was
# trained on 4/5 of the train/validation rows. Refit on all of them so the
# test metrics describe a model trained on the full training data.
logreg.fit(X_tfidf, y)

# Evaluate performance on test set
X_test = test_data['text']
y_test = test_data['label']

# Transform the test set using the same TF-IDF vectorizer
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Make predictions on the test set
y_test_pred = logreg.predict(X_test_tfidf)

# Statistics:

# Test set accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Test set precision
test_precision = precision_score(y_test, y_test_pred, average='weighted')
print(f"Test Set Precision: {test_precision:.4f}")

# Test set recall
test_recall = recall_score(y_test, y_test_pred, average='weighted')
print(f"Test Set Recall: {test_recall:.4f}")

# Test set F1 score
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {test_f1:.4f}")

3.1.2 Logistic Regression Classifier Based on Skip-Gram Embeddings

In [None]:
# Skipgram Model

# Whitespace-split every training document into a list of word tokens
X_tokenized = X.apply(lambda text: text.split())

# Train the Word2Vec embeddings on the tokenized documents
# sg=1 selects the skip-gram architecture
word2vec_model = Word2Vec(
    sentences=X_tokenized,
    vector_size=70,   # dimensionality of each word vector
    window=5,
    min_count=3,
    sg=1,
    workers=4,
    seed=6501,
)

# Convert documents to vectors by averaging word embeddings
def document_vector(doc, model=None):
    """Represent a tokenized document as the mean of its word embeddings.

    Args:
        doc: Iterable of word tokens.
        model: Object exposing `wv` (supports `word in wv` and `wv[word]`) and
            `vector_size`. Defaults to the notebook-level `word2vec_model`.

    Returns:
        A 1-D array of length `model.vector_size`: the element-wise mean of the
        vectors of all tokens found in the vocabulary, or a zero vector when
        none of the tokens is in the vocabulary (including an empty document).

    Example:
        [0.5, 0.2, 0.1],  # "cat"
        [0.3, 0.4, 0.2],  # "sits"
        [0.2, 0.1, 0.1],  # "on"
        [0.4, 0.3, 0.2]   # "mat"

        -->

        [0.35, 0.25, 0.15] document represented as a vector
    """
    if model is None:
        # Looked up at call time so the function follows the notebook's current model
        model = word2vec_model

    # Collect the embeddings of the tokens that are in the vocabulary
    word_vectors = [model.wv[word] for word in doc if word in model.wv]

    if word_vectors:
        # Average the word vectors to get a single fixed-length vector for the document
        return np.mean(word_vectors, axis=0)

    # No token was found in the vocabulary --> zero vector of the embedding dimension
    return np.zeros(model.vector_size)

# Apply the function to create document vectors for each text in the dataset

# Convert every tokenized document to its averaged embedding and stack the
# results into a 2D NumPy array, where each row corresponds to a document vector
X_vectors = np.array([document_vector(doc) for doc in X_tokenized])

# Step 5: Perform cross-validation and logistic regression
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the logistic regression model
# max_iter is raised from the default of 100 so the solver is not cut off
# before it converges
logreg = LogisticRegression(max_iter=1000)

# Cross-validation with word embeddings
for fold, (train_index, val_index) in enumerate(kf.split(X_vectors, y)):
    print(f"Fold {fold + 1}:")

    # Split the vectorized text data
    X_train_fold, X_val_fold = X_vectors[train_index], X_vectors[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    # Train the logistic regression model (each fit starts from scratch)
    logreg.fit(X_train_fold, y_train_fold)

    # Training accuracy
    y_pred_train = logreg.predict(X_train_fold)
    train_accuracy = accuracy_score(y_train_fold, y_pred_train)
    print(f"Train accuracy for fold {fold + 1}: {train_accuracy:.4f}")

    # Validation accuracy
    y_pred_val = logreg.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_pred_val)
    print(f"Val accuracy for fold {fold + 1}: {val_accuracy:.4f}")

# After the loop `logreg` holds the model from the last fold only, which was
# trained on 4/5 of the train/validation rows. Refit on all of them so the
# test metrics describe a model trained on the full training data.
logreg.fit(X_vectors, y)

# Step 6: Evaluate the model on the test set

# Tokenize the test set the same way as the training text
X_test_tokenized = X_test.apply(lambda x: x.split())

# Same as above: one averaged embedding per test document, stacked into a 2D array
X_test_vectors = np.array([document_vector(doc) for doc in X_test_tokenized])

# Make predictions on the test set
y_test_pred = logreg.predict(X_test_vectors)

# Statistics

# Test set accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Test set precision
test_precision = precision_score(y_test, y_test_pred, average='weighted')
print(f"Test Set Precision: {test_precision:.4f}")

# Test set recall
test_recall = recall_score(y_test, y_test_pred, average='weighted')
print(f"Test Set Recall: {test_recall:.4f}")

# Test set F1 score
test_f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {test_f1:.4f}")

Part 2: The next main component of the project is Fake News Generation, which:

- Explores the use of large language models (LLMs) to generate fake news articles
- Examines how these models can create realistic-looking fake news that might evade detection systems
- Uses the generated content to test the robustness of the detection models

In [None]:
# Load the pre-trained GPT-2 tokenizer and language model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Padding needs a pad token; when the tokenizer has none, reuse the
# end-of-sequence token and record its id in the model config
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token
  model.config.pad_token_id = tokenizer.pad_token_id

# Prepare fake news data for fine-tuning
def prepare_data_for_training(df, sample_frac=0.3, max_length=512):
  """Sample articles from `df` and tokenize their text for language-model fine-tuning.

  Args:
    df: DataFrame with a 'text' column (only this column is used).
    sample_frac: Fraction of rows to sample; the sample is seeded with
      random_state=42 so repeated calls pick the same rows.
    max_length: Maximum number of tokens per example; longer texts are truncated.

  Returns:
    A `datasets.Dataset` built from the tokenizer output for the sampled texts.

  Raises:
    ValueError: If the sample contains no non-empty text.
  """
  df_sample = df.sample(frac=sample_frac, random_state=42)

  # Filter AFTER sampling so the set of sampled rows is unchanged; only rows
  # the tokenizer cannot use (missing values, whitespace-only text) are dropped.
  texts = [text for text in df_sample['text'] if isinstance(text, str) and text.strip()]
  if not texts:
    raise ValueError("No non-empty 'text' entries available for training")

  # Tokenize all texts, truncating to max_length and padding to a common length
  encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
  return Dataset.from_dict(encodings)

# Create dataset from fake news examples
train_dataset = prepare_data_for_training(df_fn_false, 0.3)

# Set up data collator
data_collator = DataCollatorForLanguageModeling(
  tokenizer=tokenizer,
  mlm=False  # We're not using masked language modeling
)

# Set up training arguments
training_args = TrainingArguments(
  output_dir="./gpt2-fake-news",
  overwrite_output_dir=True,
  num_train_epochs=1,
  per_device_train_batch_size=6,
  save_steps=5000,
  save_total_limit=2,
  logging_steps=500,
  report_to=[],
  # Mixed-precision training needs a CUDA device; enabling it unconditionally
  # makes TrainingArguments fail on CPU-only machines, so turn it on only
  # when a GPU is actually available.
  fp16=torch.cuda.is_available()
)

# Initialize trainer
trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=data_collator,
  train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

In [None]:
# Function to generate fake news with the fine-tuned model
def generate_fake_news(prompt, max_length=200):
  """Generate one article continuation for `prompt` with the notebook-level GPT-2 model.

  Args:
    prompt: Text the generated article starts with.
    max_length: Maximum total length in tokens (prompt included).

  Returns:
    The decoded text (prompt plus continuation) with special tokens removed.
    Sampling is enabled, so repeated calls generally return different text.
  """
  # Make sure dropout is disabled while generating (the model may still be in
  # training mode after fine-tuning)
  model.eval()

  # Tokenize and move the tensors to the model's device: after training the
  # model may live on the GPU, while freshly encoded inputs are created on CPU
  encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

  outputs = model.generate(
      encoded['input_ids'],
      attention_mask=encoded['attention_mask'],  # explicit mask: pad and EOS share one token id
      max_length=max_length,
      num_return_sequences=1,
      temperature=0.9,  # Control randomness (higher = more random)
      top_k=50,        # Limit vocabulary choices
      top_p=0.95,      # Nucleus sampling
      no_repeat_ngram_size=2,  # Prevent repetition of phrases
      do_sample=True,
      pad_token_id=tokenizer.pad_token_id
  )
  return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Prompts that seed the generated fake news articles (one article per prompt)
prompts = [
    "Breaking News: Scientists discover",
    "Latest Update: Government announces",
    "Exclusive: Celebrity reveals",
    "Sports: Team wins championship",
    "Technology: New gadget released",
    "Health Alert: New virus outbreak",
    "Economy: Stock market hits record high",
    "Politics: Leader resigns amid scandal",
    "Environment: Major climate change initiative",
    "Education: Schools adopt new curriculum",
    "Travel: New airline routes announced",
    "Science: Researchers develop new vaccine",
    "Crime: Suspect arrested in major case",
    "Entertainment: Film wins top award",
    "Weather: Severe storm warning issued",
    "Business: Company announces major merger",
    "Culture: Festival celebrates diversity",
    "Space: New planet discovered by astronomers",
    "Technology: Breakthrough in artificial intelligence",
    "Health: Study reveals benefits of new diet"
]

# Generate one article per prompt, keeping the prompt next to its output
generated_articles = [
    {'prompt': seed_text, 'article': generate_fake_news(seed_text)}
    for seed_text in prompts
]

# Collect the records in a DataFrame and persist them to CSV for later testing
df_fake_news = pd.DataFrame(generated_articles)
df_fake_news.to_csv('generated_fake_news.csv', index=False)

In [None]:
# Sample 20 articles from the true dataset then combine datasets
# NOTE: df_fn_true also feeds the training data built earlier, so most of
# these "true" articles were seen by the classifiers during training; only the
# generated articles are guaranteed to be new to them.
df_sampled_true = df_fn_true.sample(n=20, random_state=42)['text']
df_fake_news = pd.read_csv('generated_fake_news.csv')['article']
df_combined = pd.concat([df_sampled_true, df_fake_news], ignore_index=True)

# Extract the articles for testing (1 = true article, 0 = generated fake article)
X_combined = df_combined
y_combined = [1] * len(df_sampled_true) + [0] * len(df_fake_news)

# TF-IDF Vectorizer Testing
# By this point `logreg` has been rebound to the skip-gram classifier, which
# expects embedding vectors and rejects TF-IDF features with a feature-count
# error. Fit a dedicated TF-IDF classifier on the train/validation features
# under its own name instead.
logreg_tfidf = LogisticRegression(max_iter=1000)
logreg_tfidf.fit(X_tfidf, y)

# Transform the combined articles using the same TF-IDF vectorizer
X_combined_tfidf = tfidf_vectorizer.transform(X_combined)

# Make predictions on the combined articles using the TF-IDF based logistic regression model
y_combined_pred_tfidf = logreg_tfidf.predict(X_combined_tfidf)

# Calculate and print evaluation metrics for TF-IDF model
print("TF-IDF Logistic Regression Evaluation for Combined Articles:")
print(f"Accuracy: {accuracy_score(y_combined, y_combined_pred_tfidf):.4f}")
print(f"Precision: {precision_score(y_combined, y_combined_pred_tfidf, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_combined, y_combined_pred_tfidf, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_combined, y_combined_pred_tfidf, average='weighted'):.4f}\n")

In [None]:
# Skip-Gram Embeddings Testing (if applicable)
# Whitespace-split each combined article into word tokens
X_combined_tokenized = X_combined.apply(lambda article: article.split())

# One averaged embedding per article, stacked into a 2D array
X_combined_vectors = np.array(list(map(document_vector, X_combined_tokenized)))

# Predict with `logreg`, which is the skip-gram classifier when the cells are run in order
y_combined_pred_skipgram = logreg.predict(X_combined_vectors)

# Report the evaluation metrics for the Skip-Gram model
print("Skip-Gram Logistic Regression Evaluation for Combined Articles:")

skipgram_accuracy = accuracy_score(y_combined, y_combined_pred_skipgram)
print(f"Accuracy: {skipgram_accuracy:.4f}")

skipgram_precision = precision_score(y_combined, y_combined_pred_skipgram, average='weighted')
print(f"Precision: {skipgram_precision:.4f}")

skipgram_recall = recall_score(y_combined, y_combined_pred_skipgram, average='weighted')
print(f"Recall: {skipgram_recall:.4f}")

skipgram_f1 = f1_score(y_combined, y_combined_pred_skipgram, average='weighted')
print(f"F1 Score: {skipgram_f1:.4f}")