In [1]:
# Load the raw datasets, remove irrelevant columns, and merge them with a
# balanced hybrid strategy: 50% Fake / 50% Real overall, with the Real half
# drawn in equal parts from WELFake and BBC. Article text is then truncated
# to mitigate length bias before the result is saved.

import pandas as pd
import numpy as np

# Per-article character cap. Fake articles tend to be longer than BBC ones, so
# capping the length pushes the model to focus on content rather than size.
CHAR_LIMIT = 600
OUTPUT_FILE = "../data/news_dataset.csv"

# LOAD DATASETS
print("1. Loading raw datasets...")

# Only the two reads can raise FileNotFoundError, so only they sit in the try.
try:
    # Load WELFake dataset
    df_large = pd.read_csv("../data/WELFake_Dataset.csv")

    # Load BBC News dataset
    df_bbc = pd.read_csv("../data/bbc_news.csv")
except FileNotFoundError as e:
    print(f"CRITICAL ERROR: File not found. {e}")
    raise

# Drop metadata columns not required for training. errors='ignore' silently
# skips columns that are absent, so report only the ones actually removed.
cols_to_drop = ['pubDate', 'guid', 'link']
dropped_cols = [col for col in cols_to_drop if col in df_bbc.columns]
df_bbc = df_bbc.drop(columns=cols_to_drop, errors='ignore')
print(f"   - Dropped {dropped_cols} from BBC.")

# Standardize column names: rename description to text
df_bbc = df_bbc.rename(columns={'description': 'text'})

# STANDARDIZE LABELS
print("2. Standardizing Labels...")

# Columns every source must expose once it has been standardized.
keep_cols = ['title', 'text', 'label']

# WELFake ships with 1=Fake, 0=Real; flip it to the target schema
# (0 = Fake, 1 = Real), then keep the essential columns and drop null rows.
df_large = (
    df_large
    .assign(label=1 - df_large['label'])
    [keep_cols]
    .dropna()
)

# Every BBC article is labelled 1 (Real); same column filter and null removal.
df_bbc = (
    df_bbc
    .assign(label=1)
    [keep_cols]
    .dropna()
)

# HYBRID SAMPLING STRATEGY
# Step A: Isolate all Fake news (Label 0) to serve as the anchor
df_fakes = df_large[df_large['label'] == 0]
n_fakes = len(df_fakes)
print(f"--> Total Fake News: {n_fakes}")

# Step B: Determine the target count for Real news to ensure a balanced dataset.
# The planned quota is half from WELFake and the remainder from BBC.
target_total_real = n_fakes
half_real = target_total_real // 2
needed_bbc = target_total_real - half_real

print(f"--> Target Real News: {target_total_real}")
print(f"    - From WELFake: {half_real}")
print(f"    - From BBC: {needed_bbc}")

# Step C: Cap each quota by what the source can actually supply, letting the
# other source cover any shortfall so the Real class still reaches the target
# whenever enough articles exist. When both sources are large enough, the
# quotas stay at half_real / needed_bbc.
df_real_pool = df_large[df_large['label'] == 1]
n_bbc = min(needed_bbc, len(df_bbc))
n_welfake = min(target_total_real - n_bbc, len(df_real_pool))
# If WELFake itself fell short, let BBC absorb the remainder when it can.
n_bbc = min(target_total_real - n_welfake, len(df_bbc))

if len(df_bbc) < needed_bbc:
    print(f"WARNING: BBC only has {len(df_bbc)} articles. Taking all.")
if len(df_real_pool) < half_real:
    print(f"WARNING: WELFake only has {len(df_real_pool)} Real articles. Taking all.")
if (n_welfake, n_bbc) != (half_real, needed_bbc):
    print(f"    - Adjusted quotas -> WELFake: {n_welfake}, BBC: {n_bbc}")
if n_welfake + n_bbc < target_total_real:
    print(f"WARNING: Only {n_welfake + n_bbc} Real articles available; "
          "classes will be imbalanced.")

# Step D: Draw the samples. n never exceeds the population size here, so
# DataFrame.sample cannot raise ValueError for an oversized request.
df_real_welfake = df_real_pool.sample(n=n_welfake, random_state=42)
df_real_bbc = df_bbc.sample(n=n_bbc, random_state=42)

# MERGE AND SHUFFLE
# One pipeline: stack the Fake, WELFake Real and BBC Real frames, shuffle the
# rows with a fixed seed, renumber the index from zero, and rename the target
# column from 'label' to 'labels'.
df_final = (
    pd.concat([df_fakes, df_real_welfake, df_real_bbc], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'label': 'labels'})
)

# TRUNCATION
print(f"5. Truncating texts to max {CHAR_LIMIT} chars...")
# Cast to str, then slice with the vectorised .str accessor so every article
# keeps at most CHAR_LIMIT characters.
df_final['text'] = df_final['text'].astype(str).str[:CHAR_LIMIT]

# SAVE TO DISK
# Numbered 6 so the progress log stays in order after step 5 (truncation).
print(f"6. Saving combined dataset to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)

# Final report: total row count and per-class counts of the saved dataset.
print("="*30)
print("SUCCESS! Final Dataset Stats:")
print(f"Total Rows: {len(df_final)}")
print("Class Balance (0=Fake, 1=Real):")
print(df_final['labels'].value_counts())
print("="*30)

1. Loading raw datasets...
   - Dropped ['pubDate', 'guid', 'link'] from BBC.
2. Standardizing Labels...
--> Total Fake News: 36509
--> Target Real News: 36509
    - From WELFake: 18254
    - From BBC: 18255
5. Truncating texts to max 600 chars...
4. Saving combined dataset to ../data/news_dataset.csv...
SUCCESS! Final Dataset Stats:
Total Rows: 73018
Class Balance (0=Fake, 1=Real):
labels
0    36509
1    36509
Name: count, dtype: int64
