In [3]:
# --- Cell 1: Imports ---
import pandas as pd
import numpy as np
import os

# Set base data path
DATA_PATH = os.path.join("..", "data")  # one level up from notebooks/

# --- Cell 2: Load the datasets ---
fake = pd.read_csv(os.path.join(DATA_PATH, "Fake1.csv"))
true = pd.read_csv(os.path.join(DATA_PATH, "True1.csv"))

# Optional additional datasets
try:
    ifnd = pd.read_csv(os.path.join(DATA_PATH, "IFND.csv"))
    news_extra = pd.read_csv(os.path.join(DATA_PATH, "news_dataset.csv"))
    print("✅ Loaded all datasets successfully.")
except Exception as e:
    print("⚠️ Some optional datasets not found:", e)

# --- Cell 3: Add labels ---
fake['label'] = 0
true['label'] = 1

# --- Cell 4: Combine base datasets ---
combined = pd.concat([fake[['text', 'label']], true[['text', 'label']]], axis=0)
combined = combined.drop_duplicates().dropna()

# --- Cell 5: Optional — merge with extra datasets if columns match ---
try:
    if 'text' in ifnd.columns and 'label' in ifnd.columns:
        combined = pd.concat([combined, ifnd[['text', 'label']]], axis=0)
    print("✅ IFND merged successfully.")
except:
    pass

# --- Cell 6: Clean text ---
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)           # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)       # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()      # remove extra spaces
    return text

combined['text'] = combined['text'].apply(clean_text)
combined = combined[combined['text'].str.len() > 20]

print("✅ Cleaned text and removed very short samples.")
print("Final dataset size:", combined.shape)

# --- Cell 7: Shuffle and Save ---
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

output_path = os.path.join(DATA_PATH, "final_dataset.csv")
combined.to_csv(output_path, index=False)
print(f"✅ Saved cleaned dataset to {output_path}")
combined.head()


⚠️ Some optional datasets not found: 'utf-8' codec can't decode byte 0xd4 in position 5362: invalid continuation byte
✅ Cleaned text and removed very short samples.
Final dataset size: (38561, 2)
✅ Saved cleaned dataset to ..\data\final_dataset.csv


Unnamed: 0,text,label
0,on thursday donald trump tweeted that if china...,0
1,sacramento calif reuters california governor j...,1
2,washington reuters the fbi report scolding dem...,1
3,springfield ill reuters the democratcontrolled...,1
4,while trump says he s like a smart person and ...,0
