In [1]:
import os
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image

# Fetch the NLTK stop-word corpus, then keep the English list as a set so
# the per-token membership test in the cleaning step is a hash lookup.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karbi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
# One CSV per class.
fake_path = "../data/datasets/fake-and-real-news-dataset/Fake.csv"
true_path = "../data/datasets/fake-and-real-news-dataset/True.csv"

# Label convention: 0 = row came from Fake.csv, 1 = row came from True.csv.
fake_df = pd.read_csv(fake_path).assign(label=0)
true_df = pd.read_csv(true_path).assign(label=1)

# Stack both classes, then shuffle with a fixed seed so that the row order
# (and everything derived from it) is reproducible across runs.
news_df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(news_df.head())


                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4       June 24, 2016       1  


In [3]:
def clean_text(text):
    """Normalize one raw article body into a space-separated token string.

    Steps, in order: lowercase, drop URLs (``http...`` / ``www...``), delete
    every character that is not ``a-z`` or whitespace, then remove tokens
    found in the module-level ``stop_words`` set.

    Args:
        text: Raw text. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) yield an empty string.

    Returns:
        str: The cleaned text; ``""`` when nothing survives the filtering.
    """
    # Without this guard a missing cell would be stringified into the literal
    # token "none" / "nan" and end up in the vocabulary as if it were a word.
    # (`text != text` is true only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    # Characters are deleted, not replaced by a space, so "u.s." becomes "us"
    # and "don't" becomes "dont" before the stop-word lookup below.
    text = re.sub(r'[^a-z\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Keep the normalized body in its own column, next to the untouched raw text.
news_df['clean_text'] = news_df['text'].map(clean_text)


In [4]:
# Split row positions *before* fitting the tokenizer. Fitting on the full
# corpus would let validation-only articles influence the word index (and
# which words make the 20,000-word cut), which leaks validation information
# into preprocessing. The split depends only on the number of rows,
# test_size and random_state, so it should select the same rows as splitting
# the padded matrix directly with these settings.
y_text = news_df['label'].values
train_idx, val_idx = train_test_split(
    np.arange(len(news_df)), test_size=0.2, random_state=42
)

# Vocabulary is learned from training rows only.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(news_df['clean_text'].iloc[train_idx])

# Encode every article with the train-fitted vocabulary; words the tokenizer
# does not know are encoded as the "<OOV>" token. Sequences are cut or
# zero-padded at the end to a fixed length of 300 ids.
sequences = tokenizer.texts_to_sequences(news_df['clean_text'])
padded = pad_sequences(sequences, maxlen=300, padding='post', truncating='post')

X_text = padded

X_train_text, X_val_text = X_text[train_idx], X_text[val_idx]
y_train_text, y_val_text = y_text[train_idx], y_text[val_idx]


In [5]:
fakeddit_dir = "../data/datasets/fakeddit_subset/fakeddit_subset"
train_json = os.path.join(fakeddit_dir, "training_data_fakeddit.jsonl")

# Each JSONL record is read as:
#   contents[0].parts[0].fileData.fileUri -> image URI (gs:// bucket path)
#   contents[0].parts[1].text             -> text paired with the image
#   contents[1].parts[0].text             -> answer; "yes" maps to label 1,
#                                            anything else to label 0
image_text_data = []
missing_images = 0

with open(train_json, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. stray newlines) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue

        entry = json.loads(line)
        user_parts = entry["contents"][0]["parts"]
        text = user_parts[1]["text"]
        image_uri = user_parts[0]["fileData"]["fileUri"]
        answer = entry["contents"][1]["parts"][0]["text"]
        label = 1 if answer.strip().lower() == "yes" else 0

        # Re-root the bucket URI onto the local copy of the dataset.
        local_image = image_uri.replace("gs://my_trial_bucket_finetune", fakeddit_dir)
        if os.path.exists(local_image):
            image_text_data.append((text, local_image, label))
        else:
            # Count dropped records so a broken download or a wrong path
            # is visible instead of silently shrinking the dataset.
            missing_images += 1

print(f"Loaded {len(image_text_data)} multimodal samples")
if missing_images:
    print(f"Skipped {missing_images} records whose image file was not found locally")


Loaded 4000 multimodal samples


In [6]:
# Keras image generator configured to multiply pixel values by 1/255 and to
# hold out a 0.2 validation fraction. It is only configured here; the
# TensorFlow-ready generator itself is created later, in the model training
# phase.
# NOTE: load_and_preprocess_image() already divides by 255, so arrays it
# returns should not also be rescaled by this generator.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1.0 / 255)
def load_and_preprocess_image(path, target_size=(224, 224)):
    """Load an image file as an RGB array with values divided by 255.

    Args:
        path: Location of the image file, passed to ``Image.open``.
        target_size: Size passed to ``resize`` after RGB conversion
            (PIL interprets it as ``(width, height)``).

    Returns:
        A NumPy array holding the resized RGB image divided by ``255.0``,
        or ``None`` if the image could not be opened or decoded.
    """
    try:
        # The context manager releases the file handle even when decoding
        # fails part-way through.
        with Image.open(path) as img:
            return np.array(img.convert('RGB').resize(target_size)) / 255.0
    except Exception:
        # Best-effort loader: an unreadable image yields None. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate so a
        # long loading loop can still be interrupted.
        return None


In [8]:
import os

# Make sure the folder that receives the processed artifacts exists;
# exist_ok=True means an already-present folder is not an error.
processed_dir = "../data/processed"
os.makedirs(processed_dir, exist_ok=True)


In [10]:
import random

# Persist the padded token-id matrices and their labels as one compressed
# archive; the keys below are the array names inside the .npz file.
text_splits = {
    "X_train": X_train_text,
    "X_val": X_val_text,
    "y_train": y_train_text,
    "y_val": y_val_text,
}
np.savez_compressed("../data/processed/news_text_data.npz", **text_splits)

# Save a smaller subset of multimodal for prototyping.
# A dedicated, seeded generator makes the subset identical on every run
# (consistent with the random_state=42 used for the text data) and leaves
# the global `random` state untouched.
subset = random.Random(42).sample(image_text_data, min(1000, len(image_text_data)))
pd.DataFrame(subset, columns=["text", "image_path", "label"]).to_csv(
    "../data/processed/fakeddit_sample.csv", index=False
)
