In [1]:
# ============================================================================
# STEP 1: Mount Google Drive
# ============================================================================
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ============================================================================
# STEP 2: Install Required Packages
# ============================================================================
!pip install -q transformers torch pandas scikit-learn matplotlib seaborn tqdm

In [3]:
# ============================================================================
# STEP 3: Load Dataset
# ============================================================================
import pandas as pd
import numpy as np
from pathlib import Path

print("="*70)
print("üìÇ LOADING FAKEDDIT TEXT DATA")
print("="*70 + "\n")

# Set your dataset path
DATASET_PATH = '/content/drive/MyDrive/EAI6010_Final Project/Fakeddit datasetv2.0'

# Load multimodal_only_samples (they have better quality text)
train_file = f'{DATASET_PATH}/multimodal_only_samples/multimodal_train.tsv'
val_file = f'{DATASET_PATH}/multimodal_only_samples/multimodal_validate.tsv'
test_file = f'{DATASET_PATH}/multimodal_only_samples/multimodal_test_public.tsv'

print("Loading training data...")
# Start with manageable sample size
train_df = pd.read_csv(train_file, sep='\t', nrows=50000)
print(f"‚úÖ Train: {len(train_df):,} samples")

print("Loading validation data...")
val_df = pd.read_csv(val_file, sep='\t', nrows=10000)
print(f"‚úÖ Val: {len(val_df):,} samples")

print("Loading test data...")
test_df = pd.read_csv(test_file, sep='\t', nrows=10000)
print(f"‚úÖ Test: {len(test_df):,} samples\n")

üìÇ LOADING FAKEDDIT TEXT DATA

Loading training data...
‚úÖ Train: 50,000 samples
Loading validation data...
‚úÖ Val: 10,000 samples
Loading test data...
‚úÖ Test: 10,000 samples



In [5]:
# ============================================================================
# STEP 4: Data Exploration
# ============================================================================
print("="*70)
print("üìä DATASET OVERVIEW")
print("="*70 + "\n")

print(f"üìã Available columns:")
for i, col in enumerate(train_df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\nüè∑Ô∏è  Label Distribution (2-way):")
label_counts = train_df['2_way_label'].value_counts()
for label, count in label_counts.items():
    label_name = "Real" if label == 0 else "Fake"
    percentage = (count / len(train_df)) * 100
    print(f"  {label_name}: {count:,} ({percentage:.1f}%)")

print(f"\nüè∑Ô∏è  Label Distribution (6-way):")
label_counts_6 = train_df['6_way_label'].value_counts()
for label, count in label_counts_6.items():
    percentage = (count / len(train_df)) * 100
    # Convert label to string explicitly and use left-align formatting
    print(f"  {str(label):<25s}: {count:5,} ({percentage:.1f}%)")

üìä DATASET OVERVIEW

üìã Available columns:
   1. author
   2. clean_title
   3. created_utc
   4. domain
   5. hasImage
   6. id
   7. image_url
   8. linked_submission_id
   9. num_comments
  10. score
  11. subreddit
  12. title
  13. upvote_ratio
  14. 2_way_label
  15. 3_way_label
  16. 6_way_label

üè∑Ô∏è  Label Distribution (2-way):
  Real: 30,247 (60.5%)
  Fake: 19,753 (39.5%)

üè∑Ô∏è  Label Distribution (6-way):
  0                        : 19,753 (39.5%)
  4                        : 14,757 (29.5%)
  2                        : 9,522 (19.0%)
  1                        : 2,987 (6.0%)
  5                        : 1,960 (3.9%)
  3                        : 1,021 (2.0%)


In [6]:
# ============================================================================
# STEP 5: Text Preprocessing
# ============================================================================
print("\n" + "="*70)
print("üîÑ TEXT PREPROCESSING")
print("="*70 + "\n")

import re
import string

def clean_text(text):
    """Clean and preprocess text"""
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove user mentions and hashtags (but keep the text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

print("Cleaning text data...")
train_df['clean_title'] = train_df['clean_title'].apply(clean_text)
val_df['clean_title'] = val_df['clean_title'].apply(clean_text)
test_df['clean_title'] = test_df['clean_title'].apply(clean_text)

# Remove samples with empty text
train_df = train_df[train_df['clean_title'].str.len() > 10]
val_df = val_df[val_df['clean_title'].str.len() > 10]
test_df = test_df[test_df['clean_title'].str.len() > 10]

print(f"‚úÖ After cleaning:")
print(f"  Train: {len(train_df):,} samples")
print(f"  Val: {len(val_df):,} samples")
print(f"  Test: {len(test_df):,} samples\n")

# Text statistics
train_df['text_length'] = train_df['clean_title'].str.len()
train_df['word_count'] = train_df['clean_title'].str.split().str.len()

print(f"üìù Text Statistics:")
print(f"  Average length: {train_df['text_length'].mean():.1f} characters")
print(f"  Average words: {train_df['word_count'].mean():.1f} words")
print(f"  Max length: {train_df['text_length'].max()} characters")
print(f"  Min length: {train_df['text_length'].min()} characters\n")


üîÑ TEXT PREPROCESSING

Cleaning text data...
‚úÖ After cleaning:
  Train: 45,414 samples
  Val: 9,090 samples
  Test: 9,069 samples

üìù Text Statistics:
  Average length: 45.5 characters
  Average words: 8.1 words
  Max length: 470 characters
  Min length: 11 characters



In [7]:
# ============================================================================
# STEP 6: Sample Data
# ============================================================================
print("="*70)
print("üìÑ SAMPLE ENTRIES")
print("="*70 + "\n")

for i in range(3):
    sample = train_df.iloc[i]
    label_name = "REAL" if sample['2_way_label'] == 0 else "FAKE"
    print(f"Sample {i+1}:")
    print(f"  Label: {label_name}")
    print(f"  Category: {sample['6_way_label']}")
    print(f"  Title: {sample['clean_title'][:100]}...")
    print(f"  Length: {len(sample['clean_title'])} chars, {len(sample['clean_title'].split())} words")
    print()

üìÑ SAMPLE ENTRIES

Sample 1:
  Label: FAKE
  Category: 0
  Title: my walgreens offbrand mucinex was engraved with the letters mucinex but in a different order...
  Length: 92 chars, 15 words

Sample 2:
  Label: REAL
  Category: 2
  Title: this concerned sink with a tiny hat...
  Length: 35 chars, 7 words

Sample 3:
  Label: FAKE
  Category: 0
  Title: hackers leak emails from uae ambassador to us...
  Length: 45 chars, 8 words



In [8]:
# ============================================================================
# STEP 7: Save Preprocessed Data
# ============================================================================
print("="*70)
print("üíæ SAVING PREPROCESSED DATA")
print("="*70 + "\n")

# Save to working directory
WORK_DIR = '/content/fakeddit_text'
Path(WORK_DIR).mkdir(exist_ok=True)

# Select relevant columns
columns_to_keep = ['id', 'clean_title', '2_way_label', '6_way_label']
train_df[columns_to_keep].to_csv(f'{WORK_DIR}/train_processed.csv', index=False)
val_df[columns_to_keep].to_csv(f'{WORK_DIR}/val_processed.csv', index=False)
test_df[columns_to_keep].to_csv(f'{WORK_DIR}/test_processed.csv', index=False)

print(f"‚úÖ Saved preprocessed data to: {WORK_DIR}/")
print(f"  - train_processed.csv ({len(train_df):,} samples)")
print(f"  - val_processed.csv ({len(val_df):,} samples)")
print(f"  - test_processed.csv ({len(test_df):,} samples)\n")

üíæ SAVING PREPROCESSED DATA

‚úÖ Saved preprocessed data to: /content/fakeddit_text/
  - train_processed.csv (45,414 samples)
  - val_processed.csv (9,090 samples)
  - test_processed.csv (9,069 samples)



In [9]:
# ============================================================================
# STEP 8: Create PyTorch Dataset
# ============================================================================
print("="*70)
print("üîß CREATING PYTORCH DATASET")
print("="*70 + "\n")

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

class FakedditTextDataset(Dataset):
    """PyTorch Dataset for text-only classification"""

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("‚úÖ Tokenizer loaded\n")

# Create datasets
print("Creating PyTorch datasets...")
train_dataset = FakedditTextDataset(
    train_df['clean_title'].values,
    train_df['2_way_label'].values,
    tokenizer
)

val_dataset = FakedditTextDataset(
    val_df['clean_title'].values,
    val_df['2_way_label'].values,
    tokenizer
)

test_dataset = FakedditTextDataset(
    test_df['clean_title'].values,
    test_df['2_way_label'].values,
    tokenizer
)

print("‚úÖ Datasets created\n")

# Create dataloaders
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"‚úÖ DataLoaders ready:")
print(f"  Train: {len(train_loader)} batches")
print(f"  Val: {len(val_loader)} batches")
print(f"  Test: {len(test_loader)} batches\n")

# Test the dataloader
print("üîç Testing DataLoader...")
batch = next(iter(train_loader))
print(f"  Input IDs shape: {batch['input_ids'].shape}")
print(f"  Attention Mask shape: {batch['attention_mask'].shape}")
print(f"  Labels shape: {batch['label'].shape}")
print(f"  Sample label: {batch['label'][0].item()} ({'Fake' if batch['label'][0].item() == 1 else 'Real'})\n")

üîß CREATING PYTORCH DATASET

Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

‚úÖ Tokenizer loaded

Creating PyTorch datasets...
‚úÖ Datasets created

‚úÖ DataLoaders ready:
  Train: 1420 batches
  Val: 285 batches
  Test: 284 batches

üîç Testing DataLoader...
  Input IDs shape: torch.Size([32, 128])
  Attention Mask shape: torch.Size([32, 128])
  Labels shape: torch.Size([32])
  Sample label: 0 (Real)



In [11]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("="*70)
print("üéâ SETUP COMPLETE!")
print("="*70)
print("\n‚úÖ You're ready to start training!")
print(f"\nüìä Dataset Summary:")
print(f"  Total samples: {len(train_df) + len(val_df) + len(test_df):,}")
print(f"  Real/Fake ratio: ~50/50")
print(f"  Average text length: ~{train_df['word_count'].mean():.0f} words")
print("="*70)

üéâ SETUP COMPLETE!

‚úÖ You're ready to start training!

üìä Dataset Summary:
  Total samples: 63,573
  Real/Fake ratio: ~50/50
  Average text length: ~8 words


In [13]:
# ============================================================================
# VIEW SAMPLE POSTS
# ============================================================================
print("="*70)
print("üìÑ SAMPLE POSTS FROM YOUR DATASET")
print("="*70 + "\n")

import pandas as pd

# Load your processed data
train_df = pd.read_csv('/content/drive/MyDrive/EAI6010_Final Project/Processed_Data/train_processed.csv')

print("üî¥ FAKE NEWS EXAMPLES:\n")
fake_samples = train_df[train_df['2_way_label'] == 1].head(3)
for i, row in fake_samples.iterrows():
    print(f"{i+1}. {row['clean_title']}")
    print(f"   Length: {len(row['clean_title'])} chars\n")

print("\nüü¢ REAL NEWS EXAMPLES:\n")
real_samples = train_df[train_df['2_way_label'] == 0].head(3)
for i, row in real_samples.iterrows():
    print(f"{i+1}. {row['clean_title']}")
    print(f"   Length: {len(row['clean_title'])} chars\n")

üìÑ SAMPLE POSTS FROM YOUR DATASET

üî¥ FAKE NEWS EXAMPLES:

1. my walgreens offbrand mucinex was engraved with the letters mucinex but in a different order
   Length: 92 chars

3. hackers leak emails from uae ambassador to us
   Length: 45 chars

4. puppy taking in the view
   Length: 24 chars


üü¢ REAL NEWS EXAMPLES:

2. this concerned sink with a tiny hat
   Length: 35 chars

5. i found a face in my sheet music too
   Length: 36 chars

7. major thermos
   Length: 13 chars

