# In [None]:
# Data Exploration Notebook for Fake Image Detection
# This notebook explores the dataset and provides insights into real vs fake images

# Import necessary libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Dataset layout: one sub-folder per class (plus a test folder) under DATA_DIR
DATA_DIR = '../data'
REAL_DIR, FAKE_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, subdir) for subdir in ('real', 'fake', 'test')
)

# Notebook banner
print("🔍 FAKE IMAGE DETECTION - DATA EXPLORATION", "=" * 50, sep="\n")

# Function to get image information
def get_image_info(image_path):
    """Extract basic metadata from a single image file.

    The file is read twice: PIL supplies the dimensions, colour mode and
    container format; OpenCV supplies the channel count of the decoded array.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        A dict with the keys 'width', 'height', 'channels', 'mode',
        'format', 'file_size' (bytes), 'aspect_ratio' (width / height) and
        'total_pixels' (width * height), or None when the file cannot be
        processed. A message is printed for every failure.
    """
    try:
        # Using PIL
        with Image.open(image_path) as img:
            width, height = img.size
            mode = img.mode
            format_type = img.format

        # Using OpenCV for additional info
        cv_img = cv2.imread(image_path)
        if cv_img is None:
            # cv2.imread reports failure by returning None rather than
            # raising, so say why the file is being skipped.
            print(f"Error processing {image_path}: OpenCV could not read the image")
            return None

        # A two-dimensional array carries a single channel.
        channels = cv_img.shape[2] if cv_img.ndim == 3 else 1

        return {
            'width': width,
            'height': height,
            'channels': channels,
            'mode': mode,
            'format': format_type,
            'file_size': os.path.getsize(image_path),
            'aspect_ratio': width / height,
            'total_pixels': width * height,
        }
    except Exception as e:
        # Best-effort scan: one unreadable file must not abort the analysis.
        print(f"Error processing {image_path}: {e}")
        return None

# Function to analyze directory
def analyze_directory(directory, label):
    """Collect metadata for every image file directly inside a directory.

    Args:
        directory: Folder to scan. Sub-folders are not searched.
        label: Class label written to the 'label' column of every row.

    Returns:
        A pandas DataFrame with one row per image that get_image_info could
        process: its fields plus 'filename', 'label' and 'filepath'. Rows
        are ordered by file name. The frame is empty when the path is
        missing, is not a directory, or holds no usable images.
    """
    if not os.path.exists(directory):
        print(f"⚠️  Directory {directory} does not exist!")
        return pd.DataFrame()
    if not os.path.isdir(directory):
        # os.listdir would raise NotADirectoryError on a plain file.
        print(f"⚠️  {directory} is not a directory!")
        return pd.DataFrame()

    # str.endswith accepts a tuple, so one call covers every extension.
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')

    print(f"\n📁 Analyzing {label} images in: {directory}")

    # Sorted because os.listdir returns entries in arbitrary order; this
    # keeps the resulting table reproducible between runs.
    files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(directory, name))
    )

    print(f"Found {len(files)} images")

    image_data = []
    for filename in files:
        filepath = os.path.join(directory, filename)
        info = get_image_info(filepath)
        if info:
            image_data.append(
                {**info, 'filename': filename, 'label': label, 'filepath': filepath}
            )

    return pd.DataFrame(image_data)

# Build one metadata table per class
print("\n🔍 ANALYZING DATASET...")
real_df = analyze_directory(REAL_DIR, 'real')
fake_df = analyze_directory(FAKE_DIR, 'fake')

# Merge whichever class tables actually contain rows
populated = [frame for frame in (real_df, fake_df) if not frame.empty]
if len(populated) == 2:
    df = pd.concat(populated, ignore_index=True)
elif populated:
    # Only one class has data: use its table as-is
    df = populated[0]
else:
    print("⚠️  No images found in either directory!")
    df = pd.DataFrame()

if not df.empty:
    section_rule = "-" * 30

    # Headline image counts
    print("\n📊 DATASET OVERVIEW")
    print(section_rule)
    for caption, frame in (("Total", df), ("Real", real_df), ("Fake", fake_df)):
        print(f"{caption} images: {len(frame)}")

    # Numeric summary of every metadata column
    print("\n📈 BASIC STATISTICS")
    print(section_rule)
    print(df.describe())

    # Number of rows per class label
    print("\n🎯 CLASS DISTRIBUTION")
    print(section_rule)
    class_counts = df['label'].value_counts()
    print(class_counts)

    # Share of each class in the combined table
    total_images = len(df)
    real_percentage = len(real_df) / total_images * 100
    fake_percentage = len(fake_df) / total_images * 100

    print(f"\nReal images: {real_percentage:.1f}%")
    print(f"Fake images: {fake_percentage:.1f}%")
    
    # VISUALIZATIONS
    print("\n📊 CREATING VISUALIZATIONS...")

    # One fixed colour per class, shared by all four panels so that "real"
    # and "fake" look the same in the pie chart, scatter plot and histograms.
    class_colors = {'real': '#4ECDC4', 'fake': '#FF6B6B'}

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Dataset Overview and Distribution Analysis', fontsize=16, fontweight='bold')

    # 1. Class distribution pie chart.
    # value_counts() orders the classes by frequency, so wedge colours are
    # looked up by label instead of being assigned by position.
    colors = [class_colors[name] for name in class_counts.index]
    axes[0, 0].pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
                   colors=colors, startangle=90)
    axes[0, 0].set_title('Class Distribution', fontweight='bold')

    # 2. Image dimensions scatter plot (one series per class)
    for name, color in class_colors.items():
        subset = df[df['label'] == name]
        axes[0, 1].scatter(subset['width'], subset['height'],
                           alpha=0.6, label=name.title(), color=color)
    axes[0, 1].set_xlabel('Width (pixels)')
    axes[0, 1].set_ylabel('Height (pixels)')
    axes[0, 1].set_title('Image Dimensions Distribution', fontweight='bold')
    axes[0, 1].legend()
    axes[0, 1].grid(True, alpha=0.3)

    # 3. + 4. Overlaid per-class histograms: file size and aspect ratio
    hist_panels = (
        (axes[1, 0], 'file_size', 'File Size (bytes)', 'File Size Distribution'),
        (axes[1, 1], 'aspect_ratio', 'Aspect Ratio', 'Aspect Ratio Distribution'),
    )
    for ax, column, xlabel, title in hist_panels:
        for name, color in class_colors.items():
            ax.hist(df[df['label'] == name][column],
                    bins=30, alpha=0.7, label=name.title(), color=color)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
    
    # Advanced visualizations with Plotly
    print("\n🎨 ADVANCED INTERACTIVE VISUALIZATIONS")

    # Interactive width/height scatter, coloured by class label
    fig_scatter = px.scatter(
        df,
        x='width',
        y='height',
        color='label',
        title='Interactive Image Dimensions Analysis',
        labels={'width': 'Width (pixels)', 'height': 'Height (pixels)'},
        hover_data=['filename', 'file_size', 'aspect_ratio'],
    )
    fig_scatter.show()

    # 2x2 grid of box plots: one panel per metric, one box per class.
    # Each entry is (column, caption, grid row, grid column).
    box_panels = (
        ('file_size', 'File Size', 1, 1),
        ('aspect_ratio', 'Aspect Ratio', 1, 2),
        ('width', 'Width', 2, 1),
        ('height', 'Height', 2, 2),
    )

    fig_box = make_subplots(
        rows=2, cols=2,
        subplot_titles=tuple(f'{caption} Distribution' for _, caption, _, _ in box_panels),
        specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)],
    )

    # Traces are added panel by panel, classes in order of first appearance
    for column, caption, grid_row, grid_col in box_panels:
        for class_name in df['label'].unique():
            values = df[df['label'] == class_name][column]
            fig_box.add_trace(
                go.Box(y=values, name=f'{class_name.title()} - {caption}'),
                row=grid_row, col=grid_col,
            )

    fig_box.update_layout(height=800, title_text="Statistical Distribution Analysis")
    fig_box.show()
    
    # Detailed statistics comparison
    print("\n📋 DETAILED STATISTICS COMPARISON")
    print("-" * 50)

    # The same four summary statistics for every numeric column, per class
    numeric_columns = ['width', 'height', 'file_size', 'aspect_ratio', 'total_pixels']
    per_class = df.groupby('label')
    aggregated = per_class.agg(
        {column: ['mean', 'std', 'min', 'max'] for column in numeric_columns}
    )
    stats_comparison = aggregated.round(2)

    print(stats_comparison)

    # Format analysis: image count per (class, container format)
    print("\n🎭 IMAGE FORMAT ANALYSIS")
    print("-" * 30)
    format_counts = df.groupby(['label', 'format']).size()
    format_analysis = format_counts.unstack(fill_value=0)
    print(format_analysis)

    # Color mode analysis: image count per (class, colour mode)
    print("\n🎨 COLOR MODE ANALYSIS")
    print("-" * 30)
    mode_counts = df.groupby(['label', 'mode']).size()
    mode_analysis = mode_counts.unstack(fill_value=0)
    print(mode_analysis)
    
    # Sample images display
    print("\n🖼️  SAMPLE IMAGES VISUALIZATION")
    print("-" * 30)

    # Function to display sample images
    def display_sample_images(df, label, num_samples=5):
        """Show a random selection of images of one class side by side.

        Args:
            df: Metadata table with the columns 'label', 'filepath',
                'filename', 'width' and 'height'.
            label: Class whose images are shown.
            num_samples: Upper bound on the number of images drawn.

        Returns:
            None. The figure is shown with plt.show(). An image that cannot
            be loaded is replaced by a text placeholder. When the class has
            no rows a message is printed and no figure is created.
        """
        class_rows = df[df['label'] == label]
        sample_df = class_rows.sample(min(num_samples, len(class_rows)))

        if sample_df.empty:
            # plt.subplots cannot build a grid with zero columns.
            print(f"No {label} images available to display")
            return

        # squeeze=False always yields a 2-D array of axes, so a single
        # sample needs no special case; take the only row.
        fig, axes = plt.subplots(1, len(sample_df), figsize=(15, 3), squeeze=False)
        axes = axes[0]
        fig.suptitle(f'Sample {label.title()} Images', fontsize=14, fontweight='bold')

        for ax, (_, row) in zip(axes, sample_df.iterrows()):
            try:
                img = cv2.imread(row['filepath'])
                # cv2.imread returns None instead of raising for an
                # unreadable file, so test for it explicitly.
                img_rgb = None if img is None else cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            except cv2.error:
                img_rgb = None

            if img_rgb is None:
                ax.text(0.5, 0.5, f'Error loading\n{row["filename"]}',
                        ha='center', va='center', transform=ax.transAxes)
            else:
                ax.imshow(img_rgb)
                ax.set_title(f"{row['filename']}\n{row['width']}x{row['height']}")
            ax.axis('off')

        plt.tight_layout()
        plt.show()

    # Display sample images for each class
    if len(real_df) > 0:
        display_sample_images(df, 'real', 5)
    if len(fake_df) > 0:
        display_sample_images(df, 'fake', 5)
    
    # Generate summary report
    print("\n📄 SUMMARY REPORT")
    print("=" * 50)

    # Series.mode() ignores nulls by default, so it can come back empty even
    # though the column has rows; the 'N/A' fallback is therefore keyed on
    # the mode result rather than on the column.
    format_modes = df['format'].mode()
    color_mode_modes = df['mode'].mode()
    most_common_format = format_modes.iloc[0] if not format_modes.empty else 'N/A'
    most_common_mode = color_mode_modes.iloc[0] if not color_mode_modes.empty else 'N/A'

    print("Dataset Analysis Complete!")
    print(f"• Total Images Analyzed: {len(df)}")
    print(f"• Real Images: {len(real_df)} ({real_percentage:.1f}%)")
    print(f"• Fake Images: {len(fake_df)} ({fake_percentage:.1f}%)")
    print(f"• Average Image Size: {df['width'].mean():.0f}x{df['height'].mean():.0f}")
    print(f"• Most Common Format: {most_common_format}")
    print(f"• Most Common Color Mode: {most_common_mode}")
    print(f"• Average File Size: {df['file_size'].mean()/1024:.1f} KB")

    # Data quality checks
    print("\n🔍 DATA QUALITY CHECKS")
    print("-" * 30)
    print(f"• Missing Values: {df.isnull().sum().sum()}")
    # Compares file names only, so equal names in different class folders
    # are counted as duplicates as well.
    print(f"• Duplicate Files: {df['filename'].duplicated().sum()}")
    # "Unusual" means either side is shorter than 50 pixels.
    print(f"• Images with Unusual Dimensions: {len(df[(df['width'] < 50) | (df['height'] < 50)])}")
    print(f"• Very Large Images (>10MB): {len(df[df['file_size'] > 10*1024*1024])}")
    
    # Save analysis results
    print("\n💾 SAVING ANALYSIS RESULTS")
    print("-" * 30)

    # Output files are placed in DATA_DIR so they follow the dataset when
    # the data location is changed.
    analysis_path = os.path.join(DATA_DIR, 'dataset_analysis.csv')
    summary_path = os.path.join(DATA_DIR, 'dataset_summary.csv')

    # Save the per-image metadata table to CSV
    df.to_csv(analysis_path, index=False)
    print(f"✅ Dataset analysis saved to '{analysis_path}'")

    # Series.mode() ignores nulls by default and may be empty even when the
    # column has rows, so the 'N/A' fallback checks the mode result itself.
    saved_format_modes = df['format'].mode()
    saved_color_modes = df['mode'].mode()

    # Save summary statistics (a single-row table)
    summary_stats = {
        'total_images': len(df),
        'real_images': len(real_df),
        'fake_images': len(fake_df),
        'real_percentage': real_percentage,
        'fake_percentage': fake_percentage,
        'avg_width': df['width'].mean(),
        'avg_height': df['height'].mean(),
        'avg_file_size': df['file_size'].mean(),
        'most_common_format': saved_format_modes.iloc[0] if not saved_format_modes.empty else 'N/A',
        'most_common_mode': saved_color_modes.iloc[0] if not saved_color_modes.empty else 'N/A'
    }

    summary_df = pd.DataFrame([summary_stats])
    summary_df.to_csv(summary_path, index=False)
    print(f"✅ Summary statistics saved to '{summary_path}'")
    
else:
    print("⚠️  No images found for analysis!")
    print("Please ensure you have images in the following directories:")
    print(f"• Real images: {REAL_DIR}")
    print(f"• Fake images: {FAKE_DIR}")

print(f"\n🎉 DATA EXPLORATION COMPLETE!")
print("=" * 50)