[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnthonyJia/DS4002_CS3/blob/main/Download_data_and_eda.ipynb)

In [None]:
# Mount Google Drive inside the Colab VM so later cells can read files under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import os
import zipfile
import shutil

# Source locations on the mounted Google Drive.
# Edit both paths to match where the datasets live in your own Drive.
FFHQ_PATH = '/content/drive/MyDrive/ai_classification_dataset/ffhq'
STYLEGAN_PATH = '/content/drive/MyDrive/ai_classification_dataset/stylegan'

# Destination folders: one sub-folder per class under a common output root.
OUTPUT_DIR = '/content/dataset'
REAL_DIR = os.path.join(OUTPUT_DIR, 'real_images')
FAKE_DIR = os.path.join(OUTPUT_DIR, 'fake_images')

# exist_ok=True keeps this cell safe to re-run.
for output_subdir in (REAL_DIR, FAKE_DIR):
    os.makedirs(output_subdir, exist_ok=True)

In [None]:
import glob

# Copy real face images from the FFHQ folders into REAL_DIR.
# Folders are named by number (00000, 01000, ...). With the limit at 9000 the
# folders 00000 - 09000 are copied (10k images). Raise the limit to include
# more, e.g. 15000 copies folders 00000 - 15000 (16k images).
MAX_REAL_FOLDER = 9000

for folder in sorted(glob.glob(os.path.join(FFHQ_PATH, '*'))):
    folder_name = os.path.basename(folder)

    # The '*' pattern also matches stray files and non-numeric folders;
    # int() / copytree would raise on those, so skip them.
    if not (os.path.isdir(folder) and folder_name.isdecimal()):
        continue

    if int(folder_name) <= MAX_REAL_FOLDER:
        dest_folder = os.path.join(REAL_DIR, folder_name)
        # dirs_exist_ok=True lets the cell be re-run without failing.
        shutil.copytree(folder, dest_folder, dirs_exist_ok=True)


In [None]:
import glob

# Extract fake face images from the StyleGAN zip archives into FAKE_DIR.
# Archives are named by number (00000.zip, 01000.zip, ...). With the limit at
# 9000 the archives 00000 - 09000 are extracted (10k images). Raise the limit
# to include more, e.g. 15000 extracts 00000 - 15000 (16k images).
MAX_FAKE_FOLDER = 9000

for zip_path in sorted(glob.glob(os.path.join(STYLEGAN_PATH, '*.zip'))):
    # Archive name without the extension doubles as the output sub-folder name.
    folder_name = os.path.splitext(os.path.basename(zip_path))[0]

    # Skip archives whose name is not a number instead of crashing in int().
    if not folder_name.isdecimal():
        continue

    # `continue` rather than `break`: the list is sorted as strings, which only
    # matches numeric order when every name is zero-padded to the same width.
    if int(folder_name) > MAX_FAKE_FOLDER:
        continue

    dest_folder = os.path.join(FAKE_DIR, folder_name)
    os.makedirs(dest_folder, exist_ok=True)

    # Extract zip contents into that sub-folder.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dest_folder)

In [None]:
from pathlib import Path

# Count every PNG found anywhere below each class folder.
real_count = sum(1 for _ in Path(REAL_DIR).rglob('*.png'))
fake_count = sum(1 for _ in Path(FAKE_DIR).rglob('*.png'))

print(f"Real images: {real_count}")
print(f"Fake images: {fake_count}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Tabulate the counts computed in the previous cell.
image_counts = {'Real Images': real_count, 'Fake Images': fake_count}

df_counts = pd.DataFrame({
    'Category': list(image_counts.keys()),
    'Count': list(image_counts.values()),
})
categories = df_counts['Category']
counts = df_counts['Count']

# Bar chart comparing the two class sizes.
plt.figure(figsize=(8, 6))
plt.bar(categories, counts, color=['blue', 'orange'])
plt.title('Distribution of Real vs. Fake Images')
plt.xlabel('Image Category')
plt.ylabel('Number of Images')
plt.grid(axis='y', linestyle='--')

# Write each count slightly above its bar.
for category, count in zip(categories, counts):
    plt.text(category, count + 50, str(count), color='black', ha='center')

plt.show()

## Visual check
Sample 5 real and 5 fake images, display them in a grid, and review them for visual characteristics such as centering, cropping, alignment, compression artifacts, texture patterns, lighting, and color tones.

In [None]:
import random
from pathlib import Path

# Draw a small, reproducible sample of each class for the visual check.
VISUAL_CHECK_SAMPLE_SIZE = 5
VISUAL_CHECK_SEED = 42
# A dedicated generator keeps this sample independent of any other use of
# the global `random` state.
visual_check_rng = random.Random(VISUAL_CHECK_SEED)

# rglob gives no ordering guarantee, so sort before sampling; otherwise the
# same seed could still select different files on another run.
real_image_paths = sorted(Path(REAL_DIR).rglob('*.png'))
fake_image_paths = sorted(Path(FAKE_DIR).rglob('*.png'))

# Cap the sample size at what is available: random.sample raises ValueError
# when asked for more items than the population holds.
sampled_real_images = visual_check_rng.sample(
    real_image_paths, min(VISUAL_CHECK_SAMPLE_SIZE, len(real_image_paths))
)
sampled_fake_images = visual_check_rng.sample(
    fake_image_paths, min(VISUAL_CHECK_SAMPLE_SIZE, len(fake_image_paths))
)

print("\n--- Sampled Real Image Paths ---")
for img_path in sampled_real_images:
    print(img_path)

print("\n--- Sampled Fake Image Paths ---")
for img_path in sampled_fake_images:
    print(img_path)

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

def display_images_in_grid(image_paths, title, num_rows, num_cols):
    """Show images side by side in a num_rows x num_cols matplotlib grid.

    Parameters
    ----------
    image_paths : sequence of path-like
        Files to open with PIL and draw, one per grid cell, in order.
    title : str
        Figure-level title.
    num_rows, num_cols : int
        Grid shape. Paths beyond num_rows * num_cols are not drawn.

    Returns
    -------
    None
        The figure is rendered with plt.show().
    """
    # plt.subplot raises for positions outside the grid, so only draw as many
    # images as there are cells.
    max_cells = num_rows * num_cols

    plt.figure(figsize=(num_cols * 4, num_rows * 4))
    plt.suptitle(title, fontsize=16)
    for i, img_path in enumerate(list(image_paths)[:max_cells]):
        plt.subplot(num_rows, num_cols, i + 1)
        # Context manager releases the file handle once the image is drawn,
        # matching how the other cells in this notebook open images.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(f'Image {i+1}')
        plt.axis('off')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout to prevent title overlap
    plt.show()

# Render one single-row, five-column grid per class: real first, then fake.
sample_grids = (
    (sampled_real_images, 'Sampled Real Images'),
    (sampled_fake_images, 'Sampled Fake Images'),
)
for grid_paths, grid_title in sample_grids:
    display_images_in_grid(grid_paths, grid_title, 1, 5)

## Randomly sample 500 images from each dataset to perform EDA
Randomly select 500 image paths from `REAL_DIR` and 500 image paths from `FAKE_DIR` to create a manageable subset for analysis.

In [None]:
import random
from pathlib import Path

# Draw a reproducible subset of each class for the EDA cells that follow.
EDA_SAMPLE_SIZE = 500
EDA_SAMPLE_SEED = 42
# A dedicated generator keeps this sample independent of any other use of
# the global `random` state.
eda_rng = random.Random(EDA_SAMPLE_SEED)

# rglob gives no ordering guarantee, so sort before sampling; otherwise the
# same seed could still select different files on another run.
all_real_image_paths = sorted(Path(REAL_DIR).rglob('*.png'))
all_fake_image_paths = sorted(Path(FAKE_DIR).rglob('*.png'))

# Cap the sample size at what is available: random.sample raises ValueError
# when asked for more items than the population holds.
sampled_real_images_subset = eda_rng.sample(
    all_real_image_paths, min(EDA_SAMPLE_SIZE, len(all_real_image_paths))
)
sampled_fake_images_subset = eda_rng.sample(
    all_fake_image_paths, min(EDA_SAMPLE_SIZE, len(all_fake_image_paths))
)

# The printed counts reveal when fewer images than requested were available.
print(f"Number of sampled real images: {len(sampled_real_images_subset)}")
print(f"Number of sampled fake images: {len(sampled_fake_images_subset)}")

print("\nFirst 5 sampled real image paths:")
for i, path in enumerate(sampled_real_images_subset[:5], start=1):
    print(f"  {i}. {path}")

print("\nFirst 5 sampled fake image paths:")
for i, path in enumerate(sampled_fake_images_subset[:5], start=1):
    print(f"  {i}. {path}")

## Extract Image Properties

For each image in the sampled subset, extract its dimensions (width and height), calculate its aspect ratio, and compute its average RGB color values. Store these properties in a structured format, like a Pandas DataFrame.


In [None]:
import pandas as pd
from PIL import Image

# Per-image records; the loops further down this cell append one dict per image.
image_data = []

def get_avg_rgb(image):
    """Return the mean red, green and blue value of an image.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to summarise. It is converted to RGB first when its mode is
        anything else.

    Returns
    -------
    tuple of float
        (avg_r, avg_g, avg_b): each channel's sum divided by the pixel count.
    """
    # Local import: this cell's own import lines only bring in pandas and PIL.
    import numpy as np

    # Ensure image is in RGB format
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # One row per pixel, one column per channel. This avoids building a Python
    # tuple for every pixel and walking the pixel list three times.
    channels = np.asarray(image).reshape(-1, 3)
    num_pixels = channels.shape[0]

    # Sum in int64 so the totals are exact integers, then divide as Python
    # ints; this yields the same floats as summing the pixels one by one.
    r_sum, g_sum, b_sum = (int(total) for total in channels.sum(axis=0, dtype=np.int64))

    return r_sum / num_pixels, g_sum / num_pixels, b_sum / num_pixels

# Extract size, aspect ratio and mean colour for every sampled image.
# Both classes go through the same loop so their records always share one schema.
PROPERTY_COLUMNS = [
    'path', 'category', 'width', 'height', 'aspect_ratio',
    'avg_r', 'avg_g', 'avg_b',
]
sampled_subsets = (
    ('real', sampled_real_images_subset),
    ('fake', sampled_fake_images_subset),
)

for category, subset_paths in sampled_subsets:
    for img_path in subset_paths:
        try:
            with Image.open(img_path) as img:
                width, height = img.size
                aspect_ratio = width / height
                avg_r, avg_g, avg_b = get_avg_rgb(img)
            image_data.append({
                'path': str(img_path),
                'category': category,
                'width': width,
                'height': height,
                'aspect_ratio': aspect_ratio,
                'avg_r': avg_r,
                'avg_g': avg_g,
                'avg_b': avg_b
            })
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing {category} image {img_path}: {e}")

# Convert to DataFrame. Passing the column list keeps the expected columns
# present even if no image could be processed.
df_image_properties = pd.DataFrame(image_data, columns=PROPERTY_COLUMNS)

print(f"DataFrame created with {len(df_image_properties)} entries.")
print(df_image_properties.head())

In [None]:
import pandas as pd

# Summarise each numeric property separately for the real and fake groups.
properties_to_analyze = ['width', 'height', 'aspect_ratio', 'avg_r', 'avg_g', 'avg_b']
properties_by_category = df_image_properties.groupby('category')[properties_to_analyze]
descriptive_stats = properties_by_category.describe()

print("\n--- Descriptive Statistics for Image Properties (Real vs. Fake) ---")
print(descriptive_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram panel per colour channel, real vs. fake overlaid.
category_palette = {'real': 'blue', 'fake': 'orange'}
rgb_channels = [('avg_r', 'Red'), ('avg_g', 'Green'), ('avg_b', 'Blue')]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Distribution of Average RGB Color Values (Real vs. Fake)', fontsize=16)

for channel_ax, (column, channel_name) in zip(axes, rgb_channels):
    sns.histplot(data=df_image_properties, x=column, hue='category', kde=True,
                 ax=channel_ax, palette=category_palette)
    channel_ax.set_title(f'Average {channel_name} ({column})')
    channel_ax.set_xlabel(f'Average {channel_name} Value')
    channel_ax.set_ylabel('Frequency')

    # Restyle the legend seaborn already built instead of calling
    # legend(labels=[...]): a labels-only call pairs labels with artists purely
    # by draw order, so it can silently swap "Real" and "Fake".
    channel_legend = channel_ax.get_legend()
    if channel_legend is not None:
        channel_legend.set_title('Category')
        for legend_text in channel_legend.get_texts():
            legend_text.set_text(legend_text.get_text().capitalize())

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


## Calculate Brightness

### Subtask:
Calculate a brightness metric (e.g., luminance) for each image in the `df_image_properties` DataFrame, using the previously extracted `avg_r`, `avg_g`, and `avg_b` values. A common formula for luminance is `0.299*R + 0.587*G + 0.114*B`.


In [None]:
# Weighted sum of the per-image channel means, stored as a new column.
mean_red = df_image_properties['avg_r']
mean_green = df_image_properties['avg_g']
mean_blue = df_image_properties['avg_b']
df_image_properties['brightness'] = 0.299 * mean_red + 0.587 * mean_green + 0.114 * mean_blue

print("DataFrame with new 'brightness' column:")
print(df_image_properties.head())

In [None]:
# Summarise brightness separately for the real and fake groups.
brightness_by_category = df_image_properties.groupby('category')['brightness']
brightness_stats = brightness_by_category.describe()

print("\n--- Descriptive Statistics for Brightness (Real vs. Fake) ---")
print(brightness_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlaid brightness histograms (with KDE curves) for real vs. fake images.
plt.figure(figsize=(10, 6))
brightness_ax = sns.histplot(data=df_image_properties, x='brightness', hue='category',
                             kde=True, palette={'real': 'blue', 'fake': 'orange'})
plt.title('Distribution of Image Brightness (Real vs. Fake)')
plt.xlabel('Brightness Value (Luminance)')
plt.ylabel('Frequency')

# Restyle the legend seaborn already built instead of calling
# legend(labels=[...]): a labels-only call pairs labels with artists purely by
# draw order, so it can silently swap "Real" and "Fake".
brightness_legend = brightness_ax.get_legend()
if brightness_legend is not None:
    brightness_legend.set_title('Category')
    for legend_text in brightness_legend.get_texts():
        legend_text.set_text(legend_text.get_text().capitalize())

plt.grid(axis='y', linestyle='--')
plt.show()

## Analyze texture/level of detail
For each image in the `df_image_properties` DataFrame, convert the image to grayscale, compute its 2D Fast Fourier Transform (FFT), and take the magnitude spectrum. Store the average magnitude as a new `avg_magnitude` column in `df_image_properties`.

In [None]:
import numpy as np
from PIL import Image

def calculate_avg_magnitude_spectrum(image_path):
    """Return the mean of the 2D FFT magnitude spectrum of a grayscale image.

    Parameters
    ----------
    image_path : path-like
        Image file to open with PIL.

    Returns
    -------
    float
        Mean magnitude of the shifted FFT, or NaN when any step fails; the
        error is printed rather than raised.
    """
    try:
        # Read the file and keep only a grayscale pixel array.
        with Image.open(image_path) as source_image:
            grayscale_pixels = np.array(source_image.convert('L'))

        # 2D FFT with the zero-frequency component moved to the centre.
        centered_spectrum = np.fft.fftshift(np.fft.fft2(grayscale_pixels))

        # Average the magnitudes of all frequency components.
        return np.mean(np.abs(centered_spectrum))
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return np.nan # Return NaN for images that could not be processed

print("Libraries imported and 'calculate_avg_magnitude_spectrum' function defined.")

In [None]:
# Compute the FFT feature once per image path and store it as a new column.
image_path_column = df_image_properties['path']
df_image_properties['avg_magnitude'] = image_path_column.map(calculate_avg_magnitude_spectrum)

print("DataFrame with new 'avg_magnitude' column:")
print(df_image_properties.head())

In [None]:
# Summarise the FFT feature separately for the real and fake groups.
magnitude_by_category = df_image_properties.groupby('category')['avg_magnitude']
avg_magnitude_stats = magnitude_by_category.describe()

print("\n--- Descriptive Statistics for Average Magnitude Spectrum (Real vs. Fake) ---")
print(avg_magnitude_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlaid histograms (with KDE curves) of the FFT feature for real vs. fake images.
plt.figure(figsize=(10, 6))
magnitude_ax = sns.histplot(data=df_image_properties, x='avg_magnitude', hue='category',
                            kde=True, palette={'real': 'blue', 'fake': 'orange'})
plt.title('Distribution of Average Magnitude Spectrum (Real vs. Fake)')
plt.xlabel('Average Magnitude Spectrum')
plt.ylabel('Frequency')

# Restyle the legend seaborn already built instead of calling
# legend(labels=[...]): a labels-only call pairs labels with artists purely by
# draw order, so it can silently swap "Real" and "Fake".
magnitude_legend = magnitude_ax.get_legend()
if magnitude_legend is not None:
    magnitude_legend.set_title('Category')
    for legend_text in magnitude_legend.get_texts():
        legend_text.set_text(legend_text.get_text().capitalize())

plt.grid(axis='y', linestyle='--')
plt.show()

## Visualize Sample FFTs


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

def display_magnitude_spectrum(image_path, title):
    """Draw the log-scaled FFT magnitude spectrum of an image with pyplot.

    Parameters
    ----------
    image_path : path-like
        Image file to open with PIL.
    title : str
        Title placed above the plotted spectrum.

    Returns
    -------
    None
        Draws through the pyplot interface. Any error is printed rather than
        raised.
    """
    try:
        # Read the file and keep only a grayscale pixel array.
        with Image.open(image_path) as source_image:
            grayscale_pixels = np.array(source_image.convert('L'))

        # 2D FFT with the zero-frequency component moved to the centre.
        centered_spectrum = np.fft.fftshift(np.fft.fft2(grayscale_pixels))

        # Log scale compresses the dynamic range for display; +1 avoids log(0).
        log_magnitude = np.log(np.abs(centered_spectrum) + 1)

        plt.imshow(log_magnitude, cmap='gray')
        plt.title(title)
        plt.axis('off')
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")

print("display_magnitude_spectrum function defined.")

In [None]:
# How many spectra to show per class (one row for real, one row for fake).
num_images_to_display = 3

# Reuse the first few paths of each EDA subset.
sampled_real_for_fft_display = sampled_real_images_subset[:num_images_to_display]
sampled_fake_for_fft_display = sampled_fake_images_subset[:num_images_to_display]

plt.figure(figsize=(num_images_to_display * 4, 8))
plt.suptitle('FFT Magnitude Spectrum of Sample Images (Real vs. Fake)', fontsize=16)

# Row 0 holds the real images, row 1 the fake ones; subplot positions are 1-based.
fft_rows = (
    ('Real', sampled_real_for_fft_display),
    ('Fake', sampled_fake_for_fft_display),
)
for row_index, (row_label, row_paths) in enumerate(fft_rows):
    for column_number, img_path in enumerate(row_paths, start=1):
        plt.subplot(2, num_images_to_display, row_index * num_images_to_display + column_number)
        display_magnitude_spectrum(img_path, f'{row_label} Image {column_number} FFT')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## Run the code below to zip the images and download them to your local computer

In [None]:
!zip -r real_images.zip /content/dataset/real_images
!zip -r fake_images.zip /content/dataset/fake_images

from google.colab import files
files.download("real_images.zip")
files.download("fake_images.zip")