# Data Exploration - Flickr8k Dataset

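Exploratory analysis of the Flickr8k image-captioning dataset: train/validation/test split sizes, caption-length statistics, vocabulary and word-frequency distributions, sample images, and the variation among captions written for the same image.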

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter
from wordcloud import WordCloud
import pickle

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide
# default canvas for every figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

## 1. Load Data

In [None]:
# Read the cleaned captions table from the processed data folder
data_dir = Path('../data/flickr8k')
captions_csv = data_dir / 'processed/captions_clean.csv'
df = pd.read_csv(captions_csv)

# Restore the pickled vocabulary object.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# project's own preprocessing step produced.
vocab_path = data_dir / 'processed/vocabulary.pkl'
with vocab_path.open('rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Report the size of everything that was just loaded
load_report = [
    f"Dataset loaded successfully!",
    f"  Total captions: {len(df):,}",
    f"  Unique images: {df['image'].nunique():,}",
    f"  Vocabulary size: {len(vocab):,}",
]
print('\n'.join(load_report))

## 2. Dataset Statistics

In [None]:
# Read the train / validation / test split tables
train_df = pd.read_csv(data_dir / 'processed/train.csv')
val_df = pd.read_csv(data_dir / 'processed/val.csv')
test_df = pd.read_csv(data_dir / 'processed/test.csv')

# One summary row per split, plus a final row for the full captions table
labelled_frames = [
    ('Train', train_df),
    ('Validation', val_df),
    ('Test', test_df),
    ('Total', df),
]
summary = pd.DataFrame({
    'Split': [label for label, _ in labelled_frames],
    'Images': [frame['image'].nunique() for _, frame in labelled_frames],
    'Captions': [len(frame) for _, frame in labelled_frames],
})

# Derived columns: captions per image, and each row's share of all images
# (row 3 is the 'Total' row, so it reads 100%)
summary['Captions/Image'] = summary['Captions'].div(summary['Images'])
total_images = summary.loc[3, 'Images']
summary['Percentage'] = summary['Images'].div(total_images).mul(100).round(1)

rule = "=" * 70
print("\nDataset Split Summary:")
print(rule)
print(summary.to_string(index=False))
print(rule)

In [None]:
# Side-by-side bar chart and pie chart of unique images per split
splits = ['Train', 'Val', 'Test']
image_counts = [part['image'].nunique() for part in (train_df, val_df, test_df)]
colors = ['#3498db', '#2ecc71', '#e74c3c']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: bars, each annotated with its count just above the bar top
ax1.bar(splits, image_counts, color=colors, alpha=0.7, edgecolor='black')
ax1.set(ylabel='Number of Images', title='Images per Split')
ax1.grid(axis='y', alpha=0.3)
for bar_pos, n_images in zip(range(len(image_counts)), image_counts):
    ax1.text(bar_pos, n_images + 50, str(n_images), ha='center', fontweight='bold')

# Right panel: the same counts as percentage shares
ax2.pie(image_counts, labels=splits, colors=colors, autopct='%1.1f%%', startangle=90)
ax2.set_title('Distribution of Images')

plt.tight_layout()
plt.show()

## 3. Caption Length Analysis

In [None]:
# Caption length = number of whitespace-separated tokens in each caption
df['caption_length'] = df['caption'].map(lambda text: len(text.split()))

# Summary statistics from describe(); '50%' is the median
stats = df['caption_length'].describe()
mean_len, std_len = stats['mean'], stats['std']
min_len, max_len = stats['min'], stats['max']
q1_len, median_len, q3_len = stats['25%'], stats['50%'], stats['75%']

rule = "=" * 40
print("\nCaption Length Statistics:")
print(rule)
print(f"Mean:   {mean_len:.2f} words")
print(f"Median: {median_len:.0f} words")
print(f"Std:    {std_len:.2f} words")
print(f"Min:    {min_len:.0f} words")
print(f"Max:    {max_len:.0f} words")
print(f"25%:    {q1_len:.0f} words")
print(f"75%:    {q3_len:.0f} words")
print(rule)

In [None]:
# Visualize length distribution
# Four views of caption length (whitespace-separated words per caption):
# histogram, box plot, empirical cumulative distribution, and a per-split
# comparison. Relies on the df['caption_length'] column computed earlier.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Compute the reference values once; they are used for both line and label
lengths = df['caption_length']
mean_length = lengths.mean()
median_length = lengths.median()

# Histogram with mean / median markers
axes[0, 0].hist(lengths, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 0].axvline(mean_length, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_length:.1f}')
axes[0, 0].axvline(median_length, color='green', linestyle='--', linewidth=2, label=f'Median: {median_length:.0f}')
axes[0, 0].set_xlabel('Caption Length (words)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Caption Lengths')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Box plot
axes[0, 1].boxplot(lengths, vert=False, patch_artist=True)
axes[0, 1].set_xlabel('Caption Length (words)')
axes[0, 1].set_title('Caption Length Distribution (Box Plot)')
axes[0, 1].grid(alpha=0.3)

# Cumulative distribution: fraction of captions at or below each length
sorted_lengths = np.sort(lengths)
cumulative = np.arange(1, len(sorted_lengths) + 1) / len(sorted_lengths)
axes[1, 0].plot(sorted_lengths, cumulative, linewidth=2)
axes[1, 0].axhline(0.95, color='red', linestyle='--', label='95th percentile')
axes[1, 0].set_xlabel('Caption Length (words)')
axes[1, 0].set_ylabel('Cumulative Probability')
axes[1, 0].set_title('Cumulative Distribution')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# Length by split (adds a 'caption_length' column to each split frame)
for split_df in (train_df, val_df, test_df):
    split_df['caption_length'] = split_df['caption'].apply(lambda x: len(x.split()))

data_to_plot = [train_df['caption_length'], val_df['caption_length'], test_df['caption_length']]
# Tick labels are applied to the axis after drawing instead of through
# boxplot's `labels=` keyword: Matplotlib 3.9 deprecated that keyword in
# favour of `tick_labels=`, and this form works on older and newer releases.
axes[1, 1].boxplot(data_to_plot, patch_artist=True)
axes[1, 1].set_xticklabels(['Train', 'Val', 'Test'])
axes[1, 1].set_ylabel('Caption Length (words)')
axes[1, 1].set_title('Caption Length by Split')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Vocabulary Analysis

In [None]:
# Flatten every caption into one token list, then tally occurrences per token
all_words = [token for sentence in df['caption'] for token in sentence.split()]
word_freq = Counter(all_words)

# Compare the raw token inventory with the saved vocabulary
n_tokens = len(all_words)
n_unique = len(word_freq)
n_vocab = len(vocab)

rule = "=" * 40
print(f"\nVocabulary Statistics:")
print(rule)
print(f"Total words (with repetition): {n_tokens:,}")
print(f"Unique words: {n_unique:,}")
print(f"Words in vocabulary: {n_vocab:,}")
print(f"Vocabulary coverage: {n_vocab / n_unique * 100:.2f}%")
print(rule)

In [None]:
# Horizontal bar chart of the 30 most frequent words, most frequent on top
top_words = word_freq.most_common(30)
words, counts = zip(*top_words)
bar_positions = range(len(words))

top_fig, top_ax = plt.subplots(figsize=(14, 6))
top_ax.barh(bar_positions, counts, color='steelblue', alpha=0.7, edgecolor='black')
top_ax.set_yticks(bar_positions)
top_ax.set_yticklabels(words)
top_ax.set_xlabel('Frequency')
top_ax.set_title('Top 30 Most Common Words')
# barh draws position 0 at the bottom; flip so rank 1 is at the top
top_ax.invert_yaxis()
top_ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Build one long string from all captions and render it as a word cloud
text = ' '.join(df['caption'])
wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color='white',
    colormap='viridis',
)
wordcloud.generate(text)

cloud_fig, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word Cloud of All Captions', fontsize=20, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# For every occurrence count, how many distinct words occur exactly that often
freq_counts = Counter(word_freq.values())
freq_pairs = sorted(freq_counts.items())
frequencies = [occurrences for occurrences, _ in freq_pairs]
counts = [n_words for _, n_words in freq_pairs]

dist_fig, (linear_ax, loglog_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left: linear axes, limited to the 100 smallest occurrence counts
linear_ax.plot(frequencies[:100], counts[:100], marker='o', linestyle='-', markersize=4)
linear_ax.set_xlabel('Word Frequency')
linear_ax.set_ylabel('Number of Words')
linear_ax.set_title('Word Frequency Distribution (First 100)')
linear_ax.grid(alpha=0.3)

# Right: the full range on log-log axes
loglog_ax.loglog(frequencies, counts, marker='o', linestyle='-', markersize=3)
loglog_ax.set_xlabel('Word Frequency (log scale)')
loglog_ax.set_ylabel('Number of Words (log scale)')
loglog_ax.set_title('Word Frequency Distribution (Log-Log)')
loglog_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Image Analysis

In [None]:
# Sample images with their captions
# Shows up to nine randomly chosen images in a 3x3 grid, each titled with the
# first caption listed for it in df.
import random

images_dir = data_dir / 'Images'
unique_images = list(df['image'].unique())
# random.sample raises ValueError if asked for more items than are available,
# so cap the request at the number of distinct images.
sample_images = random.sample(unique_images, min(9, len(unique_images)))

fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()

max_title_chars = 60

for idx, img_name in enumerate(sample_images):
    # Get one caption (the first row for this image)
    caption = df.loc[df['image'] == img_name, 'caption'].iloc[0]
    # Only mark the title as truncated when something was actually cut off
    if len(caption) > max_title_chars:
        title = f"{caption[:max_title_chars]}..."
    else:
        title = caption

    # Close the file handle as soon as the pixels have been handed to imshow
    with Image.open(images_dir / img_name) as img:
        axes[idx].imshow(img)
    axes[idx].axis('off')
    axes[idx].set_title(title, fontsize=9, wrap=True)

# Hide any grid cells left empty when fewer than nine images were sampled
for unused_ax in axes[len(sample_images):]:
    unused_ax.axis('off')

plt.suptitle('Random Sample Images with Captions', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 6. Caption Variation Analysis

In [None]:
# Show variation in captions for same image
# Picks one image at random, displays it, then prints every caption that df
# holds for that image. Uses `random` and `images_dir` from the sample-images
# cell, so that cell must have been run first.
sample_img = random.choice(list(df['image'].unique()))
captions = df[df['image'] == sample_img]['caption'].values

img_path = images_dir / sample_img

plt.figure(figsize=(10, 6))
# Context manager closes the image file once it has been drawn, instead of
# leaving the handle open until garbage collection
with Image.open(img_path) as img:
    plt.imshow(img)
plt.axis('off')
plt.title(f"Image: {sample_img}", fontsize=14, fontweight='bold')
plt.show()

print("\n" + "="*70)
print("CAPTION VARIATIONS FOR SAME IMAGE:")
print("="*70)
for i, caption in enumerate(captions, 1):
    print(f"{i}. {caption}")
print("="*70)

## 7. Key Insights

Summary of findings from data exploration.

In [None]:
# Gather every figure quoted in the summary once, then print the report.
# Depends on df['caption_length'], word_freq, vocab and the split frames
# produced by the earlier cells.
n_images = df['image'].nunique()
n_captions = len(df)
lengths = df['caption_length']
top_three = ', '.join(word for word, _ in word_freq.most_common(3))
train_n = train_df['image'].nunique()
val_n = val_df['image'].nunique()
test_n = test_df['image'].nunique()
rule = "="*70

print("\n" + rule)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print(rule)
print(f"\n1. Dataset Size:")
print(f"   - {n_images:,} unique images")
print(f"   - {n_captions:,} total captions")
print(f"   - Average {n_captions / n_images:.1f} captions per image")

print(f"\n2. Caption Characteristics:")
print(f"   - Average length: {lengths.mean():.1f} words")
print(f"   - Length range: {lengths.min()}-{lengths.max()} words")
print(f"   - Most captions are {lengths.mode()[0]} words long")

print(f"\n3. Vocabulary:")
print(f"   - {len(word_freq):,} unique words in dataset")
print(f"   - {len(vocab):,} words in vocabulary (after filtering)")
print(f"   - Top 3 words: {top_three}")

print(f"\n4. Data Split:")
print(f"   - Train: {train_n:,} images ({train_n / n_images * 100:.1f}%)")
print(f"   - Val: {val_n:,} images ({val_n / n_images * 100:.1f}%)")
print(f"   - Test: {test_n:,} images ({test_n / n_images * 100:.1f}%)")

print(f"\n5. Caption Diversity:")
print(f"   - Each image has 5 different human-written captions")
print(f"   - Captions show good variation in phrasing and focus")
print(f"   - Vocabulary is rich and descriptive")
print(rule)