In [None]:
# notebooks/01_data_exploration.ipynb
# This is a Python script that represents the notebook cells

"""
# Cell 1: Setup and Imports
"""
import pandas as pd
import numpy as np
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import sys

# Add project root to path.
# Guarded so that re-running this cell does not keep appending duplicate
# '..' entries to sys.path. A relative entry is resolved against the
# kernel's current working directory -- presumably notebooks/, per the
# header comment of this file; verify if the kernel is started elsewhere.
if '..' not in sys.path:
    sys.path.append('..')

# Set style: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [None]:
"""
# Cell 2: Load Preprocessed Data
"""
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file being read. Only load artifacts produced by this project's own
# preprocessing step; never point these paths at untrusted files.
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load vocabulary
vocabulary = _load_pickle('../data/processed/vocabulary.pkl')

# Load train corpus (the pickled object is indexed by 'sequences' and 'ids')
train_data = _load_pickle('../data/processed/train_corpus.pkl')
train_sequences = train_data['sequences']
train_ids = train_data['ids']

# Load validation corpus
val_data = _load_pickle('../data/processed/val_corpus.pkl')
val_sequences = val_data['sequences']
val_ids = val_data['ids']

# Load test corpus
test_data = _load_pickle('../data/processed/test_corpus.pkl')
test_sequences = test_data['sequences']
test_ids = test_data['ids']

# Load statistics.
# The encoding is pinned to UTF-8: without it open() falls back to the
# platform locale encoding, which can mis-decode non-ASCII content.
with open('../data/statistics/corpus_stats.json', 'r', encoding='utf-8') as f:
    stats = json.load(f)

# Quick sanity summary of what was loaded
print(f"Vocabulary size: {len(vocabulary)}")
print(f"Train sequences: {len(train_sequences)}")
print(f"Validation sequences: {len(val_sequences)}")
print(f"Test sequences: {len(test_sequences)}")


In [None]:
"""
# Cell 3: Corpus Statistics Overview
"""
# Banner around the report title
rule = "=" * 60
print(rule)
print("CORPUS STATISTICS")
print(rule)

# --- Number of sequences per split, plus the grand total ---
n_train = len(train_sequences)
n_val = len(val_sequences)
n_test = len(test_sequences)

print("\nSequence Counts:")
for prefix, count in (
    ("  Training:   ", n_train),
    ("  Validation: ", n_val),
    ("  Test:       ", n_test),
    ("  Total:      ", n_train + n_val + n_test),
):
    print(f"{prefix}{count:,}")

# --- Token-level figures read from the loaded statistics dict ---
print("\nToken Statistics:")
for prefix, key in (
    ("  Total tokens: ", 'total_tokens'),
    ("  Unique tokens: ", 'unique_tokens'),
    ("  Vocabulary size: ", 'vocabulary_size'),
):
    print(f"{prefix}{stats[key]:,}")
# Reported as the difference between unique tokens and vocabulary size
print(f"  OOV tokens: {stats['unique_tokens'] - stats['vocabulary_size']:,}")

# --- Sequence-length summary; each row carries its own format spec ---
# An empty spec makes format() behave like a plain f-string placeholder.
print("\nSequence Length Statistics:")
for prefix, key, spec in (
    ("  Average: ", 'avg_sequence_length', ".2f"),
    ("  Median: ", 'median_sequence_length', ".0f"),
    ("  Min: ", 'min_sequence_length', ""),
    ("  Max: ", 'max_sequence_length', ""),
    ("  Std: ", 'std_sequence_length', ".2f"),
):
    print(prefix + format(stats[key], spec))


In [None]:
"""
# Cell 4: Visualize Sequence Length Distribution
"""
# Per-split sequence lengths, computed once and reused by both panels.
# Insertion order (Train, Val, Test) fixes the left-to-right box order.
data_for_box = {
    'Train': [len(seq) for seq in train_sequences],
    'Val': [len(seq) for seq in val_sequences],
    'Test': [len(seq) for seq in test_sequences]
}

# Collect all sequence lengths (train, then val, then test) without
# building a temporary concatenation of the sequence lists themselves.
all_lengths = [n for lengths in data_for_box.values() for n in lengths]

# Summary values are computed once; each is used for a line and its label.
mean_length = np.mean(all_lengths)
median_length = np.median(all_lengths)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram over all splits combined, with mean/median reference lines
axes[0].hist(all_lengths, bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
axes[0].axvline(median_length, color='green', linestyle='--', label=f'Median: {median_length:.1f}')
axes[0].set_xlabel('Sequence Length')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Sequence Lengths')
axes[0].legend()

# Box plot, one box per split.
# Tick labels are set separately rather than through boxplot(labels=...):
# that keyword is deprecated since Matplotlib 3.9 (renamed tick_labels),
# whereas set_xticklabels works regardless of the installed version.
axes[1].boxplot(list(data_for_box.values()))
axes[1].set_xticklabels(list(data_for_box.keys()))
axes[1].set_ylabel('Sequence Length')
axes[1].set_title('Sequence Length by Dataset Split')

plt.tight_layout()
plt.show()

In [None]:
"""
# Cell 5: Token Type Distribution
"""
# Pie chart of the token-type distribution stored in the statistics dict
token_types = stats['token_type_distribution']

# Fixed palette of five colours for the wedges
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#ff99cc']
fig, ax = plt.subplots(figsize=(10, 8))

pie_options = dict(
    labels=token_types.keys(),
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
wedges, texts, autotexts = ax.pie(token_types.values(), **pie_options)

# Category labels: slightly larger font
for text in texts:
    text.set(fontsize=12)

# Percentage labels drawn on the wedges: white, bold, a bit smaller
for autotext in autotexts:
    autotext.set(color='white', fontsize=10, weight='bold')

title_style = {'fontsize': 14, 'fontweight': 'bold'}
ax.set_title('Token Type Distribution in Corpus', **title_style)
plt.show()

In [None]:
"""
# Cell 6: Most Common Tokens Analysis
"""
print("=" * 60)
print("TOP 30 MOST COMMON TOKENS")
print("=" * 60)

# Get more common tokens for analysis
all_tokens = []
for seq in train_sequences:
    all_tokens.extend(seq)

token_counter = Counter(all_tokens)
most_common_30 = token_counter.most_common(30)

# Create bar plot
fig, ax = plt.subplots(figsize=(15, 6))
tokens = [t[0] for t in most_common_30]
counts = [t[1] for t in most_common_30]

bars = ax.bar(range(len(tokens)), counts)
ax.set_xticks(range(len(tokens)))
ax.set_xticklabels(tokens, rotation=45, ha='right')
ax.set_xlabel('Token')
ax.set_ylabel('Frequency')
ax.set_title('30 Most Common Tokens in Training Data')

# Color code by token type
for i, (token, _) in enumerate(most_common_30):
    if token in ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<STRING>', '<NUM>