# 📊 Data Exploration & Analysis

This notebook provides comprehensive data exploration and analysis for the multimodal pill recognition dataset.

## 🎯 Objectives
- Analyze dataset statistics and distributions
- Visualize image and text data characteristics
- Identify data quality issues
- Generate insights for model development

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import yaml
import base64
import cv2
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Make the project's src/ directory importable so `data.data_processing`
# resolves when the notebook is run from its own folder.
sys.path.append('../src')
from data.data_processing import SparkDataProcessor

# Set plotting style.
# The 'seaborn-v0_8' style sheet only exists in matplotlib >= 3.6; older
# releases ship the same sheet under the name 'seaborn'. Pick whichever one
# is installed so the cell does not fail on an older environment.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("📦 All packages imported successfully!")

## ⚙️ Configuration & Setup

In [None]:
# Read the YAML configuration used by the following cells
with open('../config/config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Echo the settings this notebook depends on
print("📋 Configuration loaded:")
print(f"- Data path: {config['data']['data_path']}")
print(f"- Image size: {config['data']['image_size']}")
print(f"- Number of classes: {config['model']['classifier']['num_classes']}")

# Try to bring up the Spark-backed processor. Any failure is reported and
# leaves `processor` as None, which later cells use as the signal to fall
# back to Spark-free dummy data.
try:
    processor = SparkDataProcessor(config)
except Exception as spark_error:
    print(f"⚠️ Could not initialize Spark: {spark_error}")
    processor = None
else:
    print("✅ Spark processor initialized")

## 🎲 Sample Data Generation

Generate sample synthetic data for exploration and testing.

In [None]:
# Number of records to produce. Defined once so the Spark branch and the
# dummy-data fallback always generate datasets of the same size.
sample_size = 1000

# `processor` is set to None by the setup cell when Spark could not be
# initialized, so test for that explicitly rather than relying on truthiness.
if processor is not None:
    # Generate sample dataset via the Spark processor
    output_path = "../data/raw/sample_data.parquet"

    print(f"🎲 Generating {sample_size} sample records...")
    sample_df = processor.create_sample_dataset(output_path, sample_size)

    # Convert to Pandas for analysis
    sample_pd = sample_df.toPandas()
    print(f"✅ Sample data generated: {len(sample_pd)} records")
else:
    # Create dummy data without Spark
    print("📝 Creating dummy data without Spark...")

    pill_classes = [f"pill_class_{i:04d}" for i in range(50)]
    manufacturers = [f"pharma_company_{i}" for i in range(1, 11)]
    shapes = ["round", "oval", "square", "capsule"]
    colors = ["white", "blue", "red", "yellow", "green", "pink", "orange"]

    # One dict per record; every attribute is drawn independently at random.
    # Splits are sampled with 70% / 15% / 15% probabilities.
    sample_data = [
        {
            "id": f"pill_{i:06d}",
            "pill_class": np.random.choice(pill_classes),
            "manufacturer": np.random.choice(manufacturers),
            "dosage": f"{np.random.randint(5, 500)}mg",
            "shape": np.random.choice(shapes),
            "color": np.random.choice(colors),
            "text_imprint": f"PILL {np.random.randint(1, 999)}",
            "split": np.random.choice(["train", "val", "test"], p=[0.7, 0.15, 0.15]),
        }
        for i in range(sample_size)
    ]

    sample_pd = pd.DataFrame(sample_data)
    print(f"✅ Dummy data created: {len(sample_pd)} records")

# Display basic info
print("\n📊 Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line.
sample_pd.info()

## 📈 Basic Statistics & Distributions

In [None]:
# Headline numbers: total row count plus the cardinality of each
# categorical column.
print("📋 Dataset Statistics:")
print(f"Total samples: {len(sample_pd):,}")
cardinality_columns = (
    ("classes", "pill_class"),
    ("manufacturers", "manufacturer"),
    ("shapes", "shape"),
    ("colors", "color"),
)
for label, column_name in cardinality_columns:
    print(f"Unique {label}: {sample_pd[column_name].nunique()}")

# Train/Val/Test split: absolute count and share of all rows
print("\n🎯 Data Split:")
split_counts = sample_pd['split'].value_counts()
for split_name, n_rows in split_counts.items():
    share = (n_rows / len(sample_pd)) * 100
    print(f"{split_name}: {n_rows:,} samples ({share:.1f}%)")

In [None]:
# Class distribution analysis: a 2x2 grid with bar charts on the top row
# and pie charts on the bottom row.
bar_cell = {"type": "bar"}
pie_cell = {"type": "pie"}
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[
        "Class Distribution",
        "Manufacturer Distribution",
        "Shape Distribution",
        "Color Distribution",
    ],
    specs=[[dict(bar_cell), dict(bar_cell)],
           [dict(pie_cell), dict(pie_cell)]],
)

# Frequency tables behind each panel (classes are limited to the top 20)
top_classes = sample_pd['pill_class'].value_counts().head(20)
mfg_counts = sample_pd['manufacturer'].value_counts()
shape_counts = sample_pd['shape'].value_counts()
color_counts = sample_pd['color'].value_counts()

# Map each grid position to its trace; insertion order is the order in
# which the traces are added to the figure.
traces_by_cell = {
    (1, 1): go.Bar(x=top_classes.index, y=top_classes.values, name="Classes"),
    (1, 2): go.Bar(x=mfg_counts.index, y=mfg_counts.values, name="Manufacturers"),
    (2, 1): go.Pie(labels=shape_counts.index, values=shape_counts.values, name="Shapes"),
    (2, 2): go.Pie(labels=color_counts.index, values=color_counts.values, name="Colors"),
}
for (grid_row, grid_col), trace in traces_by_cell.items():
    fig.add_trace(trace, row=grid_row, col=grid_col)

fig.update_layout(height=800, title_text="📊 Dataset Distribution Analysis")
fig.show()

## 📝 Text Imprint Analysis

In [None]:
# Text imprint analysis: length statistics and word frequencies
import re
from collections import Counter

imprints = sample_pd['text_imprint']

# Per-record character count and whitespace-separated word count
text_lengths = imprints.str.len()
word_counts = imprints.str.split().str.len()

print("📝 Text Imprint Statistics:")
print(f"Average character length: {text_lengths.mean():.1f}")
print(f"Average word count: {word_counts.mean():.1f}")
print(f"Max character length: {text_lengths.max()}")
print(f"Max word count: {word_counts.max()}")

# Tokenize every imprint (upper-cased so counting is case-insensitive)
# and flatten the tokens into a single list.
token_pattern = re.compile(r'\b\w+\b')
all_words = [
    token
    for imprint in imprints
    for token in token_pattern.findall(imprint.upper())
]

word_freq = Counter(all_words)
print(f"\n🔤 Vocabulary size: {len(word_freq)}")
print("\n📊 Most common words:")
for token, occurrences in word_freq.most_common(10):
    print(f"  {token}: {occurrences}")

In [None]:
# Text length distributions: side-by-side histograms of the per-record
# character lengths and word counts computed in the previous cell.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Character Length Distribution", "Word Count Distribution"]
)

# Character length histogram
fig.add_trace(
    go.Histogram(x=text_lengths, nbinsx=20, name="Character Length"),
    row=1, col=1
)

# Word count histogram
fig.add_trace(
    go.Histogram(x=word_counts, nbinsx=10, name="Word Count"),
    row=1, col=2
)

fig.update_layout(height=400, title_text="📝 Text Imprint Length Analysis")
fig.show()

# Word frequency chart for the 20 most common tokens
top_words = dict(word_freq.most_common(20))
fig_words = px.bar(
    x=list(top_words.keys()),
    y=list(top_words.values()),
    title="🔤 Top 20 Most Frequent Words in Text Imprints",
    labels={"x": "Words", "y": "Frequency"}
)
# Figure exposes `update_xaxes` (plural); `update_xaxis` does not exist and
# raised AttributeError. Tilt the tick labels so long tokens stay readable.
fig_words.update_xaxes(tickangle=45)
fig_words.show()

## 🔍 Data Quality Analysis

In [None]:
# Missing values: report the null count (and share of rows) per column
print("🔍 Missing Value Analysis:")
missing_data = sample_pd.isnull().sum()
for column, missing_count in missing_data.items():
    if missing_count <= 0:
        print(f"  {column}: No missing values ✅")
        continue
    percentage = (missing_count / len(sample_pd)) * 100
    print(f"  {column}: {missing_count} ({percentage:.1f}%)")

# Duplicates: rows that are exact copies of an earlier row
duplicates = sample_pd.duplicated().sum()
print(f"\n🔄 Duplicate records: {duplicates}")

# Class balance: value_counts() sorts descending, so the first entry is the
# most frequent class and the last entry the least frequent one.
print("\n⚖️ Class Balance Analysis:")
class_counts = sample_pd['pill_class'].value_counts()
largest_class, smallest_class = class_counts.iloc[0], class_counts.iloc[-1]
print(f"Most frequent class: {largest_class} samples")
print(f"Least frequent class: {smallest_class} samples")
print(f"Imbalance ratio: {largest_class / smallest_class:.2f}")

# Calculate Gini coefficient for class imbalance
def gini_coefficient(x):
    """Return the Gini coefficient of the non-negative values in *x*.

    The values are sorted ascending and combined with their 1-based ranks:
    ``2 * sum(rank_i * x_i) / (n * sum(x)) - (n + 1) / n``.

    Args:
        x: Sized sequence or array of non-negative numbers (e.g. per-class
            sample counts).

    Returns:
        A float that is 0.0 when all values are equal and grows towards
        ``(n - 1) / n`` as the total concentrates in a single entry.
        Empty input and input that sums to zero return 0.0, because there is
        no inequality to measure and the formula would otherwise divide by
        zero.
    """
    values = np.sort(np.asarray(x, dtype=float))
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return 0.0
    ranks = np.arange(1, n + 1)
    return float((2 * np.sum(ranks * values)) / (n * total) - (n + 1) / n)

# Summarize how unevenly the samples are spread across pill classes
per_class_counts = class_counts.values
gini = gini_coefficient(per_class_counts)
print(f"Class distribution Gini coefficient: {gini:.3f}")
print("  (0 = perfectly balanced, 1 = maximally imbalanced)")

## 🔗 Feature Correlation Analysis

In [None]:
# Pairwise association between categorical features: chi-square test of
# independence plus Cramér's V as an effect size.
from scipy.stats import chi2_contingency

categorical_features = ['manufacturer', 'shape', 'color']

# Every unordered pair of features, in the order a nested loop would visit
feature_pairs = [
    (first, second)
    for position, first in enumerate(categorical_features)
    for second in categorical_features[position + 1:]
]

print("🔗 Feature Association Analysis (Chi-square test):")
for feat1, feat2 in feature_pairs:
    # Observed co-occurrence counts of the two features
    contingency_table = pd.crosstab(sample_pd[feat1], sample_pd[feat2])

    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
    n = contingency_table.sum().sum()
    smaller_dim = min(contingency_table.shape)
    cramers_v = np.sqrt(chi2 / (n * (smaller_dim - 1)))

    # Conventional significance stars for the p-value
    if p_value < 0.001:
        significance = "***"
    elif p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    else:
        significance = "ns"

    print(f"  {feat1} vs {feat2}:")
    print(f"    Chi2: {chi2:.3f}, p-value: {p_value:.3f} {significance}")
    print(f"    Cramér's V: {cramers_v:.3f}")
    print()

In [None]:
# Visualize cross-tabulations: three co-occurrence heatmaps plus a bar chart
# of the mean imprint length per shape, laid out on a 2x2 grid.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=["Shape vs Color", "Manufacturer vs Shape", 
                   "Manufacturer vs Color", "Text Length vs Shape"]
)

# Shape vs Color heatmap
shape_color_crosstab = pd.crosstab(sample_pd['shape'], sample_pd['color'])
fig.add_trace(
    go.Heatmap(z=shape_color_crosstab.values,
               x=shape_color_crosstab.columns,
               y=shape_color_crosstab.index,
               colorscale='Viridis'),
    row=1, col=1
)

# Manufacturer vs Shape
mfg_shape_crosstab = pd.crosstab(sample_pd['manufacturer'], sample_pd['shape'])
fig.add_trace(
    go.Heatmap(z=mfg_shape_crosstab.values,
               x=mfg_shape_crosstab.columns,
               y=mfg_shape_crosstab.index,
               colorscale='Plasma'),
    row=1, col=2
)

# Manufacturer vs Color: this panel is titled in `subplot_titles` but was
# never populated, which left an empty subplot in the bottom-left cell.
mfg_color_crosstab = pd.crosstab(sample_pd['manufacturer'], sample_pd['color'])
fig.add_trace(
    go.Heatmap(z=mfg_color_crosstab.values,
               x=mfg_color_crosstab.columns,
               y=mfg_color_crosstab.index,
               colorscale='Cividis'),
    row=2, col=1
)

# Mean imprint length per shape. A SeriesGroupBy has no `.str` accessor, so
# compute the per-row lengths first and group those by the shape column.
imprint_lengths = sample_pd['text_imprint'].str.len()
text_len_by_shape = imprint_lengths.groupby(sample_pd['shape']).mean()
fig.add_trace(
    go.Bar(x=text_len_by_shape.index, y=text_len_by_shape.values),
    row=2, col=2
)

fig.update_layout(height=800, title_text="🔗 Feature Relationships Analysis")
fig.show()

## 💡 Key Insights & Recommendations

In [None]:
# Generate automated insights from the metrics computed in earlier cells
# (gini, text_lengths, word_freq, duplicates, missing_data).
insights = []

# Class balance insight: a Gini coefficient above 0.3 is treated as imbalanced
if gini > 0.3:
    insights.append("⚖️ **Class Imbalance Detected**: Consider using class weights or sampling techniques during training.")
else:
    insights.append("✅ **Balanced Classes**: Dataset shows good class distribution.")

# Text length insight: flag imprints averaging under 10 or over 50 characters
avg_text_len = text_lengths.mean()
if avg_text_len < 10:
    insights.append("📝 **Short Text Imprints**: Consider data augmentation or synthetic text generation.")
elif avg_text_len > 50:
    insights.append("📝 **Long Text Imprints**: May need to increase maximum sequence length in model.")
else:
    insights.append("✅ **Optimal Text Length**: Text imprints are within good range for processing.")

# Vocabulary insight: fewer than 100 distinct tokens counts as limited
vocab_size = len(word_freq)
if vocab_size < 100:
    insights.append("🔤 **Limited Vocabulary**: Consider expanding text data or using character-level encoding.")
else:
    insights.append(f"✅ **Rich Vocabulary**: {vocab_size} unique words provide good textual diversity.")

# Feature diversity insight (always reported)
shape_diversity = sample_pd['shape'].nunique()
color_diversity = sample_pd['color'].nunique()
insights.append(f"🎨 **Visual Diversity**: {shape_diversity} shapes and {color_diversity} colors provide good visual variation.")

# Data quality insight
if duplicates == 0 and missing_data.sum() == 0:
    insights.append("✅ **High Data Quality**: No missing values or duplicates detected.")
else:
    insights.append("⚠️ **Data Quality Issues**: Address missing values and duplicates before training.")

print("💡 Key Insights & Recommendations:")
print("=" * 50)
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")
    print()

# Save insights to file.
# The results directory may not exist yet, so create it before writing.
# The insights contain emoji, so write UTF-8 explicitly instead of relying
# on the platform default encoding, which may not be able to encode them.
os.makedirs('../results', exist_ok=True)
with open('../results/data_exploration_insights.txt', 'w', encoding='utf-8') as f:
    f.write("Data Exploration Insights\n")
    f.write("=" * 30 + "\n\n")
    f.writelines(f"- {insight}\n" for insight in insights)

print("📄 Insights saved to 'results/data_exploration_insights.txt'")

## 📋 Summary Report

In [None]:
# Generate comprehensive summary report from the metrics computed in the
# earlier cells and persist it as JSON.
import json

summary_report = {
    "dataset_overview": {
        "total_samples": len(sample_pd),
        "unique_classes": sample_pd['pill_class'].nunique(),
        "unique_manufacturers": sample_pd['manufacturer'].nunique(),
        "data_splits": sample_pd['split'].value_counts().to_dict()
    },
    "text_analysis": {
        "vocabulary_size": len(word_freq),
        "avg_text_length": float(text_lengths.mean()),
        "avg_word_count": float(word_counts.mean()),
        "top_words": dict(word_freq.most_common(10))
    },
    "data_quality": {
        # `missing_data` is the per-column null-count Series. Calling
        # `.sum()` on it collapses it to a scalar, which has no `.to_dict()`,
        # so serialize the per-column counts directly as plain ints.
        "missing_values": {column: int(count) for column, count in missing_data.items()},
        "duplicate_records": int(duplicates),
        "class_imbalance_gini": float(gini)
    },
    "feature_diversity": {
        "shapes": sample_pd['shape'].value_counts().to_dict(),
        "colors": sample_pd['color'].value_counts().to_dict(),
        "manufacturers": sample_pd['manufacturer'].value_counts().to_dict()
    }
}

# Save summary as JSON (create the results directory on first run)
os.makedirs('../results', exist_ok=True)
with open('../results/data_exploration_summary.json', 'w') as f:
    json.dump(summary_report, f, indent=2)

print("📊 Data Exploration Complete!")
print("📄 Summary report saved to 'results/data_exploration_summary.json'")
print("\n🎯 Next Steps:")
print("1. Review insights and recommendations")
print("2. Address any data quality issues")
print("3. Proceed with model training")
print("4. Monitor performance across different classes and features")