In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from collections import Counter
import seaborn as sns

In [None]:
class DatasetAnalyzer:
    """Collect file-level statistics for image, text, audio and video datasets.

    Every dataset is addressed by a path relative to ``data_root``. The
    ``analyze_*`` methods return a plain ``dict`` of JSON-serialisable values,
    or ``None`` (after printing a message) when the dataset path is missing.
    """

    # File suffixes (lower-case, with leading dot) recognised per modality.
    IMAGE_EXTENSIONS = frozenset({'.jpg', '.jpeg', '.png', '.bmp', '.tiff'})
    AUDIO_EXTENSIONS = frozenset({'.wav', '.mp3', '.flac', '.m4a'})
    VIDEO_EXTENSIONS = frozenset({'.mp4', '.avi', '.mov', '.mkv'})

    # Number of classes drawn per image dataset in the class-distribution chart.
    TOP_CLASSES = 10

    def __init__(self, data_root="./datasets"):
        """Remember the directory under which all datasets are looked up.

        Args:
            data_root: Directory containing one sub-directory per dataset.
        """
        self.data_root = Path(data_root)
        # Kept for backward compatibility; no method of this class fills it.
        self.dataset_info = {}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _resolve_dataset_dir(self, dataset_path):
        """Return ``data_root / dataset_path``, or ``None`` (with a message) if absent."""
        path = self.data_root / dataset_path
        if not path.exists():
            print(f"Dataset path {path} does not exist")
            return None
        return path

    @staticmethod
    def _collect_files(path, extensions):
        """Recursively list files under ``path`` whose suffix is in ``extensions``.

        Suffix matching is case-insensitive. Paths are returned as strings in
        a deterministic (sorted) traversal order so that samples taken from the
        front of the list are reproducible between runs.
        """
        matches = []
        for root, dirs, files in os.walk(path):
            # Sorting ``dirs`` in place makes os.walk descend in sorted order.
            dirs.sort()
            for file_name in sorted(files):
                if Path(file_name).suffix.lower() in extensions:
                    matches.append(os.path.join(root, file_name))
        return matches

    @staticmethod
    def _count_lines(content):
        """Count lines in ``content`` without over-counting a trailing newline."""
        if not content:
            return 0
        return content.count('\n') + (0 if content.endswith('\n') else 1)

    def _analyze_media_files(self, dataset_path, extensions, total_key):
        """Shared implementation of the audio and video analyzers."""
        path = self._resolve_dataset_dir(dataset_path)
        if path is None:
            return None

        media_files = self._collect_files(path, extensions)
        return {
            total_key: len(media_files),
            'sample_files': media_files[:5],
            # Lower-cased and sorted so '.WAV' and '.wav' are reported once
            # and the list order is stable.
            'file_extensions': sorted({Path(f).suffix.lower() for f in media_files}),
        }

    # ------------------------------------------------------------------
    # Per-modality analysis
    # ------------------------------------------------------------------
    def analyze_image_dataset(self, dataset_path):
        """Analyze image dataset structure and statistics.

        The class of an image is taken to be the name of the directory that
        directly contains it.

        Args:
            dataset_path: Dataset location relative to ``data_root``.

        Returns:
            ``None`` if the path does not exist, otherwise a dict with keys
            ``total_images``, ``num_classes``, ``class_distribution``
            (class name -> image count) and ``sample_files`` (up to 5 paths).
        """
        path = self._resolve_dataset_dir(dataset_path)
        if path is None:
            return None

        image_files = self._collect_files(path, self.IMAGE_EXTENSIONS)
        class_counts = Counter(Path(f).parent.name for f in image_files)

        return {
            'total_images': len(image_files),
            'num_classes': len(class_counts),
            'class_distribution': dict(class_counts),
            'sample_files': image_files[:5],
        }

    def analyze_text_dataset(self, dataset_path, sample_size=10):
        """Analyze text corpus statistics.

        Only ``*.txt`` and ``*.json`` files directly inside the dataset
        directory are considered (no recursion). Averages are computed over
        the sampled files that could actually be read as UTF-8.

        Args:
            dataset_path: Dataset location relative to ``data_root``.
            sample_size: Maximum number of files to read for the averages;
                ``None`` reads every file.

        Returns:
            ``None`` if the path does not exist, otherwise a dict with keys
            ``total_files``, ``avg_chars_per_file``, ``avg_words_per_file``,
            ``avg_lines_per_file`` and ``sample_files`` (up to 5 paths).
        """
        path = self._resolve_dataset_dir(dataset_path)
        if path is None:
            return None

        # glob order is arbitrary; sort so the sample is reproducible.
        text_files = sorted(path.glob("*.txt")) + sorted(path.glob("*.json"))
        if sample_size is None:
            sampled = text_files
        else:
            sampled = text_files[:max(sample_size, 0)]

        total_chars = 0
        total_words = 0
        total_lines = 0
        files_read = 0

        for file_path in sampled:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {file_path}: {e}")
                continue
            files_read += 1
            total_chars += len(content)
            total_words += len(content.split())
            total_lines += self._count_lines(content)

        # Unreadable files must not dilute the averages; avoid dividing by 0.
        divisor = max(files_read, 1)

        return {
            'total_files': len(text_files),
            'avg_chars_per_file': total_chars / divisor,
            'avg_words_per_file': total_words / divisor,
            'avg_lines_per_file': total_lines / divisor,
            'sample_files': [str(f) for f in text_files[:5]],
        }

    def analyze_audio_dataset(self, dataset_path):
        """Analyze audio dataset statistics.

        Only file names are inspected; audio content is not decoded.

        Args:
            dataset_path: Dataset location relative to ``data_root``.

        Returns:
            ``None`` if the path does not exist, otherwise a dict with keys
            ``total_audio_files``, ``sample_files`` (up to 5 paths) and
            ``file_extensions`` (sorted, lower-case).
        """
        return self._analyze_media_files(
            dataset_path, self.AUDIO_EXTENSIONS, 'total_audio_files')

    def analyze_video_dataset(self, dataset_path):
        """Analyze video dataset statistics.

        Only file names are inspected; video content is not decoded.

        Args:
            dataset_path: Dataset location relative to ``data_root``.

        Returns:
            ``None`` if the path does not exist, otherwise a dict with keys
            ``total_video_files``, ``sample_files`` (up to 5 paths) and
            ``file_extensions`` (sorted, lower-case).
        """
        return self._analyze_media_files(
            dataset_path, self.VIDEO_EXTENSIONS, 'total_video_files')

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def generate_overview_report(self, datasets=None):
        """Generate comprehensive dataset overview.

        Args:
            datasets: Optional mapping of modality (``'images'``, ``'text'``,
                ``'audio'`` or ``'video'``) to a list of dataset paths.
                Defaults to the built-in list of well-known dataset names.

        Returns:
            ``{'dataset_overview': {modality: {dataset: stats}}}``. Datasets
            whose path does not exist are omitted; every requested modality
            is present even when none of its datasets were found.

        Raises:
            ValueError: If ``datasets`` contains an unknown modality.
        """
        if datasets is None:
            datasets = {
                'images': ['imagenet', 'coco', 'flickr30k'],
                'text': ['wikipedia', 'books', 'news'],
                'audio': ['librispeech', 'commonvoice', 'musiccaps'],
                'video': ['kinetics', 'activitynet', 'msr_vtt'],
            }

        analyzers = {
            'images': self.analyze_image_dataset,
            'text': self.analyze_text_dataset,
            'audio': self.analyze_audio_dataset,
            'video': self.analyze_video_dataset,
        }

        report = {'dataset_overview': {}}

        for modality, dataset_list in datasets.items():
            analyze = analyzers.get(modality)
            if analyze is None:
                raise ValueError(f"Unknown modality: {modality!r}")

            per_dataset = report['dataset_overview'][modality] = {}
            for dataset in dataset_list:
                stats = analyze(dataset)
                if stats:
                    per_dataset[dataset] = stats

        return report

    @classmethod
    def _plot_class_distribution(cls, ax, image_stats):
        """Draw the largest classes of each image dataset as grouped bars."""
        top_n = cls.TOP_CLASSES
        series = [
            (dataset, Counter(stats['class_distribution']).most_common(top_n))
            for dataset, stats in image_stats.items()
            if stats and 'class_distribution' in stats
        ]

        # Split the default 0.8-wide slot between datasets so that bars of
        # different datasets sit side by side instead of hiding each other.
        bar_width = 0.8 / max(len(series), 1)
        for index, (dataset, top_classes) in enumerate(series):
            counts = [count for _, count in top_classes]
            positions = [rank + index * bar_width for rank in range(len(counts))]
            ax.bar(positions, counts, width=bar_width, alpha=0.7, label=dataset)

        ax.set_title(f'Image Dataset Class Distribution (Top {top_n})')
        ax.set_xlabel('Classes')
        ax.set_ylabel('Count')
        if series:
            # legend() without any labelled artist only produces a warning.
            ax.legend()

    @staticmethod
    def _plot_file_counts(ax, modality_stats, count_key, color, title, ylabel):
        """Draw one bar per dataset showing ``stats[count_key]``."""
        names = [name for name, stats in modality_stats.items() if stats]
        counts = [modality_stats[name].get(count_key, 0) for name in names]
        ax.bar(names, counts, color=color, alpha=0.7)
        ax.set_title(title)
        ax.set_ylabel(ylabel)

    def visualize_dataset_stats(self, report):
        """Create visualizations of dataset statistics.

        Draws a 2x2 grid (image class distribution, text / audio / video file
        counts) and shows it with ``plt.show()``. A panel is left empty when
        its modality is absent from the report.

        Args:
            report: A report as returned by ``generate_overview_report``.
        """
        overview = report.get('dataset_overview', {})

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Dataset Overview Statistics', fontsize=16)

        if 'images' in overview:
            self._plot_class_distribution(axes[0, 0], overview['images'])

        if 'text' in overview:
            self._plot_file_counts(
                axes[0, 1], overview['text'], 'total_files', 'green',
                'Text Dataset File Counts', 'Number of Files')

        if 'audio' in overview:
            self._plot_file_counts(
                axes[1, 0], overview['audio'], 'total_audio_files', 'orange',
                'Audio Dataset File Counts', 'Number of Audio Files')

        if 'video' in overview:
            self._plot_file_counts(
                axes[1, 1], overview['video'], 'total_video_files', 'red',
                'Video Dataset File Counts', 'Number of Video Files')

        plt.tight_layout()
        plt.show()

In [None]:
def main():
    """Build the dataset overview, store it as JSON, echo it and plot it.

    The report is written to ``dataset_overview_report.json`` in the current
    working directory, printed to stdout, and then passed to the analyzer's
    plotting routine.
    """
    dataset_analyzer = DatasetAnalyzer()

    print("Generating Dataset Overview Report...")
    overview = dataset_analyzer.generate_overview_report()

    # Persist the report; the same rendered text is echoed below.
    with open('dataset_overview_report.json', 'w') as report_file:
        rendered = json.dumps(overview, indent=2)
        report_file.write(rendered)

    print("Dataset Overview Report:")
    print(rendered)

    # Charts come last so the textual output is visible before the figure.
    dataset_analyzer.visualize_dataset_stats(overview)

    print("\n✓ Dataset overview analysis complete!")

# Run the analysis when executed as a script or notebook cell, but not when
# this file is imported as a module.
if __name__ == "__main__":
    main()