# 01 - Data Exploration

**Thesis:** Diseño y Validación de un Modelo Semántico Híbrido para Optimizar Sistemas RAG

This notebook explores the corpus, terminology, chunking strategies, and deduplication results.

In [None]:
import sys
sys.path.insert(0, '..')

import json
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Publication-quality settings.
# The seaborn theme must be applied FIRST: sns.set_theme() rewrites the
# font-related rcParams for its plotting context (axes.titlesize included),
# so calling it after rcParams.update() would discard the overrides below.
sns.set_theme(style='whitegrid', palette='Set2')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 300,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
})

# Project layout. PROJECT_ROOT is the parent of the current working directory.
PROJECT_ROOT = Path('..').resolve()
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'
CHUNKS_DIR = PROJECT_ROOT / 'data' / 'chunks'
FIGURES_DIR = PROJECT_ROOT / 'output' / 'figures'
# Figures are written by later cells, so make sure the target directory exists.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')

## 1. Corpus Statistics

In [None]:
# Load all processed documents into `docs` (raw parsed JSON) and `df`.
# Files are visited in sorted order so the row order of `df` is reproducible,
# and files that cannot be read or parsed are reported instead of being
# silently dropped.
docs = []
skipped_files = []
for json_file in sorted(PROCESSED_DIR.rglob('*.json')):
    try:
        data = json.loads(json_file.read_text(encoding='utf-8'))
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        skipped_files.append((json_file, e))
        continue
    docs.append(data)

df = pd.DataFrame(docs)
print(f'Total documents loaded: {len(df)}')
if skipped_files:
    print(f'Skipped {len(skipped_files)} unreadable file(s):')
    for skipped_path, skipped_err in skipped_files[:10]:  # cap the log output
        print(f'  {skipped_path}: {skipped_err}')
if len(df) > 0:
    print(f'Columns: {list(df.columns)}')

# Keep this as the last top-level expression: Jupyter only renders the final
# expression of a cell, so a `df.head()` nested inside an `if` shows nothing.
df.head()

In [None]:
# Documents per provider: document counts (left) and total words (right).
if len(df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    palette = sns.color_palette('Set2')

    # Count by provider. value_counts() orders providers by descending count.
    provider_counts = df['cloud_provider'].value_counts()
    provider_counts.plot(kind='bar', ax=axes[0], color=palette)
    axes[0].set_title('Documents per Provider')
    axes[0].set_xlabel('Provider')
    axes[0].set_ylabel('Count')
    axes[0].tick_params(axis='x', rotation=0)

    # Word count by provider. groupby() sorts keys alphabetically, and bar
    # colours are assigned by position, so align to the left panel's order:
    # each provider then keeps the same position and colour in both panels.
    word_totals = (
        df.groupby('cloud_provider')['word_count']
        .sum()
        .reindex(provider_counts.index)
    )
    word_totals.plot(kind='bar', ax=axes[1], color=palette)
    axes[1].set_title('Total Words per Provider')
    axes[1].set_xlabel('Provider')
    axes[1].set_ylabel('Words')
    axes[1].tick_params(axis='x', rotation=0)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'corpus_by_provider.png')
    plt.show()
else:
    print('No documents found. Run the download pipeline first.')

In [None]:
# Document size distribution (left) and document-type counts (right).
if len(df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Computed once; used for both the marker line and its legend label.
    median_words = df['word_count'].median()

    df['word_count'].hist(bins=50, ax=axes[0], color='steelblue', edgecolor='white')
    axes[0].set_title('Document Size Distribution (Words)')
    axes[0].set_xlabel('Word Count')
    axes[0].set_ylabel('Frequency')
    axes[0].axvline(median_words, color='red', linestyle='--', label=f'Median: {median_words:.0f}')
    axes[0].legend()

    # Doc type distribution
    if 'doc_type' in df.columns:
        df['doc_type'].value_counts().plot(kind='bar', ax=axes[1], color=sns.color_palette('Set2'))
        axes[1].set_title('Document Types')
        axes[1].set_xlabel('Type')
        axes[1].set_ylabel('Count')
        axes[1].tick_params(axis='x', rotation=45)
    else:
        # No doc_type column: hide the panel so the saved figure does not
        # contain an empty, unlabeled set of axes.
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'document_size_distribution.png')
    plt.show()

## 2. Terminology Analysis

In [None]:
# Analyze detected terminology: most frequent acronyms and normalized concepts.
if len(df) > 0 and 'terminology' in df.columns:
    all_siglas = []
    all_terms = []
    # Iterate the column directly; entries that are not dicts are skipped.
    for term_data in df['terminology']:
        if isinstance(term_data, dict):
            # `or []` guards against keys that are present but hold null.
            all_siglas.extend(term_data.get('detected_siglas') or [])
            all_terms.extend(term_data.get('normalized_terms') or [])

    # Plot when EITHER list has data. Gating on acronyms alone would discard
    # the normalized terms whenever no acronyms were detected.
    if all_siglas or all_terms:
        fig, axes = plt.subplots(1, 2, figsize=(16, 8))

        # (axes, items, how many to show, bar colour, panel title)
        panels = [
            (axes[0], all_siglas, 30, 'steelblue', 'Top 30 Acronyms in Corpus'),
            (axes[1], all_terms, 20, 'coral', 'Top 20 Normalized Concepts'),
        ]
        for ax, items, top_n, bar_color, title in panels:
            top_items = Counter(items).most_common(top_n)
            if not top_items:
                # Nothing to show for this panel; hide it rather than leave
                # an empty frame in the saved figure.
                ax.set_visible(False)
                continue
            names, counts = zip(*top_items)
            positions = range(len(names))
            ax.barh(positions, counts, color=bar_color)
            ax.set_yticks(positions)
            ax.set_yticklabels(names)
            ax.set_title(title)
            ax.set_xlabel('Frequency')
            ax.invert_yaxis()  # most frequent item at the top

        plt.tight_layout()
        plt.savefig(FIGURES_DIR / 'terminology_analysis.png')
        plt.show()
    else:
        print('No terminology data found. Run terminology normalizer first.')
else:
    print('No terminology data available.')

In [None]:
# Cross-provider term heatmap: cell (i, j) is the number of distinct
# normalized terms that providers i and j have in common (the diagonal is a
# provider's own distinct-term count).
if len(df) > 0 and 'terminology' in df.columns:
    provider_terms = defaultdict(set)
    for _, doc in df.iterrows():
        prov = doc.get('cloud_provider', '')
        term_data = doc.get('terminology', {})
        # Skip documents whose provider is missing (NaN/None): a non-string
        # key would break both sorted() and .upper() below.
        if not isinstance(prov, str) or not isinstance(term_data, dict):
            continue
        terms = term_data.get('normalized_terms') or []
        if terms:
            provider_terms[prov].update(terms)

    if provider_terms:
        providers = sorted(provider_terms.keys())
        # Compute shared term counts
        shared_counts = [
            [len(provider_terms[p1] & provider_terms[p2]) for p2 in providers]
            for p1 in providers
        ]
        labels = [p.upper() for p in providers]

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            shared_counts, annot=True, fmt='d',
            xticklabels=labels,
            yticklabels=labels,
            cmap='YlOrRd', ax=ax
        )
        ax.set_title('Shared Normalized Terms Between Providers')
        plt.tight_layout()
        plt.savefig(FIGURES_DIR / 'cross_provider_terms_heatmap.png')
        plt.show()

## 3. Chunking Analysis

In [None]:
# Load chunks from all strategies and summarise them per (strategy, size).
# Layout walked below: CHUNKS_DIR/<strategy>/<size dir>/*.json, where the size
# directory name is an integer, optionally prefixed with 'size_'.
chunk_data = []
strategy_dirs = sorted(CHUNKS_DIR.iterdir()) if CHUNKS_DIR.exists() else []
for strategy_dir in strategy_dirs:
    if not strategy_dir.is_dir():
        continue
    strategy = strategy_dir.name
    for size_dir in sorted(strategy_dir.iterdir()):
        if not size_dir.is_dir():
            continue
        try:
            size = int(size_dir.name.replace('size_', ''))
        except ValueError:
            # A stray directory must not abort the whole scan.
            print(f'Skipping {size_dir}: name does not encode an integer chunk size')
            continue

        token_counts = []
        code_chunks = 0
        table_chunks = 0
        unreadable = 0
        for jf in size_dir.glob('*.json'):
            try:
                data = json.loads(jf.read_text(encoding='utf-8'))
                tokens = data.get('token_count', 0)
                has_code = bool(data.get('has_code'))
                has_table = bool(data.get('has_table'))
            except (OSError, ValueError, AttributeError):
                # ValueError: invalid JSON or encoding.
                # AttributeError: valid JSON that is not an object (no .get).
                unreadable += 1
                continue
            # Counters are only touched after the whole file was processed,
            # so the chunk count always matches len(token_counts).
            token_counts.append(tokens)
            code_chunks += int(has_code)
            table_chunks += int(has_table)

        if unreadable:
            print(f'{strategy}/{size_dir.name}: skipped {unreadable} unreadable chunk file(s)')

        count = len(token_counts)
        if count > 0:
            chunk_data.append({
                'strategy': strategy,
                'size': size,
                'num_chunks': count,
                'avg_tokens': np.mean(token_counts),
                'std_tokens': np.std(token_counts),
                'min_tokens': min(token_counts),
                'max_tokens': max(token_counts),
                'pct_code': code_chunks / count * 100,
                'pct_table': table_chunks / count * 100,
            })

if chunk_data:
    chunk_df = pd.DataFrame(chunk_data)
    print(chunk_df.to_string(index=False))
else:
    print('No chunks found. Run chunking first.')

In [None]:
# Chunk count comparison: grouped bars per strategy, one bar per chunk size.
if chunk_data:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # strategy x size tables for the two metrics being compared
    pivot = chunk_df.pivot(index='strategy', columns='size', values='num_chunks')
    pivot_avg = chunk_df.pivot(index='strategy', columns='size', values='avg_tokens')

    # (table, panel title, y-axis label) -- left panel first
    panel_specs = (
        (pivot, 'Number of Chunks by Strategy and Size', 'Number of Chunks'),
        (pivot_avg, 'Average Token Count per Chunk', 'Avg Tokens'),
    )
    for ax, (table, title, ylabel) in zip(axes, panel_specs):
        table.plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Strategy')
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)
        ax.legend(title='Chunk Size')

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'chunking_comparison.png')
    plt.show()

In [None]:
# Token distribution per strategy (box plot) for a single target chunk size.
if CHUNKS_DIR.exists():
    all_chunk_tokens = []
    target_size = '500'            # inspects <strategy>/size_<target_size>
    max_files_per_strategy = 500   # sample cap, for speed
    skipped = 0

    for strategy_dir in sorted(CHUNKS_DIR.iterdir()):
        if not strategy_dir.is_dir():
            continue
        size_dir = strategy_dir / f'size_{target_size}'
        if not size_dir.exists():
            continue
        # glob() yields files in filesystem order; sort before slicing so the
        # sample, and therefore the figure, is the same on every run.
        sample_files = sorted(size_dir.glob('*.json'))[:max_files_per_strategy]
        for jf in sample_files:
            try:
                data = json.loads(jf.read_text(encoding='utf-8'))
                tokens = data.get('token_count', 0)
            except (OSError, ValueError, AttributeError):
                # Unreadable, invalid JSON, or JSON that is not an object.
                skipped += 1
                continue
            all_chunk_tokens.append({
                'strategy': strategy_dir.name,
                'tokens': tokens,
            })

    if skipped:
        print(f'Skipped {skipped} unreadable chunk file(s)')

    if all_chunk_tokens:
        token_df = pd.DataFrame(all_chunk_tokens)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=token_df, x='strategy', y='tokens', ax=ax)
        # Reference line at the configured target size
        ax.axhline(y=int(target_size), color='red', linestyle='--', alpha=0.7, label=f'Target: {target_size}')
        ax.set_title(f'Token Distribution per Chunk (Target Size: {target_size})')
        ax.set_xlabel('Strategy')
        ax.set_ylabel('Token Count')
        ax.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(FIGURES_DIR / 'chunk_token_distribution.png')
        plt.show()

## 4. Deduplication Analysis

In [None]:
# Load dedup stats and print the headline numbers.
dedup_path = PROJECT_ROOT / 'data' / 'dedup_stats.json'
if not dedup_path.exists():
    print('No deduplication stats found. Run deduplicator first.')
else:
    dedup_stats = json.loads(dedup_path.read_text(encoding='utf-8'))

    print('=== Deduplication Results ===')

    # Top-level counters fall back to "N/A" when absent.
    docs_in = dedup_stats.get("input_documents", "N/A")
    print(f'Input documents:  {docs_in}')
    docs_out = dedup_stats.get("output_documents", "N/A")
    print(f'Output documents: {docs_out}')

    # Per-stage counters live in nested dicts and fall back to 0.
    paragraphs_removed = dedup_stats.get("intra_document", {}).get("paragraphs_removed", 0)
    print(f'\nIntra-document paragraphs removed: {paragraphs_removed}')
    provider_duplicates = dedup_stats.get("intra_provider", {}).get("duplicates_found", 0)
    print(f'Intra-provider duplicates: {provider_duplicates}')
    equivalents_marked = dedup_stats.get("cross_provider", {}).get("equivalents_marked", 0)
    print(f'Cross-provider equivalent groups: {equivalents_marked}')

In [None]:
# Summary table: one row per provider, printed and exported as CSV.
if len(df) > 0:
    # Named aggregation ties every output column to its source column and
    # function, instead of renaming the aggregated columns by position.
    summary = df.groupby('cloud_provider').agg(**{
        'Documents': ('doc_id', 'count'),
        'Total Words': ('word_count', 'sum'),
        'Avg Words': ('word_count', 'mean'),
        'Median Words': ('word_count', 'median'),
        'With Code': ('has_code', 'sum'),
        'With Tables': ('has_tables', 'sum'),
    }).round(0)
    print('\n=== Corpus Summary ===')
    print(summary.to_string())

    # Save as CSV for thesis. The tables directory is not created anywhere
    # else in this notebook, so create it here; otherwise to_csv() fails on
    # a fresh checkout.
    tables_dir = PROJECT_ROOT / 'output' / 'tables'
    tables_dir.mkdir(parents=True, exist_ok=True)
    summary.to_csv(tables_dir / 'corpus_summary.csv')
    print('\nSaved to output/tables/corpus_summary.csv')