In [None]:
import json
from pathlib import Path
from collections import Counter, defaultdict
import statistics

In [None]:
def _print_section(title, leading_blank=True):
    """Print ``title`` framed by two 60-character rules.

    Args:
        title: Heading text printed between the rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    rule = "=" * 60
    print(("\n" + rule) if leading_blank else rule)
    print(title)
    print(rule)


def analyze_jsonl(file_path, sample_size=1000):
    """
    Analyze .jsonl file structure and content and print a report.

    The first ``sample_size`` lines of the file are read in a single pass.
    Blank lines are ignored; malformed lines and lines whose JSON value is
    not an object are reported with a warning and skipped. All statistics
    and percentages are computed over the documents that parsed successfully.

    Args:
        file_path: Path to .jsonl file (UTF-8 encoded)
        sample_size: Maximum number of lines to read from the start of the
            file (use fewer for faster analysis)

    Returns:
        dict with the keys:
            'text_field': name of the string field with the highest mean
                length among fields that had values longer than 100
                characters, or None when no such field exists
            'all_fields': every key seen, in first-seen order
            'metadata_fields': the well-known metadata keys that were present
            'avg_doc_size': mean UTF-8 size of a document line in bytes
                (0 when nothing was parsed)
            'num_docs_sampled': number of documents successfully parsed
    """

    # Storage for analysis
    all_keys = Counter()
    key_types = defaultdict(Counter)
    text_field_candidates = defaultdict(list)
    # Per key: how many documents carry a string with non-whitespace content.
    # Collected here so the empty-text check needs no second read of the file.
    nonblank_text = Counter()
    doc_sizes = []
    sample_docs = []
    skipped_lines = 0

    print(f"Analyzing {file_path}...")
    print("=" * 60)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if line_no > sample_size:
                break

            # Blank lines (e.g. a trailing newline) are not documents.
            if not line.strip():
                continue

            try:
                doc = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Malformed JSON at line {line_no}: {e}")
                skipped_lines += 1
                continue

            # Valid JSON that is not an object has no fields to analyze.
            if not isinstance(doc, dict):
                print(f"Warning: Expected a JSON object at line {line_no}, "
                      f"got {type(doc).__name__}")
                skipped_lines += 1
                continue

            # Keep the first few docs for manual inspection
            if len(sample_docs) < 3:
                sample_docs.append(doc)

            # Track document size in bytes, matching the units in the report
            doc_sizes.append(len(line.encode('utf-8')))

            # Analyze structure
            for key, value in doc.items():
                all_keys[key] += 1
                key_types[key][type(value).__name__] += 1

                if isinstance(value, str):
                    if value and not value.isspace():
                        nonblank_text[key] += 1
                    # Identify potential text fields
                    if len(value) > 100:
                        text_field_candidates[key].append(len(value))

    num_docs_analyzed = len(doc_sizes)

    # ===== REPORT RESULTS =====

    print(f"\n📊 ANALYZED {num_docs_analyzed} DOCUMENTS\n")
    if skipped_lines:
        print(f"(skipped {skipped_lines} unusable lines)\n")

    # 1. Show sample documents
    _print_section("1. SAMPLE DOCUMENTS (first 3)", leading_blank=False)
    for idx, doc in enumerate(sample_docs, 1):
        print(f"\n--- Document {idx} ---")
        rendered = json.dumps(doc, indent=2, ensure_ascii=False)
        print(rendered[:500])
        # Decide on the marker from the same text that was just printed.
        if len(rendered) > 500:
            print("... (truncated)")

    # 2. All fields discovered
    _print_section("2. ALL FIELDS DISCOVERED")
    print(f"{'Field Name':<30} {'Frequency':<15} {'Type(s)'}")
    print("-" * 60)
    for key, count in all_keys.most_common():
        types = ', '.join(f"{t}({c})" for t, c in key_types[key].most_common())
        percentage = (count / num_docs_analyzed) * 100
        print(f"{key:<30} {count:>6} ({percentage:>5.1f}%)  {types}")

    # 3. Identify text field
    _print_section("3. TEXT FIELD CANDIDATES")
    best_text_field = None
    if text_field_candidates:
        print(f"{'Field Name':<20} {'Avg Length':<15} {'Min':<10} {'Max':<10}")
        print("-" * 60)
        # Sorting is stable, so on a tie the first-seen field stays first.
        field_stats = sorted(
            ((field, statistics.mean(lengths), min(lengths), max(lengths))
             for field, lengths in text_field_candidates.items()),
            key=lambda stats: stats[1],
            reverse=True,
        )
        for field, avg_len, min_len, max_len in field_stats:
            print(f"{field:<20} {avg_len:>10.0f} chars  {min_len:>8}  {max_len:>10}")

        best_text_field = field_stats[0][0]
        print(f"\n🎯 RECOMMENDED TEXT FIELD: '{best_text_field}'")
    else:
        print("⚠️  No string fields longer than 100 characters found!")

    # 4. Document size analysis
    _print_section("4. DOCUMENT SIZE ANALYSIS")
    avg_size = statistics.mean(doc_sizes) if doc_sizes else 0
    if doc_sizes:
        median_size = statistics.median(doc_sizes)
        min_size = min(doc_sizes)
        max_size = max(doc_sizes)

        print(f"Average document size: {avg_size:,.0f} bytes ({avg_size/1024:.1f} KB)")
        print(f"Median document size:  {median_size:,.0f} bytes ({median_size/1024:.1f} KB)")
        print(f"Min document size:     {min_size:,.0f} bytes")
        print(f"Max document size:     {max_size:,.0f} bytes ({max_size/1024:.1f} KB)")

        # Batching recommendation
        print("\n📦 BATCHING RECOMMENDATIONS:")
        if avg_size < 10_000:  # < 10KB
            print("   - Small documents: Use batch_size=1000-5000")
        elif avg_size < 100_000:  # < 100KB
            print("   - Medium documents: Use batch_size=100-500")
        else:
            print("   - Large documents: Use batch_size=10-50")

    # 5. Metadata summary
    _print_section("5. METADATA FIELDS")

    common_metadata_fields = ['source', 'url', 'domain', 'timestamp', 'date',
                              'id', 'author', 'title', 'lang', 'language']
    found_metadata = [field for field in common_metadata_fields if field in all_keys]

    if found_metadata:
        print("Found metadata fields:")
        for field in found_metadata:
            percentage = (all_keys[field] / num_docs_analyzed) * 100
            print(f"  - {field}: present in {percentage:.1f}% of documents")
    else:
        print("No standard metadata fields found.")
        print("\nAll fields could be metadata:")
        for key in all_keys:
            if key not in text_field_candidates:
                print(f"  - {key}")

    # 6. Data quality checks
    _print_section("6. DATA QUALITY CHECKS")

    if best_text_field is None:
        print("No text field identified; skipping empty-text check.")
    else:
        # A document counts as empty when the field is missing, null,
        # not a string, or contains only whitespace.
        empty_text_count = num_docs_analyzed - nonblank_text[best_text_field]
        if empty_text_count > 0:
            print(f"⚠️  {empty_text_count} documents have empty text fields ({empty_text_count/num_docs_analyzed*100:.1f}%)")
        else:
            print("✅ All sampled documents have non-empty text")

    return {
        'text_field': best_text_field,
        'all_fields': list(all_keys.keys()),
        'metadata_fields': found_metadata,
        'avg_doc_size': avg_size,
        'num_docs_sampled': num_docs_analyzed
    }

In [None]:
# Dataset location and sampling cap. Resolved against the current user's home
# directory so the cell is not tied to one machine's account name; edit these
# rather than the call below.
DATA_PATH = Path.home() / "Downloads" / "mainpipe_data_v1.jsonl"
SAMPLE_SIZE = 300_000

results = analyze_jsonl(DATA_PATH, sample_size=SAMPLE_SIZE)

# Print summary
print("\n" + "=" * 60)
print("📋 SUMMARY")
print("=" * 60)
print(f"Text field to use: {results['text_field']}")
# An empty list would otherwise leave a blank after the label.
print(f"Metadata fields: {', '.join(results['metadata_fields']) or '(none)'}")
print(f"Average document size: {results['avg_doc_size']/1024:.1f} KB")