# EHDSLens Data Exploration

This notebook provides a deep dive into the EHDS systematic review study database, demonstrating how to explore, filter, and analyze the 52 included studies.

In [None]:
from ehdslens import EHDSAnalyzer, StudyDatabase, Study
from ehdslens.data import ThematicAxis, QualityRating, StudyType, load_ehds_studies
from collections import Counter

# Create the analyzer, load its default data, and keep a handle on the
# database object that the remaining cells query.
analyzer = EHDSAnalyzer()
analyzer.load_default_data()
db = analyzer.db

# Report how many studies the database holds.
print("Loaded", len(db), "studies")

## Exploring Individual Studies

Each study is represented as a `Study` dataclass with rich metadata:

In [None]:
# Fetch the study with ID 1 and print its metadata, one field per line
study = db.get_study(1)

if study:
    rule = "=" * 60
    print(rule)
    print("STUDY DETAILS")
    print(rule)

    # (label, value) pairs in display order; DOI and country fall back to
    # 'N/A' when the attribute is empty or missing.
    detail_rows = (
        ("ID", study.id),
        ("Authors", study.authors),
        ("Year", study.year),
        ("Title", study.title),
        ("Journal", study.journal),
        ("Study Type", study.study_type.value),
        ("Primary Axis", study.primary_axis.value),
        ("Quality Rating", study.quality_rating.value),
        ("DOI", study.doi or 'N/A'),
        ("Country", study.country or 'N/A'),
    )
    for label, value in detail_rows:
        print(f"{label}: {value}")

In [None]:
# Print the selected study's citation in each supported style
if study:
    # Heading text paired with the style keyword passed to get_citation();
    # the second heading carries a leading newline to separate the sections.
    citation_sections = (
        ("--- APA Format ---", "apa"),
        ("\n--- Vancouver Format ---", "vancouver"),
    )
    for heading, style_name in citation_sections:
        print(heading)
        print(study.get_citation(style=style_name))

## Year Distribution Analysis

In [None]:
# Analyze publication years
import statistics

# `years` is reused by the summary table cell, so it stays a plain list.
years = [s.year for s in db.studies]
year_counts = Counter(years)

print("=" * 40)
print("PUBLICATION YEAR DISTRIBUTION")
print("=" * 40)

# One row per year in chronological order; the bar length equals the count.
for year in sorted(year_counts):
    count = year_counts[year]
    bar = "█" * count
    print(f"{year}: {bar} ({count})")

# min(), max() and statistics.median() all raise on an empty sequence, so
# the summary lines are only printed when at least one study is present.
if years:
    print(f"\nEarliest: {min(years)}")
    print(f"Latest: {max(years)}")
    # statistics.median averages the two middle values of an even-sized
    # sample (indexing at len // 2 would pick the upper one instead).
    # The "g" format drops a trailing ".0" so whole years print as integers.
    print(f"Median: {statistics.median(years):g}")
else:
    print("\nNo studies loaded.")

## Thematic Axis Distribution

In [None]:
# Studies per thematic axis
print("=" * 60)
print("STUDIES BY THEMATIC AXIS")
print("=" * 60)

# Hoisted out of the loop; also used to avoid dividing by zero when the
# database is empty.
total_studies = len(db)

for axis in ThematicAxis:
    studies = db.filter_by_axis(axis)
    pct = (len(studies) / total_studies) * 100 if total_studies else 0.0
    # Half-scale bar: one block per two studies keeps long rows readable.
    bar = "█" * (len(studies) // 2)
    print(f"\n{axis.value}:")
    print(f"  {bar} {len(studies)} studies ({pct:.1f}%)")

## Quality Assessment Overview

In [None]:
# Quality distribution
# `quality_counts` is reused by the summary table cell.
quality_counts = Counter(s.quality_rating for s in db.studies)

print("=" * 40)
print("QUALITY RATING DISTRIBUTION (MMAT)")
print("=" * 40)

# Fixed display order, best rating first.
quality_order = [QualityRating.HIGH, QualityRating.MODERATE, QualityRating.LOW, QualityRating.NOT_APPLICABLE]
# Colored marker shown in front of each rating row.
quality_symbols = {
    QualityRating.HIGH: "🟢",
    QualityRating.MODERATE: "🟡",
    QualityRating.LOW: "🟠",
    QualityRating.NOT_APPLICABLE: "⚪",
}

# Hoisted out of the loop; guards the percentage against an empty database.
total_studies = len(db)

for quality in quality_order:
    count = quality_counts.get(quality, 0)
    pct = (count / total_studies) * 100 if total_studies else 0.0
    symbol = quality_symbols[quality]
    print(f"{symbol} {quality.value.upper():15} {count:3} studies ({pct:5.1f}%)")

## Study Type Analysis

In [None]:
# Study types
type_counts = Counter(s.study_type for s in db.studies)

print("=" * 50)
print("STUDY TYPE DISTRIBUTION")
print("=" * 50)

# Hoisted out of the loop; guards the percentage against an empty database.
total_studies = len(db)

# most_common() yields (type, count) pairs from most to least frequent,
# keeping first-seen order among ties.
for stype, count in type_counts.most_common():
    pct = (count / total_studies) * 100 if total_studies else 0.0
    bar = "█" * count
    print(f"{stype.value:20} {bar} {count} ({pct:.1f}%)")

## Geographic Distribution

In [None]:
# Country distribution (first author)
# Studies without a country value are skipped; `countries` is reused by the
# summary table cell.
countries = [s.country for s in db.studies if s.country]
country_counts = Counter(countries)

print("=" * 40)
print("GEOGRAPHIC DISTRIBUTION (First Author)")
print("=" * 40)

# Show only the 15 most frequent countries, one bar block per study.
for country, count in country_counts.most_common(15):
    bar = "█" * count
    print(f"{country:20} {bar} {count}")

## Cross-Tabulation Analysis

In [None]:
# Cross-tabulate quality ratings (columns) against thematic axes (rows)
table_rule = "=" * 70
print(table_rule)
print("QUALITY RATING BY THEMATIC AXIS")
print(table_rule)

# Column headings, followed by a separator line
print(f"{'Axis':<35} {'High':>6} {'Mod':>6} {'Low':>6} {'N/A':>6}")
print("-" * 70)

# Ratings in the same left-to-right order as the column headings.
rating_columns = (
    QualityRating.HIGH,
    QualityRating.MODERATE,
    QualityRating.LOW,
    QualityRating.NOT_APPLICABLE,
)

for axis in ThematicAxis:
    axis_studies = db.filter_by_axis(axis)
    q_counts = Counter(s.quality_rating for s in axis_studies)

    # Axis name is cut to 34 characters so it fits the 35-wide first column;
    # each count is right-aligned in a 6-wide cell.
    count_cells = " ".join(f"{q_counts.get(rating, 0):>6}" for rating in rating_columns)
    print(f"{axis.value[:34]:<35} {count_cells}")

## Advanced Filtering Examples

In [None]:
# Complex filter: High quality empirical studies from 2024+
# Study types counted as empirical for this filter.
empirical_types = (StudyType.QUALITATIVE, StudyType.QUANTITATIVE, StudyType.MIXED_METHODS)

filtered = [
    s for s in db.studies
    if s.year >= 2024
    and s.quality_rating == QualityRating.HIGH
    and s.study_type in empirical_types
]

print(f"High-quality empirical studies (2024+): {len(filtered)}")
for s in filtered:
    print(f"  • {s.authors} - {s.study_type.value}")

In [None]:
# Filter by year range
recent = db.filter_by_year(2025, 2026)
print(f"\nStudies from 2025-2026: {len(recent)}")

# Preview at most the first five matches, with titles cut to 50 characters.
for s in recent[:5]:
    print(f"  • {s.authors} ({s.year}): {s.title[:50]}...")

## Data Export Options

In [None]:
# Export to JSON (preview)
import json


def _preview_record(record):
    """Return the JSON-serializable subset of a study's fields used in the preview."""
    return {
        "id": record.id,
        "authors": record.authors,
        "year": record.year,
        "title": record.title,
        "primary_axis": record.primary_axis.value,
        "quality_rating": record.quality_rating.value,
    }


# Only the first 3 studies are converted for the preview
sample_data = [_preview_record(entry) for entry in list(db.studies)[:3]]

print("Sample JSON export:")
print(json.dumps(sample_data, indent=2))

In [None]:
# Export to CSV format (preview)
import csv
import io

# The csv module handles quoting and escaping, so commas or double quotes
# inside authors, titles or axis names cannot break the column layout.
csv_buffer = io.StringIO()
csv_writer = csv.writer(csv_buffer, lineterminator="\n")
csv_writer.writerow(["id", "authors", "year", "title", "axis", "quality"])

# Only the first 3 studies are shown; titles are cut to 40 characters.
for s in list(db.studies)[:3]:
    csv_writer.writerow([
        s.id,
        s.authors,
        s.year,
        f"{s.title[:40]}...",
        s.primary_axis.value,
        s.quality_rating.value,
    ])

print("\nSample CSV export:")
# The buffer already ends with a newline, so suppress print's own.
print(csv_buffer.getvalue(), end="")

## Summary Statistics Table

In [None]:
# Print headline figures for the review as a two-column table.
# Relies on `years`, `quality_counts` and `countries` from earlier cells.
summary_rule = "=" * 60
print(summary_rule)
print("EHDS SYSTEMATIC REVIEW - SUMMARY TABLE")
print(summary_rule)

# Policy documents are reported as grey literature; every other study type
# is reported as peer-reviewed.
peer_reviewed_total = sum(
    1 for entry in db.studies if entry.study_type != StudyType.POLICY_DOCUMENT
)
grey_literature_total = sum(
    1 for entry in db.studies if entry.study_type == StudyType.POLICY_DOCUMENT
)

stats = {
    "Total Studies": len(db),
    "Peer-reviewed": peer_reviewed_total,
    "Grey Literature": grey_literature_total,
    "Year Range": f"{min(years)}-{max(years)}",
    "High Quality": quality_counts.get(QualityRating.HIGH, 0),
    "Moderate Quality": quality_counts.get(QualityRating.MODERATE, 0),
    "Countries Represented": len(set(countries)),
    "Thematic Axes": len(ThematicAxis),
}

# Labels are padded to 25 characters so the values line up in one column.
for label, figure in stats.items():
    print(f"{label:25} {figure}")