# Index news into Elasticsearch 


Assumptions:
- Elasticsearch is running in Docker and reachable at http://localhost:9200.
- Run the cells in order. If packages are missing, uncomment and run the `!pip install` line at the top of the setup cell first.
- To change how many archives are indexed, change `NUM_ZIPS_TO_INDEX`, which the setup cell imports from `src`.

In [None]:
# !pip install elasticsearch==8.12.0 python-dotenv tqdm

import sys
sys.path.append('.')

from src import (
    ES_HOST, ES_PORT, ES_URL, INDEX_NAME, DATA_DIR, 
    NUM_ZIPS_TO_INDEX, BATCH_SIZE, MAX_LIMIT, DEFAULT_SEARCH_SIZE,
    MAPPING, create_es_client, iter_jsons_in_path, bulk_index,
    search_boolean_es, display_search_results
)
from tqdm import tqdm
import time

In [None]:

# Create the Elasticsearch client that the indexing and search cells below
# share as `es`. `create_es_client` is imported from `src`; presumably it
# connects using the ES_HOST / ES_PORT / ES_URL settings -- verify in `src`.
es = create_es_client()


# MAPPING 

In [None]:
# MAPPING arrives via the `from src import ...` statement in the setup cell
# (presumably defined in mapping.py -- verify). Pretty-print it here so the
# index configuration can be inspected without opening the source file.
from pprint import pprint

print("Index Mapping Configuration:")
pprint(MAPPING)


# INDEXING 

In [None]:

# Create the index with the imported MAPPING, unless it is already present.
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=MAPPING)
    print('✓ Created index:', INDEX_NAME)
else:
    print('✓ Index already exists:', INDEX_NAME)

# Select the first NUM_ZIPS_TO_INDEX entries of DATA_DIR in sorted path order.
# Every directory entry is considered (no filtering by type or suffix here);
# each one is handed to iter_jsons_in_path below.
entries = sorted(DATA_DIR.iterdir())[:NUM_ZIPS_TO_INDEX]
print(f'\nEntries to process: {len(entries)}')
print('Sample entries:', entries[:3])

# Index the entries one at a time, keeping a running total of indexed docs.
count = 0
for ent in entries:
    docs_iter = iter_jsons_in_path(ent)
    n = bulk_index(es, INDEX_NAME, docs_iter, batch_size=BATCH_SIZE)
    print(f'✓ Indexed {n} docs from {ent.name}')
    count += n
    # NOTE: the cap is checked only after a whole entry has been indexed, so
    # the final total may exceed MAX_LIMIT by up to one entry's worth of docs.
    if count >= MAX_LIMIT:
        print(f'Reached maximum limit of {MAX_LIMIT} documents')
        break

print(f'\n✓ Total documents indexed: {count}')


# Retrieve Results for a Given Query


In [None]:
# Boolean test queries used for inference/demonstration. The list is built
# group by group; the order matters because later cells slice from the front.
test_queries = []

# Queries drawn from the ransomware / healthcare article
test_queries += [
    "Change AND Healthcare AND ransomware",
    "Change Healthcare AND data AND breach",
    "UnitedHealth AND ransomware",
    "Alphv AND BlackCat AND ransom",
    "RansomHub AND leak AND patient",
    "Change Healthcare AND NOT RansomHub",
    "Healthcare AND cyberattack",
    "Change AND Healthcare AND payment",
    "Change Healthcare AND podcast",
    "ransomware AND healthcare",
]

# Queries drawn from the vaccine lawsuit article
test_queries += [
    "Biden AND vaccine AND lawsuits",
    "Biden AND NOT Trump",
    "Vaccine AND administration AND attorneys",
    "Lawsuits AND healthcare",
    "Vaccine AND class AND action",
    "Membership AND contract AND policy",
    "Privacy AND data AND agreement",
    "Legal AND jurisdiction AND website",
    "User AND conduct AND violation",
    "Cookies AND privacy AND agreement",
]

# Queries that combine terms across both articles
test_queries += [
    "Healthcare AND data AND privacy",
    "Cyberattack AND lawsuits",
    "Ransomware AND legal AND response",
    "Biden AND cyberattack",
    "Change Healthcare AND lawsuit",
]


In [None]:

# Run the first five test queries against Elasticsearch and show up to five
# hits for each one.
print("=" * 70)
print("ELASTICSEARCH SEARCH INFERENCE")
print("=" * 70)

for query in test_queries[:5]:
    print(f"\n📋 Query: {query}")
    results = search_boolean_es(es, query, size=5)
    # A falsy return value is reported as "no results or error"; this cell
    # does not distinguish between the two cases.
    if results:
        display_search_results(results)
    else:
        print("No results found or error occurred.")


In [None]:

# Additional inference: Search and analyze specific query patterns
def run_custom_search(query, size=10):
    """Search for *query* and print a banner, the match count, and the hits.

    Args:
        query: Query string passed unchanged to ``search_boolean_es``.
        size: Number of hits requested from ``search_boolean_es``
            (defaults to 10).

    Prints "No results found." when ``search_boolean_es`` returns a falsy
    value. Nothing is returned; all output goes to stdout.
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print(f"CUSTOM SEARCH: {query}")
    print(f"{rule}")

    response = search_boolean_es(es, query, size=size)
    if not response:
        print("No results found.")
        return

    # `hits.total` is handled both as a dict carrying a 'value' key and as a
    # bare number.
    total = response['hits']['total']
    if isinstance(total, dict):
        total = total['value']

    shown = len(response['hits']['hits'])
    print(f"Total matches: {total}")
    print(f"Displaying top {shown} results\n")
    display_search_results(response)

# Example custom searches
# Uncomment to run:
# run_custom_search("ransomware AND healthcare", size=5)
# run_custom_search("Biden AND vaccine", size=5)
