In [1]:
import os
import re
import time
import requests
import zipfile
import pandas as pd
import numpy as np
from io import StringIO, BytesIO
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
OUTPUT = os.path.join(os.getcwd(), 'Output')
os.makedirs(OUTPUT, exist_ok=True)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.fao.org/faostat/en/',
}

# ============================================================
# PIPELINE 1: WHO Disease Outbreak News
# ============================================================

def pipeline_who():
    """Fetch WHO DON data, extract outbreak features, save raw + features."""
    
    url = 'https://www.who.int/api/news/diseaseoutbreaknews'
    headers = {'User-Agent': 'Mozilla/5.0 (research-project)'}
    
    # --- Paginated fetch ---
    all_items = []
    skip, batch, limit = 0, 100, 5000
    
    while skip < limit:
        try:
            r = requests.get(url, headers=headers, timeout=15,)
            r.raise_for_status()
            items = r.json().get('value', [])
        except:
            break
        if not items:
            break
        all_items.extend(items)
        if len(items) < batch:
            break
        skip += batch
    
    if not all_items:
        return None, None
    
    # --- Build raw DataFrame ---
    df = pd.DataFrame(all_items)
    df['PublicationDate'] = pd.to_datetime(df['PublicationDate'], errors='coerce')
    df = df.sort_values('PublicationDate', ascending=False).reset_index(drop=True)
    
    def clean_html(html):
        if pd.isna(html) or not html:
            return ''
        return BeautifulSoup(str(html), 'html.parser').get_text(' ', strip=True)
    
    for col in ['Summary', 'Epidemiology', 'Assessment', 'Overview', 'Response', 'Advice']:
        if col in df.columns:
            df[col + '_clean'] = df[col].apply(clean_html)
    
    if 'ItemDefaultUrl' in df.columns:
        df['URL'] = df['ItemDefaultUrl'].apply(
            lambda x: f'https://www.who.int{x}' if x and not str(x).startswith('http') else str(x))
    
    # --- Extract structured features ---
    def extract(row):
        title = str(row.get('Title', ''))
        summary = str(row.get('Summary_clean', ''))
        epi = str(row.get('Epidemiology_clean', ''))
        assessment = str(row.get('Assessment_clean', ''))
        response = str(row.get('Response_clean', ''))
        advice = str(row.get('Advice_clean', ''))
        combined = f'{summary} {epi}'
        
        parts = re.split(r'\s*[-\u2013\u2014]\s*', title, maxsplit=1)
        disease = parts[0].strip() if parts else ''
        country = parts[-1].strip() if len(parts) > 1 else ''
        
        case_nums = re.findall(r'(\d[\d,]*)\s*(?:confirmed\s+)?cases', combined, re.I)
        cases = [int(n.replace(',', '')) for n in case_nums if n.strip()]
        
        death_nums = re.findall(r'(\d[\d,]*)\s*deaths?', combined, re.I)
        deaths = [int(n.replace(',', '')) for n in death_nums if n.strip()]
        
        cfr_m = re.search(r'(?:fatality|CFR)[^\d]*(\d+\.?\d*)%', combined, re.I)
        
        risk = 'unknown'
        al = assessment.lower()
        if 'very high' in al: risk = 'very_high'
        elif 'high' in al and 'not high' not in al: risk = 'high'
        elif 'moderate' in al: risk = 'moderate'
        elif 'low' in al: risk = 'low'
        
        dl = disease.lower()
        dtype = 'other'
        if any(k in dl for k in ['cholera','typhoid']): dtype = 'waterborne'
        elif any(k in dl for k in ['influenza','mers','covid','sars']): dtype = 'respiratory'
        elif any(k in dl for k in ['ebola','marburg','lassa']): dtype = 'hemorrhagic_fever'
        elif any(k in dl for k in ['dengue','zika','malaria']): dtype = 'vector_borne'
        elif any(k in dl for k in ['measles','polio']): dtype = 'vaccine_preventable'
        elif any(k in dl for k in ['plague','mpox']): dtype = 'zoonotic'
        
        return {
            'date': row.get('PublicationDate'),
            'don_id': row.get('DonId', ''),
            'disease': disease, 'disease_type': dtype, 'country': country,
            'total_cases': max(cases) if cases else None,
            'total_deaths': max(deaths) if deaths else None,
            'cfr_pct': float(cfr_m.group(1)) if cfr_m else None,
            'who_risk_level': risk,
            'multi_country': bool(re.search(r'international|multiple countries|global', combined, re.I)),
            'intl_response': bool(re.search(r'WHO|deployed|GOARN', response, re.I)),
            'travel_advisory': bool(re.search(r'travel|restriction|border', advice, re.I)),
            'vaccination': bool(re.search(r'vaccin', f'{response} {advice}', re.I)),
            'epi_length': len(epi), 'summary_length': len(summary),
        }
    
    feat = df.apply(extract, axis=1, result_type='expand')
    feat['date'] = pd.to_datetime(feat['date'], errors='coerce')
    feat = feat.sort_values('date', ascending=False).reset_index(drop=True)
    
    return df, feat


# ============================================================
# PIPELINE 2: FAOSTAT Crop & Livestock Production
# ============================================================

def pipeline_faostat_production():
    """Fetch FAOSTAT crop/livestock production data."""
    
    # Strategy 1: SDMX API
    urls = [
        'https://nsi-release-ro-statsuite.fao.org/rest/data/FAO,DF_CROP_LS_PROD,1.0/A..5412.0111..?startPeriod=2018&endPeriod=2023&format=csvfilewithlabels',
        'https://nsi-release-ro-statsuite.fao.org/rest/data/FAO,DF_CROP_LS_PROD,1.0/all?startPeriod=2022&endPeriod=2023&format=csvfilewithlabels',
    ]
    for url in urls:
        try:
            r = requests.get(url, timeout=120)
            r.raise_for_status()
            return pd.read_csv(StringIO(r.text))
        except:
            continue
    
    # Strategy 2: faostat package
    try:
        import faostat
        return faostat.get_data_df('QCL', pars={'element': [5510]})
    except:
        pass
    
    # Strategy 3: Bulk ZIP
    bulk = [
        'https://bulks-faostat.fao.org/production/Production_Crops_Livestock_E_All_Data_(Normalized).csv.zip',
        'https://fenixservices.fao.org/faostat/static/bulkdownloads/Production_Crops_Livestock_E_All_Data_(Normalized).zip',
    ]
    for url in bulk:
        try:
            r = requests.get(url, headers=HEADERS, timeout=120, allow_redirects=True)
            r.raise_for_status()
            z = zipfile.ZipFile(BytesIO(r.content))
            csv_name = [f for f in z.namelist() if f.endswith('.csv')][0]
            return pd.read_csv(z.open(csv_name), encoding='latin-1', nrows=100000)
        except:
            continue
    
    # Strategy 4: OWID mirror
    for url in ['https://catalog.ourworldindata.org/garden/faostat/2024-03-14/faostat_qcl/faostat_qcl.csv']:
        try:
            return pd.read_csv(url, nrows=100000)
        except:
            continue
    
    return None


# ============================================================
# PIPELINE 3: FAOSTAT Trade (Exports + Imports)
# ============================================================

def pipeline_faostat_trade(trade_type='both'):
    """Fetch FAOSTAT trade data. trade_type: 'exports', 'imports', or 'both'."""
    
    elem_map = {'exports': '5910+5922', 'imports': '5610+5622', 'both': '5910+5922+5610+5622'}
    elems = elem_map.get(trade_type, elem_map['both'])
    
    # Strategy 1: SDMX API
    for tf in ['DF_TRADE_CL', 'DF_TRADE', 'DF_TCL']:
        try:
            url = (f'https://nsi-release-ro-statsuite.fao.org/rest/data/'
                   f'FAO,{tf},1.0/A..{elems}..?'
                   f'startPeriod=2018&endPeriod=2023&format=csvfilewithlabels')
            r = requests.get(url, timeout=90)
            if r.status_code == 200 and len(r.text) > 500:
                return pd.read_csv(StringIO(r.text), nrows=300000)
        except:
            continue
    
    # Strategy 2: Bulk ZIP
    bulk = [
        'https://bulks-faostat.fao.org/production/Trade_CropsLivestock_E_All_Data_(Normalized).csv.zip',
        'https://fenixservices.fao.org/faostat/static/bulkdownloads/Trade_CropsLivestock_E_All_Data_(Normalized).zip',
    ]
    for url in bulk:
        try:
            r = requests.get(url, headers=HEADERS, timeout=120, allow_redirects=True)
            if r.status_code == 200 and len(r.content) > 1000:
                z = zipfile.ZipFile(BytesIO(r.content))
                csv_name = [f for f in z.namelist() if f.endswith('.csv')][0]
                df = pd.read_csv(z.open(csv_name), encoding='latin-1', nrows=500000)
                if 'Element' in df.columns and trade_type != 'both':
                    el = df['Element'].str.lower()
                    df = df[el.str.contains(trade_type.rstrip('s'), na=False)]
                return df
        except:
            continue
    
    # Strategy 3: Detailed Trade Matrix
    try:
        url = 'https://fenixservices.fao.org/faostat/static/bulkdownloads/Trade_DetailedTradeMatrix_E_All_Data_(Normalized).zip'
        r = requests.get(url, headers=HEADERS, timeout=300, stream=True)
        if r.status_code == 200:
            z = zipfile.ZipFile(BytesIO(r.content))
            csv_name = [f for f in z.namelist() if f.endswith('.csv')][0]
            df = pd.read_csv(z.open(csv_name), encoding='latin-1', nrows=300000)
            if 'Element' in df.columns and trade_type != 'both':
                el = df['Element'].str.lower()
                df = df[el.str.contains(trade_type.rstrip('s'), na=False)]
            return df
    except:
        pass
    
    # Strategy 4: faostat package
    try:
        import faostat
        pars = {}
        if trade_type == 'exports': pars = {'element': [5910, 5922]}
        elif trade_type == 'imports': pars = {'element': [5610, 5622]}
        return faostat.get_data_df('TCL', pars=pars)
    except:
        pass
    
    return None


# ============================================================
# SAVE HELPER
# ============================================================

def save(df, filename):
    """Save DataFrame to Output folder as CSV."""
    if df is None or (isinstance(df, pd.DataFrame) and df.empty):
        print(f'  [FAIL] {filename}: no data to save')
        return False
    path = os.path.join(OUTPUT, filename)
    df.to_csv(path, index=False)
    print(f'  [SAVED] {filename} ({df.shape[0]:,} rows x {df.shape[1]} cols) -> \"{OUTPUT}\"')
    return True

In [3]:
# ============================================================
# RUN PIPELINE
# ============================================================

print(f'Output folder: {OUTPUT}')
print(f'Run started: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
print('=' * 60)

results = {}

# --- Pipeline 1: WHO DON ---
print('\n[Pipeline-1] WHO Disease Outbreak News — processing...')
try:
    who_raw, who_features = pipeline_who()
    s1 = save(who_raw, 'who_don_raw.csv')
    s2 = save(who_features, 'who_don_features.csv')
    results['WHO_DON'] = 'SUCCESS' if (s1 or s2) else 'FAIL'
except Exception as e:
    results['WHO_DON'] = f'FAIL: {e}'
print(f'[Pipeline-1] WHO Disease Outbreak News — {results["WHO_DON"]}')

# --- Pipeline 2: FAOSTAT Production ---
print('\n[Pipeline-2] FAOSTAT Production — processing...')
try:
    df_prod = pipeline_faostat_production()
    s = save(df_prod, 'faostat_production.csv')
    results['FAOSTAT_Production'] = 'SUCCESS' if s else 'FAIL'
except Exception as e:
    results['FAOSTAT_Production'] = f'FAIL: {e}'
print(f'[Pipeline-2] FAOSTAT Production — {results["FAOSTAT_Production"]}')

# --- Pipeline 3: FAOSTAT Trade ---
print('\n[Pipeline-3] FAOSTAT Trade (Exports) — processing...')
try:
    df_exp = pipeline_faostat_trade('exports')
    s = save(df_exp, 'faostat_trade_exports.csv')
    results['FAOSTAT_Exports'] = 'SUCCESS' if s else 'FAIL'
except Exception as e:
    results['FAOSTAT_Exports'] = f'FAIL: {e}'
print(f'[Pipeline-3] FAOSTAT Trade (Exports) — {results["FAOSTAT_Exports"]}')

print('\n[Pipeline-4] FAOSTAT Trade (Imports) — processing...')
try:
    df_imp = pipeline_faostat_trade('imports')
    s = save(df_imp, 'faostat_trade_imports.csv')
    results['FAOSTAT_Imports'] = 'SUCCESS' if s else 'FAIL'
except Exception as e:
    results['FAOSTAT_Imports'] = f'FAIL: {e}'
print(f'[Pipeline-4] FAOSTAT Trade (Imports) — {results["FAOSTAT_Imports"]}')

# --- Summary ---
print('\n' + '=' * 60)
print('PIPELINE SUMMARY')
print('=' * 60)
all_ok = True
for name, status in results.items():
    icon = 'OK' if status == 'SUCCESS' else 'XX'
    if status != 'SUCCESS': all_ok = False
    print(f'  [{icon}] {name}: {status}')

print('\nFiles in Output:')
for f in sorted(os.listdir(OUTPUT)):
    size = os.path.getsize(os.path.join(OUTPUT, f))
    print(f'  {f:40s} {size/1024:>8.1f} KB')

if all_ok:
    print(f'\nAll data fetched successfully and saved in \"{OUTPUT}\"')
else:
    failed = [k for k, v in results.items() if v != 'SUCCESS']
    print(f'\nPartial success. Failed: {failed}')

print(f'Run finished: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

Output folder: /Users/smitchandi/Documents/Coding/Practice/Hackathon/Output
Run started: 2026-02-14 20:07:19

[Pipeline-1] WHO Disease Outbreak News — processing...
  [SAVED] who_don_raw.csv (50 rows x 29 cols) -> "/Users/smitchandi/Documents/Coding/Practice/Hackathon/Output"
  [SAVED] who_don_features.csv (50 rows x 15 cols) -> "/Users/smitchandi/Documents/Coding/Practice/Hackathon/Output"
[Pipeline-1] WHO Disease Outbreak News — SUCCESS

[Pipeline-2] FAOSTAT Production — processing...
  [SAVED] faostat_production.csv (100,000 rows x 14 cols) -> "/Users/smitchandi/Documents/Coding/Practice/Hackathon/Output"
[Pipeline-2] FAOSTAT Production — SUCCESS

[Pipeline-3] FAOSTAT Trade (Exports) — processing...
  [SAVED] faostat_trade_exports.csv (188,329 rows x 14 cols) -> "/Users/smitchandi/Documents/Coding/Practice/Hackathon/Output"
[Pipeline-3] FAOSTAT Trade (Exports) — SUCCESS

[Pipeline-4] FAOSTAT Trade (Imports) — processing...
  [SAVED] faostat_trade_imports.csv (311,671 rows x 14 cols)