In [1]:
"""
QUICK DATA CLEANING - Robust version for all 77 districts
"""

import os
from pathlib import Path

import pandas as pd

# Setup paths
# The project root can be overridden with the SEVI_PROJECT_ROOT environment
# variable so the notebook is not tied to one user's machine. When the
# variable is unset, the original location is used.
PROJECT_ROOT = Path(os.environ.get(
    'SEVI_PROJECT_ROOT',
    r'C:\Users\saurav\Downloads\SEVI_Nepal_Project',
))
RAW_PATH = PROJECT_ROOT / 'data' / 'raw'
PROCESSED_PATH = PROJECT_ROOT / 'data' / 'processed'
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

print("ROBUST DATA EXTRACTION - 77 DISTRICTS")
print("="*60)

# ============================================================================
# STEP 1: Identify the correct file names
# ============================================================================

print("\nChecking available files...")
if not RAW_PATH.is_dir():
    # glob() on a missing folder just yields nothing, which would otherwise be
    # indistinguishable from an empty folder in the listing below.
    print(f"⚠ Raw data folder not found: {RAW_PATH}")

# sorted() makes the listing deterministic; glob order depends on the OS.
all_files = sorted(RAW_PATH.glob('*.csv'))
print(f"Found {len(all_files)} CSV files")

# Show all files
for i, file_path in enumerate(all_files, 1):
    print(f"{i:2}. {file_path.name}")

# ============================================================================
# STEP 2: Load cooking_fuel to get district IDs
# ============================================================================

print("\n" + "="*60)
print("ANALYZING cooking_fuel.csv FOR DISTRICT IDS")
print("="*60)

# ID layout used to split cooking_fuel.csv into sections:
#   1..NATIONAL_MAX_ID                    -> national aggregates
#   NATIONAL_MAX_ID+1..PROVINCE_MAX_ID    -> provinces
#   above PROVINCE_MAX_ID                 -> districts
NATIONAL_MAX_ID = 6
PROVINCE_MAX_ID = 13
EXPECTED_DISTRICTS = 77

try:
    cf_df = pd.read_csv(RAW_PATH / 'cooking_fuel.csv')
    print(f"File shape: {cf_df.shape}")
    print(f"Columns: {list(cf_df.columns)}")

    # Resolve the area-name column once so the preview, the province listing
    # and the district reference all read the same column.
    cf_area_col = next((c for c in ('AREA', 'AREA_NAME') if c in cf_df.columns), None)
    if cf_area_col is None:
        raise ValueError("no 'AREA' or 'AREA_NAME' column to take district names from")

    # Check structure
    print("\nFirst 25 rows:")
    preview = cf_df.head(25)
    for row_id, area_name in zip(preview['ID'], preview[cf_area_col]):
        print(f"ID {row_id:3}: {area_name}")

    # Analyze ID structure
    print(f"\nID analysis:")
    print(f"  Min ID: {cf_df['ID'].min()}")
    print(f"  Max ID: {cf_df['ID'].max()}")
    print(f"  Unique IDs: {cf_df['ID'].nunique()}")

    # Identify different sections
    print(f"\nIdentifying sections:")

    # IDs 1-6: National aggregates
    national = cf_df[cf_df['ID'] <= NATIONAL_MAX_ID]
    print(f"  National aggregates (ID 1-{NATIONAL_MAX_ID}): {len(national)} rows")

    # IDs 7-13: Provinces
    provinces = cf_df[(cf_df['ID'] > NATIONAL_MAX_ID) & (cf_df['ID'] <= PROVINCE_MAX_ID)]
    print(f"  Provinces (ID {NATIONAL_MAX_ID + 1}-{PROVINCE_MAX_ID}): {len(provinces)} rows")
    if len(provinces) > 0:
        print(f"    Province names: {list(provinces[cf_area_col].unique())}")

    # IDs 14+: Districts
    districts = cf_df[cf_df['ID'] > PROVINCE_MAX_ID]
    print(f"  Districts (ID {PROVINCE_MAX_ID + 1}+): {len(districts)} rows")

    if len(districts) == EXPECTED_DISTRICTS:
        print(f"  ✓ Perfect! Found all {EXPECTED_DISTRICTS} districts")
    elif len(districts) > EXPECTED_DISTRICTS:
        print(f"  ⚠ Found {len(districts)} rows, expected {EXPECTED_DISTRICTS}")
        # Check for duplicates or other issues
        duplicate_ids = districts[districts.duplicated('ID', keep=False)]
        if len(duplicate_ids) > 0:
            print(f"  ⚠ Found {len(duplicate_ids)} duplicate IDs")
    else:
        print(f"  ⚠ Only found {len(districts)} districts, expected {EXPECTED_DISTRICTS}")

    # Create district reference (one row per district: ID + name, ordered by ID)
    district_ref = (
        districts[['ID', cf_area_col]]
        .rename(columns={cf_area_col: 'DISTRICT_NAME'})
        .sort_values('ID')
        .reset_index(drop=True)
    )

    # Save district reference
    district_ref.to_csv(PROCESSED_PATH / 'district_reference.csv', index=False)
    print(f"\n✓ District reference saved with {len(district_ref)} districts")

    # Display first and last districts
    print(f"\nFirst 5 districts:")
    print(district_ref.head().to_string(index=False))
    print(f"\nLast 5 districts:")
    print(district_ref.tail().to_string(index=False))

    # District IDs list (used by the later steps to filter every other file)
    district_ids = district_ref['ID'].tolist()
    print(f"\nDistrict ID range: {min(district_ids)} to {max(district_ids)}")

except Exception as e:
    print(f"ERROR with cooking_fuel.csv: {e}")
    # Re-raise instead of calling exit(1): inside a Jupyter kernel exit() only
    # asks the frontend to quit and does not reliably stop the running cell,
    # so the later steps would carry on and fail with a NameError on
    # district_ref / district_ids. Re-raising halts the cell (and makes a
    # plain-script run exit non-zero) with the full traceback.
    raise

# ============================================================================
# STEP 3: Check key files for district data
# ============================================================================

print("\n" + "="*60)
print("CHECKING KEY FILES FOR DISTRICT DATA")
print("="*60)

# Files inspected in this diagnostic pass (nothing is merged here).
key_files_to_check = [
    'wall_materials.csv',
    'drinking_watersource.csv', 
    'toilet_facility.csv',  # or toilet_facilities.csv?
    'cooking_fuel.csv',     # Already processed
    'household_amenities.csv',
    'main_source_of_lighting.csv',
    'radio_television_etc.csv'
]

# Per-file summary, keyed by file name.
file_info = {}

for filename in key_files_to_check:
    file_path = RAW_PATH / filename

    # Guard clause: report the missing file and move on.
    if not file_path.exists():
        print(f"\n{filename}: FILE NOT FOUND")
        continue

    try:
        df = pd.read_csv(file_path)
        n_rows, n_cols = df.shape
        id_present = 'ID' in df.columns
        # First area-like column present in this file, if any.
        area_col = next(
            (candidate for candidate in ('AREA', 'AREA_NAME', 'DISTRICT')
             if candidate in df.columns),
            None,
        )

        file_info[filename] = {
            'shape': df.shape,
            'columns': list(df.columns),
            'has_id': id_present,
            'has_area': area_col is not None,
            'district_rows': 0
        }

        print(f"\n{filename}:")
        print(f"  Shape: {n_rows} rows × {n_cols} cols")
        print(f"  Has ID: {'Yes' if id_present else 'No'}")

        if id_present:
            # Rows whose ID is one of the district IDs from cooking_fuel.csv
            district_rows = df[df['ID'].isin(district_ids)]
            count = len(district_rows)
            file_info[filename]['district_rows'] = count
            print(f"  District rows: {count}")

            if count != 77:
                print(f"  ⚠ Expected 77, found {count}")

        # More rows than distinct areas means several rows per area (LONG format)
        if area_col:
            unique_areas = df[area_col].nunique()
            if n_rows > unique_areas:
                print(f"  ⚠ LONG format: {n_rows} rows for {unique_areas} areas")
                print(f"    ~{n_rows/unique_areas:.1f} rows per area")

    except Exception as e:
        print(f"  ERROR: {e}")

# ============================================================================
# STEP 4: Create merged dataset - ROBUST VERSION
# ============================================================================

print("\n" + "="*60)
print("CREATING MERGED DATASET")
print("="*60)

# Start with district reference (one row per district: ID, DISTRICT_NAME)
merged_df = district_ref.copy()
print(f"Starting with {len(merged_df)} districts")

# Files to merge (with priority)
merge_priority = [
    'wall_materials.csv',
    'drinking_watersource.csv',
    'toilet_facility.csv',  # Try without 's'
    'main_source_of_lighting.csv',
    'radio_television_etc.csv'
]

# Also try alternative names
try_alt_names = {
    'toilet_facility.csv': 'toilet_facilities.csv',
    'radio_television_etc.csv': 'radio_television_etc..csv',  # Sometimes has double extension
    # The raw folder listing shows lighting_source.csv and no
    # main_source_of_lighting.csv; presumably the same table - TODO confirm.
    'main_source_of_lighting.csv': 'lighting_source.csv',
}

for filename in merge_priority:
    actual_file = filename

    # Try alternative name if main doesn't exist
    if not (RAW_PATH / filename).exists() and filename in try_alt_names:
        alt_name = try_alt_names[filename]
        if (RAW_PATH / alt_name).exists():
            actual_file = alt_name
            print(f"  Using alternative: {alt_name}")

    file_path = RAW_PATH / actual_file
    if not file_path.exists():
        print(f"  ⚠ {filename}: File not found")
        continue

    try:
        df = pd.read_csv(file_path)

        if 'ID' not in df.columns:
            print(f"  ⚠ {actual_file}: No ID column, skipping for now")
            continue

        # Filter for districts
        df_districts = df[df['ID'].isin(district_ids)].copy()
        if len(df_districts) == 0:
            print(f"  ⚠ {actual_file}: No district rows found")
            continue

        # A file with several rows per district would multiply district rows
        # in the left merge below and break the one-row-per-district layout,
        # so it is skipped until it has been reshaped to one row per ID.
        if not df_districts['ID'].is_unique:
            print(f"  ⚠ {actual_file}: {len(df_districts)} rows for "
                  f"{df_districts['ID'].nunique()} district IDs (LONG format?), skipped")
            continue

        # Get area column name
        area_col = next(
            (col for col in ('AREA', 'AREA_NAME', 'DISTRICT') if col in df.columns),
            None,
        )

        # Remove area column if present (we already have it from cooking_fuel)
        cols_to_merge = [
            col for col in df_districts.columns
            if col != 'ID' and col != area_col and not col.startswith('Unnamed')
        ]
        if not cols_to_merge:
            print(f"  ⚠ {actual_file}: No columns to merge")
            continue

        # Create prefix from filename (first 3 chars)
        prefix = actual_file[:3].upper().replace('.', '')
        rename_dict = {col: f"{prefix}_{col}" for col in cols_to_merge}

        # Prepare for merge
        df_to_merge = df_districts[['ID'] + cols_to_merge].rename(columns=rename_dict)

        # Columns already present in merged_df keep their existing values.
        # The join key 'ID' is always shared and is not a clash, so it is
        # excluded before warning.
        duplicate_cols = [
            col for col in df_to_merge.columns
            if col != 'ID' and col in merged_df.columns
        ]
        if duplicate_cols:
            print(f"  ⚠ Duplicate columns: {duplicate_cols}")
            df_to_merge = df_to_merge.drop(columns=duplicate_cols)

        before_cols = len(merged_df.columns)
        merged_df = pd.merge(merged_df, df_to_merge, on='ID', how='left')
        added = len(merged_df.columns) - before_cols
        print(f"  ✓ {actual_file}: Added {added} columns")

    except Exception as e:
        print(f"  ERROR processing {actual_file}: {e}")

# ============================================================================
# STEP 5: Handle household_amenities separately (no ID column)
# ============================================================================

print("\n" + "="*60)
print("PROCESSING household_amenities.csv")
print("="*60)

ha_path = RAW_PATH / 'household_amenities.csv'
if ha_path.exists():
    try:
        ha_df = pd.read_csv(ha_path)
        print(f"  Shape: {ha_df.shape}")
        print(f"  Columns: {ha_df.columns.tolist()}")

        # This file has AREA_NAME and AREA_TYPE
        if 'AREA_NAME' in ha_df.columns and 'AREA_TYPE' in ha_df.columns:
            # Filter for district rows (not Country, Urban/Rural, Ecological Belt).
            # Rows of any other area type also pass this filter; they are
            # discarded below when they match no district name.
            non_district_types = ['Country', 'Ecological Belt', 'Urban/Rural']
            ha_districts = ha_df[~ha_df['AREA_TYPE'].isin(non_district_types)].copy()

            print(f"  District rows: {len(ha_districts)}")

            if len(ha_districts) > 0:
                # Get columns to merge (exclude metadata)
                exclude_cols = ['AREA_NAME', 'AREA_TYPE']
                cols_to_merge = [col for col in ha_districts.columns
                                 if col not in exclude_cols and not col.startswith('Unnamed')]

                if cols_to_merge:
                    # Add prefix
                    rename_dict = {col: f"HA_{col}" for col in cols_to_merge}

                    # The normalised match keys are built on temporaries, so
                    # merged_df is only replaced once the merge has succeeded
                    # and never keeps the helper column after a failure.
                    left_keys = merged_df['DISTRICT_NAME'].str.strip().str.lower()

                    df_to_merge = ha_districts[['AREA_NAME'] + cols_to_merge].rename(columns=rename_dict)
                    df_to_merge['DISTRICT_NAME_CLEAN'] = df_to_merge['AREA_NAME'].str.strip().str.lower()
                    df_to_merge = df_to_merge.drop(columns='AREA_NAME')
                    # Keep only rows that name a known district; null keys are
                    # dropped because pandas would match NaN keys to each other.
                    df_to_merge = df_to_merge.dropna(subset=['DISTRICT_NAME_CLEAN'])
                    df_to_merge = df_to_merge[df_to_merge['DISTRICT_NAME_CLEAN'].isin(left_keys)]

                    if df_to_merge['DISTRICT_NAME_CLEAN'].duplicated().any():
                        # Several rows per district name would multiply
                        # district rows in the left merge.
                        print(f"  ⚠ Several rows per district name, skipped")
                    else:
                        unmatched_names = merged_df.loc[
                            ~left_keys.isin(df_to_merge['DISTRICT_NAME_CLEAN']), 'DISTRICT_NAME'
                        ].tolist()

                        # Merge
                        before_cols = len(merged_df.columns)
                        merged_with_ha = pd.merge(
                            merged_df.assign(DISTRICT_NAME_CLEAN=left_keys),
                            df_to_merge,
                            on='DISTRICT_NAME_CLEAN', how='left'
                        ).drop(columns='DISTRICT_NAME_CLEAN')
                        added = len(merged_with_ha.columns) - before_cols
                        merged_df = merged_with_ha

                        print(f"  ✓ Added {added} columns from household_amenities")
                        if unmatched_names:
                            # Name matching is spelling-sensitive; list what got no data.
                            print(f"  ⚠ {len(unmatched_names)} districts without a name match: {unmatched_names}")
                else:
                    print(f"  ⚠ No columns to merge")
            else:
                print(f"  ⚠ No district rows found")
        else:
            print(f"  ⚠ Missing AREA_NAME or AREA_TYPE columns")

    except Exception as e:
        print(f"  ERROR: {e}")
else:
    print(f"  ⚠ File not found")

# ============================================================================
# STEP 6: Final checks and save
# ============================================================================

print("\n" + "="*60)
print("FINAL DATASET")
print("="*60)

n_districts = len(merged_df)
n_features = len(merged_df.columns)

print(f"\nDataset shape: {merged_df.shape}")
print(f"Districts: {n_districts}")
print(f"Total features: {n_features}")

# Row-count check: report which reference IDs are absent from the result
if n_districts != 77:
    print(f"\n⚠ WARNING: Expected 77 districts, found {n_districts}")

    missing_ids = set(district_ids).difference(merged_df['ID'].tolist())
    if missing_ids:
        print(f"  Missing district IDs: {sorted(missing_ids)}")

# Every row whose ID occurs more than once
duplicates = merged_df.loc[merged_df.duplicated('ID', keep=False)]
if len(duplicates):
    print(f"⚠ WARNING: Found {len(duplicates)} duplicate district IDs")
    print(duplicates[['ID', 'DISTRICT_NAME']].head())

# Null counts per column, keeping only columns that have any
missing = merged_df.isnull().sum()
missing_cols = missing[missing > 0]
if len(missing_cols) > 0:
    print(f"\nColumns with missing values: {len(missing_cols)}")
    print("Top 5 columns with most missing:")
    worst_five = missing_cols.sort_values(ascending=False).head(5)
    for col, count in worst_five.items():
        pct = count / n_districts * 100
        print(f"  {col:30}: {count:3} ({pct:.1f}%)")

# Write the full table and a 10-row sample next to it
merged_path = PROCESSED_PATH / 'simple_merged_districts.csv'
sample_path = PROCESSED_PATH / 'district_sample.csv'

merged_df.to_csv(merged_path, index=False)
print(f"\n✓ Merged dataset saved to: {merged_path}")

merged_df.head(10).to_csv(sample_path, index=False)
print(f"✓ Sample saved to: {sample_path}")

# Column-by-column overview: name, dtype, number of non-null values
rule = "-" * 40
print(f"\nDataset structure:")
print(rule)
print(f"{'Column':40} {'Type':10} {'Non-Null':10}")
print(rule)
for col, values in merged_df.items():
    non_null = values.notna().sum()
    dtype = str(values.dtype)
    print(f"{col:40} {dtype:10} {non_null:10}")

print("\n" + "="*60)
print("SUMMARY")
print("="*60)
summary_lines = [
    f"1. Processed files: {len(merge_priority) + 1} (+ household_amenities)",
    f"2. Final dataset: {len(merged_df)} districts × {len(merged_df.columns)} features",
    f"3. Expected: 77 districts of Nepal",
    f"4. Data saved to 'data/processed/' folder",
    "\nNow run 03_feature_engineering.py to create vulnerability features!",
]
for line in summary_lines:
    print(line)

ROBUST DATA EXTRACTION - 77 DISTRICTS

Checking available files...
Found 16 CSV files
 1. below_secondary_education.csv
 2. children_living_arrangement.csv
 3. cooking_fuel.csv
 4. drinking_watersource.csv
 5. educational_attainment.csv
 6. educational_field_distribution.csv
 7. floor_type.csv
 8. foundation_type.csv
 9. household_amenities.csv
10. housing_ownership.csv
11. lighting_source.csv
12. months_worked.csv
13. population_occupation.csv
14. roof_type.csv
15. toilet_facility.csv
16. wall_materials.csv

ANALYZING cooking_fuel.csv FOR DISTRICT IDS
File shape: (90, 9)
Columns: ['ID', 'AREA', 'WOOD_FIREWOOD', 'LPG_GAS', 'ELECTRICITY', 'COW_DUNG', 'BIOGAS', 'KEROSENE', 'OTHER']

First 25 rows:
ID   1: Nepal
ID   2: Urban Municipalities
ID   3: Rural Municipalities
ID   4: Mountain
ID   5: Hill
ID   6: Tarai
ID   7: Koshi
ID   8: Madhesh
ID   9: Bagmati
ID  10: Gandaki
ID  11: Lumbini
ID  12: Karnali
ID  13: Sudur Paschim
ID  14: Taplejung
ID  15: Sankhuwasabha
ID  16: Solukhumbu
ID  