
Nepal District Vulnerability Analysis - Data Inspection

Step 1: Inspect all raw CSV files to understand structure

Author: Saurav Sen
Date: 2025 December 04


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Setup paths - USE RAW STRING FOR WINDOWS
# NOTE: these are absolute, machine-specific locations; edit them when the
# project lives somewhere else.
RAW_PATH = Path(r'C:\Users\saurav\Downloads\SEVI_Nepal_Project\data\raw')
PROCESSED_PATH = Path(r'C:\Users\saurav\Downloads\SEVI_Nepal_Project\data\processed')
# Create the output folder up front so later report writes find it in place.
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

print("="*80)
print("DETAILED DATA INSPECTION - COMPREHENSIVE ANALYSIS")
print("="*80)

# List all CSV files.
# glob() yields entries in directory-listing order, which is not guaranteed to
# be the same on every platform; sorting keeps the processing order (and thus
# the report and summary table) reproducible.
csv_files = sorted(RAW_PATH.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files")

# Function to inspect a file in detail - IMPROVED VERSION
def inspect_file_details(file_path):
    """Build a detailed plain-text inspection report for one CSV file.

    The report covers shape, memory usage, per-column dtypes, candidate
    area/district and ID columns (found by substring match on the column
    name), missing values, a Long-vs-Wide guess based on the first candidate
    area column, the first three rows, and basic statistics for the first
    three numeric columns.

    Parameters
    ----------
    file_path : pathlib.Path
        CSV file to read with ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(report, df, area_col)`` where ``report`` is the text report,
        ``df`` is the loaded DataFrame and ``area_col`` is the first matching
        area/district column name (``None`` when no column matches).
        If anything fails, ``("ERROR reading <name>: <message>", None, None)``
        is returned instead of raising.
    """
    try:
        df = pd.read_csv(file_path, low_memory=False)  # Added low_memory for safety
        filename = file_path.name

        # nunique() scans the whole column, so compute it once per column and
        # reuse the result in every section below.
        unique_counts = {col: df[col].nunique() for col in df.columns}

        # Prepare output
        output_lines = [
            "="*80,
            f"FILE: {filename}",
            "="*80,
            f"Shape: {df.shape} (rows × columns)",
        ]

        # Memory usage
        memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        output_lines.append(f"Memory usage: {memory_mb:.2f} MB")

        # Columns with more details
        output_lines.append(f"\nCOLUMNS ({len(df.columns)} total):")
        for i, col in enumerate(df.columns, 1):
            col_str = f"'{col}'"
            dtype_str = f"{df[col].dtype}"

            # Show the unique count for text columns and low-cardinality columns
            if df[col].dtype == 'object' or unique_counts[col] < 20:
                dtype_str = f"{dtype_str} ({unique_counts[col]} unique)"

            output_lines.append(f"{i:3}. {col_str:40} {dtype_str}")

        # Find district/area columns by substring match on the column name.
        # NOTE(review): substring matching is permissive (any name containing
        # 'area', 'mun', 'dist', ... qualifies), so the first match is not
        # necessarily the real district column - confirm per file.
        area_keywords = ('district', 'dist', 'mun', 'area', 'vdc')
        area_cols = [
            col for col in df.columns
            if any(keyword in str(col).lower() for keyword in area_keywords)
        ]

        if area_cols:
            output_lines.append(f"\nAREA/DISTRICT COLUMNS ({len(area_cols)} found):")
            for col in area_cols:
                sample_vals = list(df[col].dropna().unique()[:3])
                output_lines.append(f"  '{col}': {unique_counts[col]} unique values")
                output_lines.append(f"    Sample: {sample_vals}")

                # Check for 'Total' or aggregate rows
                if df[col].dtype == 'object':
                    total_rows = df[col].str.contains('total|Total|TOTAL|Nepal|Province', na=False).sum()
                    if total_rows > 0:
                        output_lines.append(f"    Contains {total_rows} aggregate rows (Total/National/Province)")
        else:
            output_lines.append("\nWARNING: No district/area column found!")

        # Find ID columns.
        # NOTE(review): 'id' is matched as a substring, so any column name
        # that merely contains the letters "id" is listed here as well.
        id_cols = [col for col in df.columns if 'id' in str(col).lower() or 'code' in str(col).lower()]
        if id_cols:
            output_lines.append(f"\nID COLUMNS ({len(id_cols)} found):")
            for col in id_cols:
                output_lines.append(f"  '{col}': {df[col].dtype}, {unique_counts[col]} unique values")

        # Missing values analysis
        output_lines.append("\nMISSING VALUES ANALYSIS:")
        missing_per_col = df.isnull().sum()
        missing_total = missing_per_col.sum()
        total_cells = df.shape[0] * df.shape[1]
        # A header-only file has zero cells; report 0% instead of dividing by zero.
        missing_percentage = (missing_total / total_cells) * 100 if total_cells else 0.0

        output_lines.append(f"  Total missing cells: {missing_total:,}")
        output_lines.append(f"  Percentage missing: {missing_percentage:.1f}%")

        # Columns with missing values (a non-empty result implies len(df) > 0)
        missing_cols = missing_per_col[missing_per_col > 0]
        if len(missing_cols) > 0:
            output_lines.append(f"  Columns with missing values: {len(missing_cols)}")
            for col, count in missing_cols.nlargest(10).items():  # Top 10
                pct = count / len(df) * 100
                output_lines.append(f"    '{col}': {count} ({pct:.1f}%)")

        # Determine format (Long vs Wide) from the first candidate area column
        if area_cols:
            primary_area_col = area_cols[0]
            # value_counts() drops missing values, so it can be empty even
            # when the frame has rows.
            value_counts = df[primary_area_col].value_counts()

            output_lines.append("\nDATA STRUCTURE ANALYSIS:")
            if len(df) > 0 and len(value_counts) == 0:
                # Every value is missing: there is nothing to classify, and the
                # per-district averages below would divide by zero.
                output_lines.append(
                    f"  FORMAT: Unknown ('{primary_area_col}' has no non-missing values)"
                )
            elif len(value_counts) == len(df):
                output_lines.append("  FORMAT: Wide (1 row per district/area)")
                output_lines.append("  NOTE: May need to filter out aggregate rows (Total, Province)")
            else:
                output_lines.append("  FORMAT: Long (Multiple rows per district/area)")
                output_lines.append(f"  Average rows per district: {len(df)/len(value_counts):.1f}")
                output_lines.append(f"  Min rows per district: {value_counts.min()}")
                output_lines.append(f"  Max rows per district: {value_counts.max()}")

                # Identify what varies across rows (e.g., age groups, materials):
                # low-cardinality columns that are neither area nor ID columns.
                excluded = area_cols + id_cols
                categorical_cols = [
                    (col, unique_counts[col])
                    for col in df.columns
                    if col not in excluded and unique_counts[col] < 20
                ]

                if categorical_cols:
                    output_lines.append("  Likely breakdown categories:")
                    for col, count in categorical_cols[:5]:  # First 5 in column order
                        sample_vals = list(df[col].dropna().unique()[:3])
                        output_lines.append(f"    '{col}' ({count} categories): {sample_vals}")

        # Data sample
        output_lines.append("\nDATA SAMPLE (First 3 rows):")
        output_lines.append(df.head(3).to_string())

        # Summary statistics for numeric columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            output_lines.append("\nNUMERIC COLUMNS SUMMARY (first 3):")
            for col in numeric_cols[:3]:
                output_lines.append(f"  '{col}':")
                output_lines.append(f"    Min: {df[col].min():.2f}, Max: {df[col].max():.2f}")
                output_lines.append(f"    Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")
                if df[col].notna().sum() > 0:
                    # NaN != 0 evaluates to True, so treat missing cells as
                    # zero here to keep them out of the non-zero count.
                    non_zero = (df[col].fillna(0) != 0).sum()
                    output_lines.append(f"    Non-zero: {non_zero} rows")

        return "\n".join(output_lines), df, area_cols[0] if area_cols else None

    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the batch;
        # the caller receives the message in place of the report.
        error_msg = f"ERROR reading {file_path.name}: {str(e)}"
        return error_msg, None, None

# Per-file summary rows; turned into file_summary.csv after the loop
summary_data = []
# Area column detected for each readable file, keyed by filename, so later
# steps can reuse the detection instead of assuming a fixed column name
detected_area_cols = {}

# Files whose key facts are echoed to the screen as well as to the report
# (defined once here rather than rebuilt on every loop iteration)
key_files = {'wall_materials.csv', 'drinking_watersource.csv', 'cooking_fuel.csv',
             'toilet_facilities.csv', 'household_amenities.csv'}

report_path = PROCESSED_PATH / 'detailed_inspection.txt'
summary_path = PROCESSED_PATH / 'file_summary.csv'


def _layout(frame, column):
    """Return 'Wide' when every row has a distinct value in *column*, else 'Long'."""
    return 'Wide' if frame[column].nunique() == len(frame) else 'Long'


print("\nInspecting all files...")

with open(report_path, 'w', encoding='utf-8') as f:
    # Write header
    f.write("="*80 + "\n")
    f.write("DETAILED INSPECTION OF ALL CSV FILES\n")
    f.write("="*80 + "\n\n")

    # Process each file
    for i, csv_file in enumerate(csv_files, 1):
        print(f"Processing {i:2}/{len(csv_files)}: {csv_file.name}")
        inspection_output, df, district_col = inspect_file_details(csv_file)
        f.write(inspection_output + "\n\n")

        # Add to summary (df is None when the file could not be read)
        if df is not None:
            detected_area_cols[csv_file.name] = district_col
            summary_data.append({
                'Filename': csv_file.name,
                'Rows': df.shape[0],
                'Columns': df.shape[1],
                'District_Column': district_col if district_col else 'None',
                # NOTE: files without a detected area column are labelled 'Long'
                'Format': _layout(df, district_col) if district_col else 'Long',
                'Missing_%': (df.isnull().sum().sum() / (df.shape[0] * df.shape[1]) * 100),
                'Memory_MB': df.memory_usage(deep=True).sum() / 1024 / 1024
            })

        # Print key files to screen
        if csv_file.name in key_files:
            print("\n" + "="*60)
            print(f"KEY FILE: {csv_file.name}")
            print("="*60)
            if df is not None:
                print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
                print(f"District column: '{district_col}'" if district_col else "No district column")
                if district_col:
                    print(f"Unique areas: {df[district_col].nunique()}")
                    print(f"Format: {_layout(df, district_col)}")
                print()

# Load wall_materials.csv once; both the written and the printed
# recommendations below use the same frame.
wall_path = RAW_PATH / 'wall_materials.csv'
wall_file = wall_path  # both names were exposed by earlier versions of this cell
wall_df = None
wall_col = None
if wall_path.exists():
    wall_df = pd.read_csv(wall_path)
    # Prefer an explicit 'District' column; otherwise fall back to whatever
    # area column the inspection detected for this file.
    if 'District' in wall_df.columns:
        wall_col = 'District'
    else:
        candidate = detected_area_cols.get(wall_path.name)
        if candidate is not None and candidate in wall_df.columns:
            wall_col = candidate

# Create summary dataframe
summary_saved = False
if summary_data:
    summary_df = pd.DataFrame(summary_data)

    # Save summary to CSV
    summary_df.to_csv(summary_path, index=False)
    summary_saved = True

    # Also add to text file
    with open(report_path, 'a', encoding='utf-8') as f:
        f.write("\n" + "="*80 + "\n")
        f.write("SUMMARY TABLE FOR ALL FILES\n")
        f.write("="*80 + "\n\n")
        f.write(summary_df.to_string())

        # Recommendations
        f.write("\n\n" + "="*80 + "\n")
        f.write("RECOMMENDATIONS\n")
        f.write("="*80 + "\n\n")

        # Check for wall_materials format
        if wall_df is None:
            f.write("1. wall_materials.csv was not found in the raw data folder\n")
        elif wall_col is None:
            f.write("1. wall_materials.csv has no recognizable district/area column\n")
        elif _layout(wall_df, wall_col) == 'Wide':
            f.write("1. wall_materials.csv is in WIDE format\n")
            f.write(f"   → Can merge directly by {wall_col}\n")
        else:
            f.write("1. wall_materials.csv is in LONG format\n")
            f.write("   → Need to pivot to wide format before merging\n")

print(f"\n✓ Detailed inspection saved to: {report_path}")
# Only claim the summary exists when it was actually written
if summary_saved:
    print(f"✓ Summary table saved to: {summary_path}")
else:
    print("✗ Summary table not written: no CSV file could be read")

# Quick recommendations
print("\n" + "="*80)
print("QUICK RECOMMENDATIONS:")
print("="*80)

# Check wall_materials specifically
if wall_df is not None:
    print("\nwall_materials.csv analysis:")
    print(f"  Columns: {list(wall_df.columns)}")

    if wall_col is not None:
        unique_districts = wall_df[wall_col].nunique()
        total_rows = len(wall_df)

        print(f"  District column: '{wall_col}'")
        print(f"  Unique districts: {unique_districts}")
        print(f"  Total rows: {total_rows}")

        if total_rows > unique_districts:
            likely_categories = [
                col for col in wall_df.columns
                if col != wall_col and wall_df[col].nunique() < 20
            ]
            print("  FORMAT: LONG (multiple rows per district)")
            print(f"  Likely categories: {likely_categories}")
            print("\n  ACTION NEEDED: Pivot to wide format")
            print(f"  Example: df.pivot(index='{wall_col}', columns='Material_Type', values='Count')")
        else:
            print("  FORMAT: WIDE (1 row per district)")
            print("\n  ACTION: Can merge directly by district name")
    else:
        print("  No 'District' or other area column found!")
        print("  Check for alternative area column names")

print("\n" + "="*80)
print("SHARE THIS INFORMATION:")
print("="*80)
print("\nPlease share:")
print("1. The output file: data/processed/detailed_inspection.txt")
print("2. Specifically for wall_materials.csv:")
print("   - All column names")
print("   - Whether it's Long or Wide format")
print("   - What varies across rows (e.g., material types)")
print("\nThis will determine if we need to pivot the data.")

DETAILED DATA INSPECTION - COMPREHENSIVE ANALYSIS
Found 16 CSV files

Inspecting all files...
Processing  1/16: below_secondary_education.csv
Processing  2/16: children_living_arrangement.csv
Processing  3/16: cooking_fuel.csv

KEY FILE: cooking_fuel.csv
Rows: 90, Columns: 9
District column: 'AREA'
Unique areas: 90
Format: Wide

Processing  4/16: drinking_watersource.csv

KEY FILE: drinking_watersource.csv
Rows: 90, Columns: 11
District column: 'AREA'
Unique areas: 90
Format: Wide

Processing  5/16: educational_attainment.csv
Processing  6/16: educational_field_distribution.csv
Processing  7/16: floor_type.csv
Processing  8/16: foundation_type.csv
Processing  9/16: household_amenities.csv

KEY FILE: household_amenities.csv
Rows: 90, Columns: 19
District column: 'AREA_TYPE'
Unique areas: 5
Format: Long

Processing 10/16: housing_ownership.csv
Processing 11/16: lighting_source.csv
Processing 12/16: months_worked.csv
Processing 13/16: population_occupation.csv
Processing 14/16: roof_type.