In [6]:
import pandas as pd
from pathlib import Path

# Define paths
data_dir = Path("~/OneDrive/Desktop/sem 5/Hackathon/DPL_25/Trade-Resilience-and-Economic-Networks/data_raw").expanduser()

# First define the file lists
export_files = [
    data_dir / "2000-2012_Export.csv",
    data_dir / "2013-2024_Export.csv",
]

import_files = [
    data_dir / "2000-2012_Import.csv",
    data_dir / "2013-2024_Import.csv",
]


def _read_csv_with_fallback(path, encodings=("utf-8", "windows-1252", "latin1")):
    """Read a CSV file, trying each encoding in turn until one decodes.

    Args:
        path: Location of the CSV file.
        encodings: Candidate encodings, most specific first. The last entry is
            the final attempt; any decoding error it raises propagates.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the first encoding
        that decodes the file.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If the last encoding cannot decode the file.
    """
    # latin1 maps every possible byte to a character, so it can never raise
    # UnicodeDecodeError. It therefore has to come *after* windows-1252;
    # in the opposite order the windows-1252 attempt would be unreachable.
    *candidates, last_resort = encodings
    for encoding in candidates:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    return pd.read_csv(path, encoding=last_resort)


# Then load every file through the shared helper
export_dfs = [_read_csv_with_fallback(file) for file in export_files]
import_dfs = [_read_csv_with_fallback(file) for file in import_files]

# Rest of your code...
# Report how the isAggregate flag is distributed in every loaded frame
loaded_frames = export_dfs + import_dfs
for position, frame in enumerate(loaded_frames, start=1):
    print(f"\nDataFrame {position} - isAggregate values:")
    if 'isAggregate' not in frame.columns:
        print("No isAggregate column in this dataframe")
        continue
    flags = frame['isAggregate']
    print("Unique values:", flags.unique())
    print("Value counts:\n", flags.value_counts(dropna=False))

# Replace missing isAggregate entries with False, directly on the loaded frames
for frame in loaded_frames:
    if 'isAggregate' not in frame.columns:
        continue
    na_count = frame['isAggregate'].isna().sum()
    if na_count > 0:
        print(f"\nFilling {na_count} NA values in isAggregate with False")
        frame['isAggregate'] = frame['isAggregate'].fillna(False)
        print("Updated value counts:\n", frame['isAggregate'].value_counts(dropna=False))



# Normalise column labels so later lookups can rely on lower-case names
def standardize_columns(df):
    """Strip surrounding whitespace from the column labels and lower-case them.

    The labels are rewritten on ``df`` itself; the same object is returned
    so the call can be used inside a comprehension.
    """
    trimmed_labels = df.columns.str.strip()
    df.columns = trimmed_labels.str.lower()
    return df

# Apply the label normalisation to every export and import frame
export_dfs = list(map(standardize_columns, export_dfs))
import_dfs = list(map(standardize_columns, import_dfs))

# Missing value preprocessing function
def preprocess_trade_data(df):
    """Return an imputed copy of a trade DataFrame; ``df`` itself is not modified.

    Column lookups use lower-case names (``isaggregate``, ``cifvalue``, ...).
    Processing order:

    1. Print the per-column missing counts.
    2. Fill missing ``isaggregate`` values with False.
    3. Drop columns that are entirely missing.
    4. Fill missing ``cifvalue`` from ``fobvalue`` where ``fobvalue`` is present.
       This runs before the generic imputation below; afterwards there would
       be no missing ``cifvalue`` left for it to act on.
    5. Impute by share of missing values: above 70% -> 0 / "Unknown";
       20-70% -> median / mode; up to 20% -> forward/back fill for
       ``refyear``/``refmonth``/``period``, mean / mode otherwise.
    6. Add a 0/1 ``<col>_imputed`` column for every surviving column that had
       missing values in the input.
    7. Print whether any missing values remain.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame with the steps above applied. An empty input is
        returned as an unchanged copy.
    """
    # Make a copy to avoid modifying the original
    df_clean = df.copy()

    if df_clean.empty:
        # With zero rows every column is vacuously "100% missing" and would
        # be dropped in step 3, so there is nothing sensible to do here.
        print("Warning: Empty DataFrame received")
        return df_clean

    # Missingness of the untouched input; drives the *_imputed flags in step 6
    originally_missing = df.isnull()

    # 1. Initial diagnostics
    missing_counts = originally_missing.sum()
    print("\nOriginal missing values:")
    print(missing_counts[missing_counts > 0])

    # 2. Special check for isaggregate column
    if 'isaggregate' in df_clean.columns:
        print("\nisaggregate column value counts:")
        print(df_clean['isaggregate'].value_counts(dropna=False))

        # Handle any potential missing values in isaggregate (if they exist)
        if df_clean['isaggregate'].isnull().any():
            print(f"\nFilling {df_clean['isaggregate'].isnull().sum()} missing values in isaggregate with FALSE")
            df_clean['isaggregate'] = df_clean['isaggregate'].fillna(False)

    # 3. Drop columns with 100% missing values
    cols_to_drop = [col for col in df_clean.columns if df_clean[col].isnull().all()]
    if cols_to_drop:
        print(f"\nDropping columns with 100% missing values: {cols_to_drop}")
        df_clean.drop(columns=cols_to_drop, inplace=True)

    # 4. Special treatment for trade value columns: prefer the row's own
    #    fobvalue over a generic 0 / median / mean for a missing cifvalue
    if 'cifvalue' in df_clean.columns and 'fobvalue' in df_clean.columns:
        fillable = df_clean['cifvalue'].isnull() & df_clean['fobvalue'].notnull()
        print(f"\nFilling {fillable.sum()} missing cifvalue with fobvalue")
        df_clean.loc[fillable, 'cifvalue'] = df_clean.loc[fillable, 'fobvalue']

    # 5. Generic imputation. Each column is filled independently of the
    #    others, so the missing share can be computed once up front.
    missing_share = df_clean.isnull().mean()

    # 5a. High missingness (>70%)
    for col in missing_share[missing_share > 0.7].index:
        if pd.api.types.is_numeric_dtype(df_clean[col]):
            print(f"Filling high-missing numeric {col} with 0")
            df_clean[col] = df_clean[col].fillna(0)
        else:
            print(f"Filling high-missing categorical {col} with 'Unknown'")
            df_clean[col] = df_clean[col].fillna("Unknown")

    # 5b. Moderate missingness (20-70%)
    for col in missing_share[(missing_share > 0.2) & (missing_share <= 0.7)].index:
        if pd.api.types.is_numeric_dtype(df_clean[col]):
            median_val = df_clean[col].median()
            print(f"Filling moderate-missing numeric {col} with median: {median_val}")
            df_clean[col] = df_clean[col].fillna(median_val)
        else:
            mode_val = df_clean[col].mode()[0]
            print(f"Filling moderate-missing categorical {col} with mode: {mode_val}")
            df_clean[col] = df_clean[col].fillna(mode_val)

    # 5c. Low missingness (<=20%)
    for col in missing_share[(missing_share > 0) & (missing_share <= 0.2)].index:
        if col in ['refyear', 'refmonth', 'period']:
            print(f"Forward filling time-related {col}")
            # A forward fill cannot reach a gap in the first row(s), so
            # back-fill whatever is left.
            df_clean[col] = df_clean[col].ffill().bfill()
        elif pd.api.types.is_numeric_dtype(df_clean[col]):
            mean_val = df_clean[col].mean()
            print(f"Filling low-missing numeric {col} with mean: {mean_val}")
            df_clean[col] = df_clean[col].fillna(mean_val)
        else:
            mode_val = df_clean[col].mode()[0]
            print(f"Filling low-missing categorical {col} with mode: {mode_val}")
            df_clean[col] = df_clean[col].fillna(mode_val)

    # 6. Add imputation flags (iterate over a snapshot: the loop adds columns)
    for col in list(df_clean.columns):
        if originally_missing[col].any():  # If there were originally missing values
            df_clean[f"{col}_imputed"] = originally_missing[col].astype(int)

    # 7. Final check
    remaining_by_column = df_clean.isnull().sum()
    remaining_missing = remaining_by_column.sum()
    if remaining_missing > 0:
        print(f"\nWarning: {remaining_missing} missing values remain after preprocessing")
        print(remaining_by_column[remaining_by_column > 0])
    else:
        print("\nAll missing values successfully handled")

    return df_clean

# Run the missing-value pipeline over every frame, logging progress as we go
wide_rule = "=" * 80
side_rule = "=" * 40

print("\n" + wide_rule)
print("BEGINNING DATA PREPROCESSING")
print(wide_rule + "\n")

export_dfs_clean = []
for number, raw_frame in enumerate(export_dfs, start=1):
    print(f"\n{side_rule} Processing Export DataFrame {number} {side_rule}")
    cleaned_frame = preprocess_trade_data(raw_frame)
    export_dfs_clean.append(cleaned_frame)
    print(f"\nExport DataFrame {number} shape after cleaning: {cleaned_frame.shape}")

import_dfs_clean = []
for number, raw_frame in enumerate(import_dfs, start=1):
    print(f"\n{side_rule} Processing Import DataFrame {number} {side_rule}")
    cleaned_frame = preprocess_trade_data(raw_frame)
    import_dfs_clean.append(cleaned_frame)
    print(f"\nImport DataFrame {number} shape after cleaning: {cleaned_frame.shape}")

# Summarise each cleaned frame: size, leftover gaps, flag columns, a preview
print("\n" + wide_rule)
print("PREPROCESSING COMPLETE - FINAL SUMMARY")
print(wide_rule + "\n")

all_dfs = export_dfs_clean + import_dfs_clean
for number, cleaned_frame in enumerate(all_dfs, start=1):
    flag_columns = [name for name in cleaned_frame.columns if '_imputed' in name]
    print(f"\nDataFrame {number} Final Status:")
    print(f"Shape: {cleaned_frame.shape}")
    print("Missing values:", cleaned_frame.isnull().sum().sum())
    print(f"New columns added: {flag_columns}")
    print(f"First 3 rows:\n{cleaned_frame.head(3)}")
    
# Spellings accepted as True when a flag column holds text
_TRUE_TOKENS = frozenset({"true", "t", "1", "1.0", "yes", "y"})


def _coerce_bool_series(series):
    """Return ``series`` as plain booleans, mapping missing entries to False.

    A bare ``astype(bool)`` is not usable here: NaN is truthy and so is every
    non-empty string, which turns blanks and the text "false" into True.
    """
    if series.dtype == bool:
        return series

    def _as_bool(value):
        if pd.isna(value):
            return False
        if isinstance(value, str):
            return value.strip().lower() in _TRUE_TOKENS
        return bool(value)

    return series.map(_as_bool).astype(bool)


def convert_data_types(df):
    """Convert date, year and flag columns of ``df`` to proper dtypes.

    Only columns that are present are touched. ``df`` is modified in place
    and also returned.

    - ``refperiodid`` is parsed as a ``%Y%m%d`` date; unparseable values
      become NaT.
    - ``refyear`` becomes nullable ``Int64``; unparseable values become <NA>.
    - The flag columns listed in ``bool_cols`` become ``bool``. Missing
      entries become False, and text is True only for the spellings in
      ``_TRUE_TOKENS``.

    Args:
        df: DataFrame with lower-case column names.

    Returns:
        The same DataFrame object.
    """
    # Convert date-related columns
    if 'refperiodid' in df.columns:
        df['refperiodid'] = pd.to_datetime(df['refperiodid'], format='%Y%m%d', errors='coerce')

    # Convert year to integer
    if 'refyear' in df.columns:
        df['refyear'] = pd.to_numeric(df['refyear'], errors='coerce').astype('Int64')

    # Convert boolean columns
    bool_cols = ['isoriginalclassification', 'isleaf', 'isqtyestimated',
                 'isaltqtyestimated', 'isnetwgtestimated', 'isgrosswgtestimated',
                 'isreported', 'isaggregate']
    for col in bool_cols:
        if col in df.columns:
            df[col] = _coerce_bool_series(df[col])

    return df


# Apply the dtype conversion to every export and import frame.
# NOTE(review): this runs on the frames in export_dfs / import_dfs; the
# imputed copies in export_dfs_clean / import_dfs_clean are not used from
# here on - confirm that discarding them is intended.
export_dfs = list(map(convert_data_types, export_dfs))
import_dfs = list(map(convert_data_types, import_dfs))


def clean_specific_columns(df):
    """Tidy the text of the descriptive and flow-code columns, where present.

    ``reporterdesc`` and ``partnerdesc`` are stripped and title-cased,
    ``flowcode`` is upper-cased and stripped, and ``cmddesc`` is stripped.
    ``df`` is modified in place and also returned.
    """
    def _strip_and_title(text):
        return text.str.strip().str.title()

    # Column name -> transformation, applied in this order
    recipes = (
        ('reporterdesc', _strip_and_title),
        ('partnerdesc', _strip_and_title),
        ('flowcode', lambda text: text.str.upper().str.strip()),
        ('cmddesc', lambda text: text.str.strip()),
    )

    for column, tidy in recipes:
        if column in df.columns:
            df[column] = tidy(df[column])

    return df



# Tidy the text columns of every frame
export_dfs = list(map(clean_specific_columns, export_dfs))
import_dfs = list(map(clean_specific_columns, import_dfs))

# Stack the per-period files row-wise, renumbering the index from zero
exports = pd.concat(export_dfs, ignore_index=True)
imports = pd.concat(import_dfs, ignore_index=True)

# Label each side so the direction survives the merge below
exports['trade_type'] = 'export'
imports['trade_type'] = 'import'

# One combined table: export rows first, then import rows
trade_data = pd.concat([exports, imports], ignore_index=True)

def final_cleanup(df):
    """Handle all remaining missing values after concatenation.

    ``df`` is modified in place (entirely empty columns are dropped from it)
    and the same object is returned.

    Fill rules, applied only to columns that exist and contain missing values:

    - quantities, weights and ``fobvalue``: 0
    - ``flowcode``: 'X' / 'M' from the words "export" / "import" in
      ``flowdesc``; otherwise 'M' when ``trade_type`` is 'import'; otherwise 'X'
    - ``cifvalue``: the row's ``fobvalue`` when present, else 0
    - ``primaryvalue``: ``fobvalue`` for flow code 'X', else ``cifvalue``
    - ISO codes: 'XX'; ``cmdcode``: 'TOTAL'; ``isaggregate``/``isreported``: False

    Afterwards ``flowcode`` is upper-cased and stripped (empty -> 'X') and the
    numeric value columns are coerced to numbers, with anything unparseable
    set to 0.

    Args:
        df: Combined trade DataFrame with lower-case column names.

    Returns:
        The cleaned DataFrame (the same object as ``df``).

    Raises:
        ValueError: If ``df`` is None.
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    # Input validation
    if df is None:
        raise ValueError("Input DataFrame is None")
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")
    if df.empty:
        print("Warning: Empty DataFrame received")
        return df

    # 1. Drop completely empty columns
    cols_to_drop = [col for col in df.columns if df[col].isnull().all()]
    if cols_to_drop:
        print(f"Dropping completely empty columns: {cols_to_drop}")
        df.drop(columns=cols_to_drop, inplace=True)

    # 2. Row-wise fallbacks. ``row.get`` is used throughout so that a column
    #    missing from the frame yields a default instead of a KeyError.
    def _infer_flowcode(row):
        flow_text = str(row.get('flowdesc', '')).lower()
        if 'export' in flow_text:
            return 'X'
        if 'import' in flow_text:
            return 'M'
        # No usable description: fall back on the trade_type label when the
        # frame carries one, so import rows are not marked as exports.
        if str(row.get('trade_type', '')).strip().lower() == 'import':
            return 'M'
        return 'X'

    def _cif_from_fob(row):
        fob = row.get('fobvalue')
        return fob if pd.notna(fob) else 0

    def _primary_from_components(row):
        if str(row.get('flowcode', '')).strip().upper() == 'X':
            return row.get('fobvalue', 0)
        return row.get('cifvalue', 0)

    # Rules run in insertion order. flowcode, cifvalue and fobvalue come
    # before primaryvalue because its fallback reads all three.
    fill_rules = {
        'qty': 0,
        'altqty': 0,
        'netwgt': 0,
        'grosswgt': 0,
        'flowcode': _infer_flowcode,
        'cifvalue': _cif_from_fob,
        'fobvalue': 0,
        'primaryvalue': _primary_from_components,
        'reporteriso': 'XX',
        'partneriso': 'XX',
        'partner2iso': 'XX',
        'cmdcode': 'TOTAL',
        'isaggregate': False,
        'isreported': False,
    }

    # 3. Apply fill rules
    for col, fill_value in fill_rules.items():
        if col in df.columns and df[col].isnull().any():
            try:
                if callable(fill_value):
                    missing = df[col].isnull()
                    df.loc[missing, col] = df[missing].apply(fill_value, axis=1)
                else:
                    df[col] = df[col].fillna(fill_value)
                print(f"Filled missing values in {col}")
            except Exception as e:
                # Best effort: report the column and carry on with the rest
                print(f"Error filling {col}: {str(e)}")

    # 4. Final cleanup
    if 'flowcode' in df.columns:
        df['flowcode'] = df['flowcode'].str.upper().str.strip().replace({'': 'X'})

    numeric_cols = ['qty', 'altqty', 'netwgt', 'grosswgt', 'cifvalue', 'fobvalue', 'primaryvalue']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    return df

# Apply to the final combined DataFrame
trade_data = final_cleanup(trade_data)

# Check for consistency in combined data
print("Final data shape:", trade_data.shape)
print("Missing values:\n", trade_data.isnull().sum())

# Check time period coverage
print("Year range:", trade_data['refyear'].min(), "-", trade_data['refyear'].max())

# Check unique countries
print("Unique reporters:", trade_data['reporteriso'].nunique())
print("Unique partners:", trade_data['partneriso'].nunique())

# Write next to the raw data, in <project>/data_clean. The directory that is
# created must be the one the file is written into, otherwise to_csv fails
# when it does not exist yet.
output_dir = data_dir.parent / "data_clean"
output_dir.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist

# Define the output file path (built from data_dir, so "~" is already expanded)
output_file = output_dir / "trade_clean.csv"

# Save the cleaned DataFrame to CSV
trade_data.to_csv(output_file, index=False, encoding='utf-8')


# Optional: Verify the saved file by reading a sample
print("\nVerifying the saved file by reading first 3 rows:")
print(pd.read_csv(output_file, nrows=3))



DataFrame 1 - isAggregate values:
Unique values: [False  True]
Value counts:
 isAggregate
False    33286
True      8652
Name: count, dtype: int64

DataFrame 2 - isAggregate values:
Unique values: [False  True]
Value counts:
 isAggregate
True     33520
False     8356
Name: count, dtype: int64

DataFrame 3 - isAggregate values:
Unique values: [False  True]
Value counts:
 isAggregate
False    38050
True      9323
Name: count, dtype: int64

DataFrame 4 - isAggregate values:
Unique values: [False  True]
Value counts:
 isAggregate
True     37066
False     9245
Name: count, dtype: int64

BEGINNING DATA PREPROCESSING



Original missing values:
qtyunitabbr       41938
qty               33264
altqtyunitabbr    41938
altqty            33286
netwgt            33689
grosswgt          33615
cifvalue          36723
dtype: int64

isaggregate column value counts:
isaggregate
False    33286
True      8652
Name: count, dtype: int64

Dropping columns with 100% missing values: ['qtyunitabbr', 'altqtyunitabbr']

Current working directory: C:\Users\sreea\OneDrive\Desktop\sem 5\Hackathon\DPL_25\Trade-Resilience-and-Economic-Networks\notebooks
