In [1]:
import pandas as pd
import os

data_folder = 'Data'

# List of files in this category
standard_files = [
    't_bills.csv', 'seasonal_worker_remittance.csv', 'REER.csv',
    'quaterly_gdp_2015.csv', 'policy_rate.csv', 'm2_broad_money.csv',
    'kibor_kibid.csv', 'inflation_base_2015.csv', 'inflation_base_2007.csv',
    'gdp_domestic_2005.csv', 'foreign_invest_sectors.csv',
    'foreign_invest_countires.csv', 'exchange_rate.csv',
    'consumer_confidence_survey.csv', 'country_wise_remittance.csv',
    'borrow_loans.csv', 'LSM_QIM_2015.csv', 'LSM_QIM_2005.csv',
    'gold_foreign_exchange_reserves.csv'
]

for filename in standard_files:
    file_path = os.path.join(data_folder, filename)
    try:
        # Use header=0 (default) and specify quotechar
        df = pd.read_csv(file_path, quotechar='"', header=0)
        print(f"Loaded {filename} successfully.")
        # print(df.head(2)) # Optional: print first few rows to verify
    except Exception as e:
        print(f"Error loading {filename}: {e}")

Loaded t_bills.csv successfully.
Loaded seasonal_worker_remittance.csv successfully.
Loaded REER.csv successfully.
Loaded quaterly_gdp_2015.csv successfully.
Loaded policy_rate.csv successfully.
Loaded m2_broad_money.csv successfully.
Loaded kibor_kibid.csv successfully.
Loaded inflation_base_2015.csv successfully.
Loaded inflation_base_2007.csv successfully.
Loaded gdp_domestic_2005.csv successfully.
Loaded foreign_invest_sectors.csv successfully.
Loaded foreign_invest_countires.csv successfully.
Loaded exchange_rate.csv successfully.
Loaded consumer_confidence_survey.csv successfully.
Loaded country_wise_remittance.csv successfully.
Loaded borrow_loans.csv successfully.
Loaded LSM_QIM_2015.csv successfully.
Loaded LSM_QIM_2005.csv successfully.
Loaded gold_foreign_exchange_reserves.csv successfully.


  df = pd.read_csv(file_path, quotechar='"', header=0)


In [3]:
# List of files in this category
multi_header_files = [
    'transport_and_communications.csv', 'trade_and_payments.csv',
    'public_debt.csv', 'population,_labor_force_and_employment.csv',
    'money_and_credit.csv', 'manufacturing_and_mining.csv',
    'inflation.csv', 'health_and_nutrition.csv', 'growth_and_investment.csv',
    'fiscal_development.csv', 'energy.csv', 'education.csv',
    'economic_and_social_indicators.csv',
    'capital_markets_and_corporate_sector.csv', 'agriculture.csv'
]

for filename in multi_header_files:
    file_path = os.path.join(data_folder, filename)
    try:
        # Use header=0 (default)
        df = pd.read_csv(file_path, header=0)
        print(f"Loaded {filename} successfully.")
        # print(df.head(2)) # Optional: print first few rows to verify
    except Exception as e:
        print(f"Error loading {filename}: {e}")

Loaded transport_and_communications.csv successfully.
Loaded trade_and_payments.csv successfully.
Loaded public_debt.csv successfully.
Loaded population,_labor_force_and_employment.csv successfully.
Loaded money_and_credit.csv successfully.
Loaded manufacturing_and_mining.csv successfully.
Loaded inflation.csv successfully.
Loaded health_and_nutrition.csv successfully.
Loaded growth_and_investment.csv successfully.
Loaded fiscal_development.csv successfully.
Loaded energy.csv successfully.
Loaded education.csv successfully.
Loaded economic_and_social_indicators.csv successfully.
Loaded capital_markets_and_corporate_sector.csv successfully.
Loaded agriculture.csv successfully.


In [5]:
# List of files in this category
inflation_files = [
    'usa_inflation.csv', 'uk_inflation.csv', 'uae_inflation.csv',
    'spain_inflation.csv', 'pakistan_inflation.csv',
    'netherlands_inflation.csv', 'italy_inflation.csv',
    'germany_inflation.csv', 'china_inflation.csv',
    'bangladesh_inflation.csv', 'afghanistan_inflation.csv'
]

for filename in inflation_files:
    file_path = os.path.join(data_folder, filename)
    try:
        # Use header=0 (default)
        df = pd.read_csv(file_path, header=0)
        print(f"Loaded {filename} successfully.")
        # print(df.head(2)) # Optional: print first few rows to verify
    except Exception as e:
        print(f"Error loading {filename}: {e}")

Loaded usa_inflation.csv successfully.
Loaded uk_inflation.csv successfully.
Loaded uae_inflation.csv successfully.
Loaded spain_inflation.csv successfully.
Loaded pakistan_inflation.csv successfully.
Loaded netherlands_inflation.csv successfully.
Loaded italy_inflation.csv successfully.
Loaded germany_inflation.csv successfully.
Loaded china_inflation.csv successfully.
Loaded bangladesh_inflation.csv successfully.
Loaded afghanistan_inflation.csv successfully.


In [7]:
# CMO_historical_data_monthly.csv: Header seems to be on row 5 (index 4)
file_path_monthly = os.path.join(data_folder, 'CMO_historical_data_monthly.csv')
try:
    df_monthly = pd.read_csv(file_path_monthly, header=4) # header is 0-indexed
    print(f"Loaded CMO_historical_data_monthly.csv successfully.")
    # print(df_monthly.head(2)) # Optional: print first few rows to verify
except Exception as e:
    print(f"Error loading CMO_historical_data_monthly.csv: {e}")

# CMO_historical_data_indices.csv: Header seems to be on row 6 (index 5)
file_path_indices = os.path.join(data_folder, 'CMO_historical_data_indices.csv')
try:
    df_indices = pd.read_csv(file_path_indices, header=5) # header is 0-indexed
    print(f"Loaded CMO_historical_data_indices.csv successfully.")
    # print(df_indices.head(2)) # Optional: print first few rows to verify
except Exception as e:
    print(f"Error loading CMO_historical_data_indices.csv: {e}")

Loaded CMO_historical_data_monthly.csv successfully.
Loaded CMO_historical_data_indices.csv successfully.


In [15]:
import pandas as pd
import os
import sys # Import sys module for redirecting stdout
import io  # Import io for capturing df.info()
import re # Import regex for identifying year columns
import numpy as np # Import numpy for numeric types check

# Define the path to your data folder
data_folder = 'Data'
# Define the output file name
output_filename = 'data_overview_output_detailed.txt'

# --- File Categorization (Based on previous analysis) ---

# Common strings representing missing values
common_na_values = ['..', 'no data', 'NaN', 'NA', 'na', '-', ' ', '', '<NA>'] # Added <NA>

# 1. Standard CSV Files (Comma-separated, Quoted Fields, Header=0)
# These have a consistent structure with 'Observation Date' and 'Observation Value'
standard_files = [
    't_bills.csv', 'seasonal_worker_remittance.csv', 'REER.csv',
    'quaterly_gdp_2015.csv', 'policy_rate.csv', 'm2_broad_money.csv',
    'kibor_kibid.csv', 'inflation_base_2015.csv', 'inflation_base_2007.csv',
    'gdp_domestic_2005.csv', 'foreign_invest_sectors.csv',
    'foreign_invest_countires.csv', 'exchange_rate.csv',
    'consumer_confidence_survey.csv', 'country_wise_remittance.csv',
    'borrow_loans.csv', 'LSM_QIM_2015.csv', 'LSM_QIM_2005.csv',
    'gold_foreign_exchange_reserves.csv'
]
# Columns to parse as dates in standard files
standard_date_cols = ['Observation Date']

# 2. Wide Time-Series Files (Header=0, Years as Columns)
# These typically have 'Sectors', 'Sub-Sectors-Level1/2' and then year columns
wide_files = [
    'transport_and_communications.csv', 'trade_and_payments.csv',
    'public_debt.csv', 'population,_labor_force_and_employment.csv',
    'money_and_credit.csv', 'manufacturing_and_mining.csv',
    'inflation.csv', 'health_and_nutrition.csv', 'growth_and_investment.csv',
    'fiscal_development.csv', 'energy.csv', 'education.csv',
    'economic_and_social_indicators.csv',
    'capital_markets_and_corporate_sector.csv', 'agriculture.csv',
    # Adding inflation files here as they also have years as columns
    'usa_inflation.csv', 'uk_inflation.csv', 'uae_inflation.csv',
    'spain_inflation.csv', 'pakistan_inflation.csv',
    'netherlands_inflation.csv', 'italy_inflation.csv',
    'germany_inflation.csv', 'china_inflation.csv',
    'bangladesh_inflation.csv', 'afghanistan_inflation.csv'
]

# 3. World Bank Commodity Data (Metadata Above Header)
cmo_files = {
    'CMO_historical_data_monthly.csv': {'header': 4, 'date_col': 0}, # Header row index, date col index
    'CMO_historical_data_indices.csv': {'header': 5, 'date_col': 0}  # Header row index, date col index
}

# 4. Empty Files (To be skipped)
empty_files = [
    'social_protection.csv',
    'information_technology_and_telecommunication.csv'
]

# --- Function to identify year-like columns ---
def get_year_columns(df):
    year_cols = []
    # Regex to match 4 digits possibly separated by hyphen/slash (e.g., 1999, 2000-01)
    # Updated regex to be more flexible with column names
    year_pattern = re.compile(r'^(?:\d{4}(?:[-\/]\d{2,4})?|\d{4})$')
    for col in df.columns:
        # Ensure column name is treated as a string
        col_str = str(col)
        if year_pattern.match(col_str):
            year_cols.append(col) # Keep original column name (could be int or str)
    return year_cols

# --- Redirect Output to File ---
original_stdout = sys.stdout # Save a reference to the original standard output

print(f"Attempting to write detailed overview to: {output_filename}")

try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        sys.stdout = f # Change the standard output to the file we created.

        # --- Load DataFrames ---
        dataframes = {} # Dictionary to store the loaded dataframes

        print("--- Loading Standard CSV Files ---")
        for filename in standard_files:
            file_path = os.path.join(data_folder, filename)
            try:
                df = pd.read_csv(
                    file_path,
                    quotechar='"',
                    header=0,
                    na_values=common_na_values,
                    parse_dates=standard_date_cols,
                    infer_datetime_format=True,
                    dayfirst=True, # Assuming day comes first in dates like '26-Mar-2025'
                    keep_default_na=True # Keep default NaN recognition
                )
                # Attempt to convert Observation Value to numeric after load
                if 'Observation Value' in df.columns:
                   df['Observation Value'] = pd.to_numeric(df['Observation Value'], errors='coerce')

                dataframes[filename] = df
                print(f"Successfully loaded: {filename}")
            except FileNotFoundError:
                print(f"Error: File not found at {file_path}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")

        print("\n--- Loading Wide Time-Series Files ---")
        for filename in wide_files:
            file_path = os.path.join(data_folder, filename)
            try:
                df = pd.read_csv(
                    file_path,
                    header=0,
                    na_values=common_na_values,
                    keep_default_na=True
                )
                # Identify and convert year columns to numeric after load
                year_columns = get_year_columns(df)
                if year_columns:
                    print(f"Found potential year columns in {filename}: {year_columns}")
                    for col in year_columns:
                         # Check if column still exists (might be removed if all NA)
                        if col in df.columns:
                            # Convert column to numeric, coercing errors to NaN
                            df[col] = pd.to_numeric(df[col], errors='coerce')
                else:
                     print(f"Could not identify year columns automatically in {filename}")

                dataframes[filename] = df
                print(f"Successfully loaded: {filename}")
            except FileNotFoundError:
                print(f"Error: File not found at {file_path}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")

        print("\n--- Loading CMO Files ---")
        for filename, params in cmo_files.items():
            file_path = os.path.join(data_folder, filename)
            try:
                # Specify the date column index for parsing
                date_col_list = [params['date_col']] if params['date_col'] is not None else None

                df = pd.read_csv(
                    file_path,
                    header=params['header'],
                    na_values=common_na_values,
                    parse_dates=date_col_list,
                    infer_datetime_format=True,
                    keep_default_na=True
                 )
                # Convert all other columns to numeric (assuming they should be)
                # Handle potential case where date_col is None or invalid index
                date_col_actual = df.columns[params['date_col']] if (params['date_col'] is not None and params['date_col'] < len(df.columns)) else None
                data_cols = df.columns.drop(date_col_actual) if date_col_actual else df.columns

                for col in data_cols:
                     if col in df.columns: # Check if column exists
                        df[col] = pd.to_numeric(df[col], errors='coerce')

                dataframes[filename] = df
                print(f"Successfully loaded: {filename}")
            except FileNotFoundError:
                print(f"Error: File not found at {file_path}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")

        print(f"\n--- Skipping Empty Files: {', '.join(empty_files)} ---")

        print(f"\nTotal DataFrames loaded: {len(dataframes)}")

        # --- Displaying Detailed Data Overview ---

        print("\n\n--- Displaying Detailed DataFrame Overviews ---")

        # Set display options for better viewing in the output file
        pd.set_option('display.max_rows', 200) # Show even more rows
        pd.set_option('display.max_columns', 60) # Show more columns
        pd.set_option('display.width', 1500) # Wider display for less wrapping

        for filename, df in dataframes.items():
            print(f"\n\n{'='*80}")
            print(f"   Dataset: {filename}")
            print(f"{'='*80}")

            print("\n--- First 20 Rows (head) ---")
            # Use to_string() to prevent truncation in the output file
            print(df.head(20).to_string())

            print("\n--- Last 10 Rows (tail) ---")
            print(df.tail(10).to_string())

            print("\n--- DataFrame Info (info) ---")
            # Use buffer to capture info output as string for printing
            buffer = io.StringIO()
            df.info(buf=buffer, verbose=True) # Use verbose=True for more details
            info_str = buffer.getvalue()
            print(info_str)

            print("\n--- Missing Value Counts (isnull().sum()) ---")
            print(df.isnull().sum())

            print("\n--- Unique Value Counts (nunique()) ---")
            print(df.nunique())

            print("\n--- Descriptive Statistics (describe include='all') ---")
            # Use include='all' without the problematic argument
            try:
                 # Attempt to describe all columns, handling potential errors for mixed types
                 # Explicitly convert datetime columns to object for describe() if needed
                 df_desc = df.copy()
                 for col in df_desc.select_dtypes(include=[np.datetime64]):
                     df_desc[col] = df_desc[col].astype(str) # Convert datetime to string for describe
                 print(df_desc.describe(include='all'))
            except Exception as e:
                print(f"Could not generate full describe() for {filename}: {e}")
                # Fallback to describing only numeric columns
                try:
                    print("\n--- Descriptive Statistics (describe - numeric only) ---")
                    print(df.describe()) # Default is numeric only
                except Exception as e_num:
                     print(f"Could not generate numeric describe() for {filename}: {e_num}")

            print("\n--- Value Counts for Categorical Columns (Top 50 unique values shown if <= 50 total unique) ---")
            # Select columns with 'object' dtype or 'category' dtype
            cat_cols = df.select_dtypes(include=['object', 'category']).columns
            for col in cat_cols:
                num_unique = df[col].nunique()
                if num_unique <= 50: # Only print value_counts if not excessively many unique values
                    print(f"\nValue Counts for Column: '{col}' ({num_unique} unique values)")
                    print(df[col].value_counts().to_string())
                else:
                    print(f"\nSkipping value counts for Column: '{col}' ({num_unique} unique values > 50)")


        print("\n\n--- End of Detailed Data Overview ---")

# --- Restore Original Output ---
finally: # Ensure stdout is reset even if errors occur
    sys.stdout = original_stdout

print(f"\nDetailed data overview has been saved to '{output_filename}'")
# Optional: Print a few lines from the file to confirm writing
try:
    with open(output_filename, 'r', encoding='utf-8') as f_check:
        print("\nFirst few lines of the output file:")
        for _ in range(10): # Print more lines for confirmation
            line = f_check.readline()
            if not line:
                break
            print(line, end='')
except FileNotFoundError:
    print(f"Could not read back the output file '{output_filename}' for confirmation.")
except Exception as e:
    print(f"Error reading back output file: {e}")


Attempting to write detailed overview to: data_overview_output_detailed.txt


  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(



Detailed data overview has been saved to 'data_overview_output_detailed.txt'

First few lines of the output file:
--- Loading Standard CSV Files ---
Successfully loaded: t_bills.csv
Successfully loaded: seasonal_worker_remittance.csv
Successfully loaded: REER.csv
Successfully loaded: quaterly_gdp_2015.csv
Successfully loaded: policy_rate.csv
Successfully loaded: m2_broad_money.csv
Successfully loaded: kibor_kibid.csv
Successfully loaded: inflation_base_2015.csv
Successfully loaded: inflation_base_2007.csv
