# Data Cleaning and Preprocessing for Economic Indicators

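This notebook loads raw economic indicator data, fills missing values (forward-fill, then back-fill), removes outliers with the IQR rule, and standardizes the numeric features to zero mean and unit variance. The cleaned data is saved to `../data/processed/cleaned_economic_indicators.csv` for further analysis.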

In [6]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Load Raw Data (update path as needed)
# NOTE(review): index_col=0 turns the first CSV column into the index. In the
# recorded output of this cell that column is `realtime_start`, which is the
# same on every displayed row, while the observation dates sit in the `date`
# column -- confirm whether `date` should be the index instead.
df = pd.read_csv('../data/raw/USALOLITOAASTSAM.csv', index_col=0, parse_dates=True)
print(f"Loaded data shape: {df.shape}")
display(df.head())

# 3. Handle Missing Values
# Report only the columns that actually contain missing entries.
missing = df.isnull().sum()
print("Missing values per column:")
print(missing[missing > 0])

# Forward-fill first, then back-fill so gaps at the very start are filled too.
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) spelling,
# which emits a FutureWarning on current pandas; the filled values are the same.
df = df.ffill().bfill()
print("Missing values after fill:", df.isnull().sum().sum())

# 4. Outlier Removal (IQR method)
def remove_outliers_iqr(df, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of any listed column.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` (both inclusive). All fences are computed on the
    unfiltered input, so the result does not depend on the order of
    ``columns``. A row is kept only if it lies inside the fences of every
    listed column; rows with a missing value in a listed column are dropped
    because the range comparison is False for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of column labels
        Columns to check. An empty iterable returns an unfiltered copy.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, original index preserved.
    """
    # Positional boolean mask: avoids label alignment, so a non-unique index
    # is handled correctly.
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        within = df[col].between(q1 - factor * iqr, q3 + factor * iqr)
        # na_value=False keeps "missing -> dropped" for nullable dtypes too.
        keep &= within.to_numpy(dtype=bool, na_value=False)
    return df.loc[keep].copy()

# Outlier removal and scaling apply to the numeric columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
df_clean = remove_outliers_iqr(df, numeric_cols)
print(f"Shape after outlier removal: {df_clean.shape}")

# 5. Scaling (StandardScaler) - fit on the cleaned numeric columns and
# wrap the resulting array back into a frame with the same index and labels
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean.loc[:, numeric_cols])
df_scaled_numeric = pd.DataFrame(scaled_values, index=df_clean.index, columns=numeric_cols)

# Re-attach the untouched non-numeric columns to the right of the scaled ones
non_numeric_cols = df_clean.drop(columns=numeric_cols)
df_scaled = pd.concat([df_scaled_numeric, non_numeric_cols], axis=1)

# Sanity check: per-column mean and standard deviation after scaling
print("Data scaled. Mean:", df_scaled_numeric.mean().round(2))
print("Std:", df_scaled_numeric.std().round(2))

# 6. Save Cleaned Data (create the output directory on first run)
os.makedirs('../data/processed', exist_ok=True)
df_scaled.to_csv('../data/processed/cleaned_economic_indicators.csv')
print("Cleaned and scaled data saved to ../data/processed/cleaned_economic_indicators.csv")

Loaded data shape: (701, 3)


Unnamed: 0_level_0,realtime_end,date,value
realtime_start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-07-29,2025-07-29,1967-02-01,98.590266
2025-07-29,2025-07-29,1967-03-01,98.606358
2025-07-29,2025-07-29,1967-04-01,98.745387
2025-07-29,2025-07-29,1967-05-01,98.993997
2025-07-29,2025-07-29,1967-06-01,99.304794


Missing values per column:
Series([], dtype: int64)
Missing values after fill: 0
Shape after outlier removal: (670, 3)
Data scaled. Mean: value    0.0
dtype: float64
Std: value    1.0
dtype: float64
Cleaned and scaled data saved to ../data/processed/cleaned_economic_indicators.csv


  df = df.fillna(method='ffill').fillna(method='bfill')
