In [None]:
# src/data/preprocess.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import logging
import os

# Root logging setup: INFO and above, each record stamped with time, logger
# name and level. The module-level logger below is shared by every function.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def load_data(filepath):
    """Read the CSV file at ``filepath`` into a DataFrame.

    The path is logged before reading and the resulting shape afterwards.
    Any exception raised while reading is logged at error level and then
    re-raised unchanged.
    """
    logger.info(f"Loading data from {filepath}")
    try:
        frame = pd.read_csv(filepath)
    except Exception as exc:
        logger.error(f"Error loading data: {exc}")
        raise
    else:
        logger.info(f"Data loaded successfully with shape {frame.shape}")
        return frame

def check_missing_values(data):
    """Count null entries per column, log the outcome, and return the counts.

    A warning listing only the affected columns is logged when any null is
    present; otherwise an info message is logged. The returned Series holds
    the per-column null count for every column.
    """
    null_counts = data.isnull().sum()
    has_missing = null_counts.sum() > 0

    if not has_missing:
        logger.info("No missing values found")
        return null_counts

    logger.warning(f"Missing values found:\n{null_counts[null_counts > 0]}")
    return null_counts

def handle_outliers(data, columns, method='iqr', threshold=1.5):
    """Cap outliers in the given columns and return a modified copy.

    With the ``'iqr'`` method, values below ``Q1 - threshold * IQR`` are
    raised to that bound and values above ``Q3 + threshold * IQR`` are
    lowered to it, column by column. The input frame is left untouched.

    Args:
        data: DataFrame containing the columns to process.
        columns: Iterable of column names whose values should be capped.
        method: Outlier rule to apply; only ``'iqr'`` is supported.
        threshold: Multiplier applied to the IQR when computing the bounds.

    Returns:
        A copy of ``data`` with the listed columns capped.

    Raises:
        ValueError: If ``method`` is not a supported rule. Previously an
            unknown method silently returned the data unmodified.
    """
    # Fail loudly instead of handing back untouched data for a mistyped method.
    if method != 'iqr':
        raise ValueError(
            f"Unsupported outlier method {method!r}; expected 'iqr'"
        )

    data_clean = data.copy()

    for col in columns:
        values = data_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        # Compute the masks once; they drive both the log count and the cap.
        below = values < lower_bound
        above = values > upper_bound

        # Count via the boolean mask rather than building a filtered frame.
        n_outliers = int((below | above).sum())
        if n_outliers:
            logger.info(f"Found {n_outliers} outliers in column {col}")

        # Cap outliers to the nearest bound; in-range values pass through.
        data_clean[col] = np.where(
            below,
            lower_bound,
            np.where(above, upper_bound, values),
        )

    return data_clean

def preprocess_data(data, target_col='default', test_size=0.2, random_state=42):
    """Turn a raw frame into scaled train/test splits for modeling.

    Steps, in order: log missing values, separate the target from the
    features, cap IQR outliers in the float64/int64 feature columns, make a
    stratified train/test split, then fit a ``StandardScaler`` on the
    training features and apply it to both splits.

    Args:
        data: Input DataFrame containing the features and the target column.
        target_col: Name of the target column.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)``
        where ``scaler`` is the fitted ``StandardScaler``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``data``.
    """
    logger.info("Starting data preprocessing")

    # Check for missing values (logged only; nothing is imputed here)
    check_missing_values(data)

    # Look the target up first so a missing target column still raises KeyError.
    y = data[target_col]

    # 'customer_id' is an identifier rather than a feature. Datasets without
    # it used to fail with KeyError; errors='ignore' tolerates its absence
    # (target_col is already known to exist at this point).
    X = data.drop(columns=[target_col, 'customer_id'], errors='ignore')

    logger.info(f"Features shape: {X.shape}, Target shape: {y.shape}")

    # Handle outliers in numerical columns
    # NOTE(review): the caps are derived from the full dataset before the
    # split, so test rows influence the bounds -- confirm this is acceptable.
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    X = handle_outliers(X, numerical_cols)

    # Split data, stratifying on the target to keep class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    logger.info(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

    # Scale features: fit on the training split only, then reuse for test.
    # NOTE(review): every column of X is handed to the scaler; non-numeric
    # columns are not encoded anywhere in this function.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    logger.info("Data preprocessing completed")

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

def save_processed_data(X_train, X_test, y_train, y_test, scaler, output_dir):
    """Write the train/test splits and the scaler under ``output_dir``.

    The directory is created if needed. The feature arrays are wrapped in
    DataFrames and written as ``X_train.csv`` / ``X_test.csv``; the targets
    go to ``y_train.csv`` / ``y_test.csv``; the scaler is dumped with joblib
    to ``scaler.pkl``. All CSVs are written without the index.
    """
    logger.info(f"Saving processed data to {output_dir}")
    os.makedirs(output_dir, exist_ok=True)

    # (object, destination) pairs, written in this order
    csv_outputs = (
        (pd.DataFrame(X_train), f"{output_dir}/X_train.csv"),
        (pd.DataFrame(X_test), f"{output_dir}/X_test.csv"),
        (y_train, f"{output_dir}/y_train.csv"),
        (y_test, f"{output_dir}/y_test.csv"),
    )
    for table, destination in csv_outputs:
        table.to_csv(destination, index=False)

    # Persist the fitted scaler next to the data
    import joblib
    joblib.dump(scaler, f"{output_dir}/scaler.pkl")

    logger.info("All processed data saved successfully")

if __name__ == "__main__":
    # Example usage: load the raw loan CSV, preprocess it, and persist the
    # resulting splits together with the fitted scaler.
    raw_frame = load_data("data/raw/Loan_Data.csv")
    processed = preprocess_data(raw_frame)
    # processed is (X_train, X_test, y_train, y_test, scaler)
    save_processed_data(*processed, "data/processed")

Now, let's create a script to perform exploratory data analysis (EDA):

# src/data/eda.py

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging

# Logging for the EDA script: INFO and above, with timestamp, logger name
# and level on every record.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def load_data(filepath):
    """Load the CSV at ``filepath`` and return it as a DataFrame.

    Logs the path up front and the loaded shape on success. If reading
    fails, the exception is logged at error level and propagated as-is.
    """
    logger.info(f"Loading data from {filepath}")
    try:
        loaded = pd.read_csv(filepath)
    except Exception as err:
        logger.error(f"Error loading data: {err}")
        raise
    else:
        logger.info(f"Data loaded successfully with shape {loaded.shape}")
        return loaded

def explore_data(data, output_dir):
    """Run exploratory data analysis on ``data`` and save the results.

    Files written under ``output_dir`` (created if missing):
      - ``basic_statistics.csv``: transposed ``describe()`` output
      - ``target_distribution.png``: bar chart of ``'default'`` counts
      - ``correlation_matrix.png``: heatmap over float64/int64 columns
      - ``dist_<col>.png``: histogram per feature, split by ``'default'``
      - ``boxplot_<col>.png``: box plot per feature, grouped by ``'default'``
      - ``eda_summary.txt``: shape, dtypes, null counts, target distribution

    Feature columns are all columns except ``'customer_id'`` and
    ``'default'``. Each figure is closed right after it is saved so that
    open figures do not pile up across the per-feature loops.

    Args:
        data: DataFrame to analyse; must contain a ``'default'`` column.
        output_dir: Directory that receives the generated files.

    Raises:
        KeyError: If ``data`` has no ``'default'`` column.
    """
    logger.info("Starting exploratory data analysis")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Basic statistics
    logger.info("Generating basic statistics")
    stats = data.describe().T
    stats.to_csv(f"{output_dir}/basic_statistics.csv")

    # Check target distribution
    logger.info("Analyzing target distribution")
    fig = plt.figure(figsize=(8, 6))
    default_counts = data['default'].value_counts()
    sns.barplot(x=default_counts.index, y=default_counts.values)
    plt.title('Distribution of Default vs Non-Default')
    plt.xlabel('Default Status (1=Default, 0=Non-Default)')
    plt.ylabel('Count')
    plt.savefig(f"{output_dir}/target_distribution.png")
    plt.close(fig)  # release the figure once it is on disk

    # Correlation matrix
    logger.info("Generating correlation matrix")
    fig = plt.figure(figsize=(12, 10))
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    corr = numeric_data.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.savefig(f"{output_dir}/correlation_matrix.png")
    plt.close(fig)

    # Feature distributions by default status
    logger.info("Analyzing feature distributions by default status")
    feature_cols = [col for col in data.columns if col not in ['customer_id', 'default']]

    for col in feature_cols:
        fig = plt.figure(figsize=(10, 6))
        sns.histplot(data=data, x=col, hue='default', kde=True, element='step')
        plt.title(f'Distribution of {col} by Default Status')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/dist_{col}.png")
        plt.close(fig)  # one figure per feature: close it to avoid a leak

    # Box plots for outlier detection
    logger.info("Creating box plots for outlier detection")
    for col in feature_cols:
        fig = plt.figure(figsize=(10, 6))
        sns.boxplot(data=data, x='default', y=col)
        plt.title(f'Box Plot of {col} by Default Status')
        plt.tight_layout()
        plt.savefig(f"{output_dir}/boxplot_{col}.png")
        plt.close(fig)

    # Generate summary report
    logger.info("Generating data summary report")
    with open(f"{output_dir}/eda_summary.txt", 'w') as f:
        f.write(f"Dataset Shape: {data.shape}\n\n")
        f.write(f"Data Types:\n{data.dtypes}\n\n")
        f.write(f"Missing Values:\n{data.isnull().sum()}\n\n")
        f.write(f"Target Distribution:\n{data['default'].value_counts()}\n")
        f.write(f"Target Distribution (%):\n{data['default'].value_counts(normalize=True) * 100}\n\n")

    logger.info("EDA completed successfully")

if __name__ == "__main__":
    # Script entry point: read the raw loan CSV and write every EDA artifact
    # into the notebooks results folder.
    loan_frame = load_data("data/raw/Loan_Data.csv")
    explore_data(loan_frame, "notebooks/eda_results")
