In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from scipy import stats

In [7]:
class CreditProcessorQuick:
    """Small load -> clean -> feature-engineer -> export pipeline for a credit CSV.

    The working table is stored in ``self.df``. It stays ``None`` when loading
    failed, in which case every other method returns immediately without
    doing anything.
    """

    # Substituted for zero/missing denominators AND added to every denominator
    # when building ratio features (see ``_safe_ratio``).
    _EPSILON = 1e-6

    def __init__(self, filepath):
        """Read ``filepath`` with ``pandas.read_csv``.

        Load errors are reported with ``print`` instead of being raised, so
        callers should check ``self.df is not None`` before using the data.
        """
        self.df = None
        try:
            self.df = pd.read_csv(filepath, low_memory=False)
            print(f"Loaded data. Shape: {self.df.shape}")
        except FileNotFoundError:
            print(f"Error: File not found at '{filepath}'")
        except Exception as e:
            print(f"Error loading file: {e}")

    def clean_data(self, outlier_cols=('income', 'balance'), impute_strategy='mean', z_threshold=3.0):
        """Perform outlier removal and missing-value imputation.

        Parameters
        ----------
        outlier_cols : iterable of str, str, or None
            Columns screened for outliers. A single column name may be given
            as a plain string. Columns that are absent or not integer/float
            typed are skipped with a message.
        impute_strategy : str
            Passed to ``SimpleImputer(strategy=...)``.
        z_threshold : float
            Rows whose absolute z-score in a screened column is greater than
            or equal to this value are removed.

        Notes
        -----
        ``self.df`` is rebound to a new DataFrame when rows are removed.
        Columns are screened one after another, so each column's z-scores are
        computed on the rows that survived the previous columns.
        """
        if self.df is None:
            return
        if outlier_cols is None:
            outlier_cols = ()
        elif isinstance(outlier_cols, str):
            # A bare string would otherwise be iterated character by character.
            outlier_cols = [outlier_cols]

        # 1. Outlier detection (z-score)
        print("Detecting and removing outliers...")
        self._remove_outliers(outlier_cols, z_threshold)
        print(f"  After outlier removal. New shape: {self.df.shape}")

        # 2. Missing value imputation (numeric columns only)
        print(f"Treating missing values using '{impute_strategy}' strategy...")
        self._impute_numeric(impute_strategy)

    @staticmethod
    def _is_int_or_float(series):
        """Return True for any integer or float dtype (any width, nullable or not)."""
        return pd.api.types.is_integer_dtype(series) or pd.api.types.is_float_dtype(series)

    def _remove_outliers(self, columns, z_threshold):
        """Remove rows whose |z-score| in any of ``columns`` is >= ``z_threshold``."""
        for col in columns:
            if col not in self.df.columns or not self._is_int_or_float(self.df[col]):
                print(f"  Skipping outlier detection for '{col}': not found or not numeric.")
                continue

            values = self.df[col].to_numpy(dtype=float, na_value=np.nan)
            present = ~np.isnan(values)
            if not present.any():
                # Nothing to score in an all-missing column.
                continue

            # Z-scores are computed on the non-missing values only; rows with
            # a missing value in this column are never flagged.
            is_outlier = np.zeros(len(values), dtype=bool)
            is_outlier[present] = np.abs(stats.zscore(values[present])) >= z_threshold

            if is_outlier.any():
                # Positional boolean mask rather than drop-by-label: with a
                # non-unique index, dropping by label would also remove
                # non-outlier rows that share a label with an outlier.
                self.df = self.df.loc[~is_outlier].copy()

    def _impute_numeric(self, impute_strategy):
        """Fill missing values in numeric columns using ``SimpleImputer``."""
        numeric_cols = self.df.select_dtypes(include=np.number).columns
        if numeric_cols.empty:
            print("  No numeric columns found for imputation.")
            return

        missing_counts = self.df[numeric_cols].isnull().sum().to_numpy()
        if missing_counts.sum() == 0:
            print("  No missing numeric values found.")
            return

        has_missing = missing_counts > 0
        if impute_strategy == 'constant':
            # A constant fill needs no column statistic, so all-missing
            # columns are handed to the imputer as before.
            entirely_missing = np.zeros(len(numeric_cols), dtype=bool)
        else:
            # A column with no observed value has no statistic to impute
            # from; SimpleImputer leaves such columns out of its output, which
            # would make the assignment below fail on a shape mismatch.
            entirely_missing = missing_counts == len(self.df)

        if entirely_missing.any():
            print(f"  Skipping all-missing columns: {list(numeric_cols[entirely_missing])}")

        # Only columns that actually contain gaps are rewritten, so complete
        # columns keep their original dtype.
        target_cols = numeric_cols[has_missing & ~entirely_missing]
        if len(target_cols) == 0:
            print("  No imputable numeric columns found.")
            return

        imputer = SimpleImputer(strategy=impute_strategy)
        self.df[target_cols] = imputer.fit_transform(self.df[target_cols])
        print("  Missing values imputed.")

    @classmethod
    def _safe_ratio(cls, numerator, denominator):
        """Divide two Series with the denominator guarded against zero/missing.

        Zero and missing denominators are replaced by ``_EPSILON``, and
        ``_EPSILON`` is then added to every denominator.
        """
        guarded = denominator.replace(0, np.nan).fillna(cls._EPSILON) + cls._EPSILON
        return numerator / guarded

    def engineer_features(self):
        """Create 'balance_to_income_ratio' and 'credit_utilization'.

        Each feature is added only when both of its source columns exist;
        otherwise a message is printed and the feature is skipped.
        """
        if self.df is None:
            return

        print("Engineering features...")
        # (new column, numerator column, denominator column)
        feature_specs = (
            ('balance_to_income_ratio', 'balance', 'income'),
            ('credit_utilization', 'used_credit', 'credit_limit'),
        )
        for feature, numerator, denominator in feature_specs:
            if numerator in self.df.columns and denominator in self.df.columns:
                self.df[feature] = self._safe_ratio(self.df[numerator], self.df[denominator])
            else:
                print(f"  Skipping '{feature}': required columns missing.")
        print("  Feature engineering complete.")

    def export_data(self, output_filepath, file_format='csv'):
        """Export the processed DataFrame to ``output_filepath``.

        ``file_format`` is case-insensitive and must be 'csv' or 'xlsx'.
        Failures are printed rather than raised.
        """
        if self.df is None:
            return

        fmt = str(file_format).lower()
        print(f"Exporting data to '{output_filepath}' as {fmt.upper()}...")

        writers = {'csv': self.df.to_csv, 'xlsx': self.df.to_excel}
        writer = writers.get(fmt)
        if writer is None:
            print(f"Error: Unsupported format '{file_format}'. Use 'csv' or 'xlsx'.")
            return

        try:
            # Excel export needs an engine such as openpyxl to be installed.
            writer(output_filepath, index=False)
            print("  Export successful.")
        except Exception as e:
            # The openpyxl hint is only relevant to Excel exports.
            hint = " (Ensure 'openpyxl' is installed for .xlsx)" if fmt == 'xlsx' else ""
            print(f"Error exporting data: {e}.{hint}")


# --- File locations -------------------------------------------------------
input_csv = '/content/creditcard.csv'
output_excel = 'processed_credit_data_quick.xlsx'
output_csv = 'processed_credit_data_quick.csv'

# --- Build the processor (the CSV is read inside the constructor) ---------
processor = CreditProcessorQuick(input_csv)

# --- Run the pipeline only when the CSV was actually loaded ---------------
if processor.df is not None:
    # Mean imputation is used here for speed.
    processor.clean_data(['income', 'balance'], 'mean')
    processor.engineer_features()

    # Write the result as XLSX (CSV is the faster option; XLSX suits visualization).
    processor.export_data(output_excel, 'xlsx')

print("\nQuick processing script finished.")

Loaded data. Shape: (150002, 18)
Detecting and removing outliers...
  Skipping outlier detection for 'income': not found or not numeric.
  Skipping outlier detection for 'balance': not found or not numeric.
  After outlier removal. New shape: (150002, 18)
Treating missing values using 'mean' strategy...
  Missing values imputed.
Engineering features...
  Skipping 'balance_to_income_ratio': required columns missing.
  Skipping 'credit_utilization': required columns missing.
  Feature engineering complete.
Exporting data to 'processed_credit_data_quick.xlsx' as XLSX...
  Export successful.

Quick processing script finished.


In [None]:
# Colab-only cell: mounts Google Drive at '/content/drive' via google.colab.
# NOTE(review): this cell sits after the processing cell, which reads
# '/content/creditcard.csv' (not a path under the mount point) — confirm
# whether the mount is needed and, if so, move it ahead of the data load.
from google.colab import drive
drive.mount('/content/drive')