In [1]:
# Cell 1: Imports & load
import re
import numpy as np
import pandas as pd
# from scipy.stats import mode # No longer needed for categorical mode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer

# Load the dataset
df = pd.read_csv('train.csv')
y = df.pop('Credit_Score')  # target variable

# --- Custom Transformers ---

# Convert Credit History Age String to Months
class CreditHistoryAgeToMonths(BaseEstimator, TransformerMixin):
    def __init__(self, column='Credit_History_Age', new_column_name='Credit_History_Age_in_months'):
        self.column = column
        self.new_column_name = new_column_name

    def fit(self, X, y=None):
        return self # No fitting needed

    def transform(self, X):
        X = X.copy()
        if self.column not in X.columns:
            # print(f"Warning: Column '{self.column}' not found in CreditHistoryAgeToMonths.") # Optional
            return X

        def to_months(x):
            if pd.isna(x):
                return np.nan
            try:
                s = str(x).strip()
                # More flexible regex to capture numbers, allowing for variations
                match = re.search(r'(\d+)\s*Years?\s*(?:and)?\s*(\d+)?\s*Months?', s, re.IGNORECASE)
                if match:
                    years = int(match.group(1)) if match.group(1) else 0
                    months = int(match.group(2)) if match.group(2) else 0
                    return years * 12 + months
                # Handle cases like "X Years" without months specified
                match_years_only = re.search(r'(\d+)\s*Years?', s, re.IGNORECASE)
                if match_years_only:
                    years = int(match_years_only.group(1))
                    return years * 12
                # Handle cases like "Y Months" without years specified (less common but possible)
                match_months_only = re.search(r'(\d+)\s*Months?', s, re.IGNORECASE)
                if match_months_only:
                    months = int(match_months_only.group(1))
                    return months

            except Exception: # Catch potential errors during conversion
                pass # Return NaN if any error occurs
            return np.nan # Return NaN if no match or error

        # Use .loc for safer assignment
        X.loc[:, self.new_column_name] = X[self.column].apply(to_months)
        return X


# Clean numeric columns: Remove '-' and '_' then convert to numeric
class CleanNumeric(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.columns:
            # Ensure column exists and is not entirely NaN/empty before processing
            if c in X.columns and not X[c].dropna().empty:
                # Convert to string first to apply string methods, handle potential non-string types
                X[c] = (
                    X[c].astype(str)
                        .str.replace(r'[-_]', '', regex=True)
                        .pipe(pd.to_numeric, errors='coerce') # Convert to numeric, invalid parsing becomes NaN
                )
            elif c in X.columns:
                # If column exists but is empty/all NaN, ensure it's numeric type if possible
                X[c] = pd.to_numeric(X[c], errors='coerce')
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{c}' not found in CleanNumeric.")
        return X

# Forward/backward fill by Customer_ID for static fields
class StaticFieldFiller(BaseEstimator, TransformerMixin):
    def __init__(self, columns, group_col='Customer_ID'):
        self.columns = columns
        self.group_col = group_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Check if group column exists
        if self.group_col not in X.columns:
            # print(f"Warning: Group column '{self.group_col}' not found in StaticFieldFiller. Skipping.") # Optional
            return X

        cols_present = [col for col in self.columns if col in X.columns]
        if cols_present:
            X.loc[:, cols_present] = X.groupby(self.group_col)[cols_present].transform(lambda g: g.ffill().bfill())
        # else: # Optional warning if no target columns found
        #      print(f"Warning: No target columns found in StaticFieldFiller.")
        return X

# Fix Num_of_Loan using mode + IQR clipping per group
class FixLoanOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, loan_col="Num_of_Loan", group_col="Type_of_Loan"):
        self.loan_col = loan_col
        self.group_col = group_col
        # Store maps after fitting
        self.mode_map_ = None
        self.low_map_ = None
        self.high_map_ = None

    def fit(self, X, y=None):
        # Calculate maps during fit
        if self.loan_col in X.columns and self.group_col in X.columns:
            # Ensure loan_col is numeric before calculations
            s_numeric = pd.to_numeric(X[self.loan_col], errors='coerce')
            tmp = X[self.group_col].fillna("__MISSING__") # Handle NaNs in group column

            # Calculate mode, Q1, Q3 per group
            # Use .agg with a lambda that handles empty groups
            self.mode_map_  = s_numeric.groupby(tmp).agg(lambda grp: grp.mode().iat[0] if not grp.mode().empty else np.nan)
            q1_map          = s_numeric.groupby(tmp).quantile(0.25)
            q3_map          = s_numeric.groupby(tmp).quantile(0.75)

            # Calculate IQR bounds, handle potential NaNs from quantiles
            iqr_map = q3_map - q1_map
            self.low_map_   = q1_map - 1.5 * iqr_map
            self.high_map_  = q3_map + 1.5 * iqr_map

        # else: # Optional warning if columns not found
        #     print(f"Warning: Columns '{self.loan_col}' or '{self.group_col}' not found during fit in FixLoanOutliers.")
        return self

    def transform(self, X):
        X = X.copy()
        # Check if fit was successful and columns exist
        if self.mode_map_ is None or self.loan_col not in X.columns or self.group_col not in X.columns:
            # print(f"Warning: FixLoanOutliers transform skipped - columns missing or fit failed.") # Optional
            return X # Return original X if columns are missing or fit failed

        tmp = X[self.group_col].fillna("__MISSING__")
        s = pd.to_numeric(X[self.loan_col], errors='coerce') # Ensure numeric for transform too

        def cap_fill(grp):
            name = grp.name
            # Get fitted values, default to global values or NaN if group not seen
            m = self.mode_map_.get(name, self.mode_map_.get("__MISSING__", np.nan)) # Fallback to __MISSING__ group mode or NaN
            l = self.low_map_.get(name, -np.inf)
            h = self.high_map_.get(name, np.inf)

            # Fill NaNs with the mode *before* clipping
            grp_filled = grp.fillna(m)
            # Clip values
            grp_clipped = grp_filled.clip(lower=l, upper=h)
            # Fill again in case clipping introduced NaNs (unlikely with inf bounds) or original fill value was NaN
            grp_final = grp_clipped.fillna(m)

            return grp_final

        # Apply the cap_fill function to each group
        # Use .loc for safer assignment
        X.loc[:, self.loan_col] = s.groupby(tmp, group_keys=False).apply(cap_fill)
        return X

# Clean Category Strings
class CategoryCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.columns:
            if c in X.columns:
                # Ensure column is string type first
                X[c] = X[c].astype(str)
                # Apply cleaning steps
                X[c] = (
                    X[c]
                        .str.replace(r'[^A-Za-z\s]', '', regex=True) # Keep only letters and spaces
                        .str.strip()
                        .str.replace(r'\s+', '_', regex=True) # Replace spaces with underscore
                        .str.lower()
                        .replace(r'^_+$', np.nan, regex=True) # Handle cases that become only underscores
                        .replace(r'^\s*$', np.nan, regex=True) # Replace empty/whitespace-only with NaN
                        .replace('nan', np.nan) # Replace string 'nan' with NaN
                )
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{c}' not found in CategoryCleaner.")
        return X

# Impute Categorical using Local Mode (includes rare values)
class LocalModeCatImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, window=5):
        self.columns = columns
        self.window = window
        # Store global modes as fallback
        self.global_modes_ = {}

    def fit(self, X, y=None):
        # Fit global modes for fallback
        for col in self.columns:
            if col in X.columns:
                # Calculate mode on non-NaN values using pandas mode
                mode_val = X[col].dropna().mode()
                self.global_modes_[col] = mode_val.iat[0] if not mode_val.empty else np.nan
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{col}' not found during fit in LocalModeCatImputer.")
        return self

    def transform(self, X):
        X = X.copy()
        n = len(X)
        for col in self.columns:
            if col not in X.columns or col not in self.global_modes_:
                # print(f"Warning: Column '{col}' not found or not fitted in LocalModeCatImputer. Skipping.") # Optional
                continue

            # Ensure NaNs are consistent (use pd.isna) and handle string 'nan'
            vals = X[col].copy().replace('nan', np.nan)

            # Identify rare values (consider if freq=1 is too strict)
            # Calculate counts on non-NaN values
            counts = vals.value_counts(dropna=True)
            # Define rare based on a threshold (e.g., count <= 1)
            rare = set(counts[counts <= 1].index)

            # Find indices needing imputation (NaN or rare)
            indices_to_impute = X.index[vals.isna() | vals.isin(rare)]

            for i in indices_to_impute:
                lo = max(0, i - self.window)
                hi = min(n, i + self.window + 1)

                # Get window data using .loc to handle potential index gaps
                window_indices = X.index[lo:hi].drop(i, errors='ignore')
                # Get non-NaN values from the window
                w = X.loc[window_indices, col].dropna()

                impute_value = self.global_modes_.get(col, np.nan) # Default to global mode

                if not w.empty:
                    # Calculate mode of the window using pandas mode
                    mode_val = w.mode()
                    if not mode_val.empty: # Check if mode calculation returned anything
                        impute_value = mode_val.iat[0] # Use local mode if available

                # Use .loc for assignment
                X.loc[i, col] = impute_value

        return X
class LabelEncodeColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.encoders_ = {}  # Store fitted encoders
        self.mode_map_ = {}  # Store mode per column for NA filling

    def fit(self, X, y=None):
        for col in self.columns:
            if col in X.columns:
                le = LabelEncoder()
                # Compute mode for the column to fill NaNs
                col_mode = X[col].dropna().mode()
                self.mode_map_[col] = str(col_mode.iloc[0]) if not col_mode.empty else 'UNKNOWN'
                
                # Fit encoder on unique non-NA values (as strings)
                unique_vals = X[col].dropna().astype(str).unique()
                self.encoders_[col] = le.fit(unique_vals)
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            if col in X.columns and col in self.encoders_:
                le = self.encoders_[col]
                # Fill NA with the mode value
                fill_val = self.mode_map_.get(col, 'UNKNOWN')
                filled_col = X[col].fillna(fill_val).astype(str)

                known_mask = filled_col.isin(le.classes_)
                encoded_series = pd.Series(index=X.index, dtype=float)

                encoded_series.loc[known_mask] = le.transform(filled_col[known_mask])
                encoded_series.loc[~known_mask] = -1  # Assign -1 to unseen values

                X.loc[:, col] = encoded_series
        return X

# Impute Numeric using Local Mode (includes outliers)
class LocalModeNumImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, window=5):
        self.columns = columns
        self.window = window
        # Store bounds and global modes from fit
        self.bounds_ = {}
        self.global_modes_ = {}

    def fit(self, X, y=None):
        for col in self.columns:
            if col in X.columns:
                # Ensure column is numeric for calculations
                s = pd.to_numeric(X[col], errors='coerce')
                if not s.dropna().empty:
                    q1, q3 = s.quantile([.25, .75])
                    iqr = q3 - q1
                    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
                    self.bounds_[col] = (low, high)
                    # Calculate global mode on non-NaN values using pandas mode
                    mode_val = s.dropna().mode()
                    self.global_modes_[col] = mode_val.iat[0] if not mode_val.empty else np.nan
                else: # Handle empty columns
                    self.bounds_[col] = (-np.inf, np.inf)
                    self.global_modes_[col] = np.nan
            # else: # Optional warning if column not found
            #      print(f"Warning: Column '{col}' not found during fit in LocalModeNumImputer.")
        return self

    def transform(self, X):
        X = X.copy()
        n = len(X)
        for col in self.columns:
            if col not in X.columns or col not in self.bounds_:
                # print(f"Warning: Column '{col}' not found or not fitted in LocalModeNumImputer. Skipping.") # Optional
                continue

            # Ensure column is numeric for processing
            s = pd.to_numeric(X[col], errors='coerce').copy()
            low, high = self.bounds_[col]

            # Find indices needing imputation (NaN or outside bounds)
            indices_to_impute = X.index[s.isna() | (s < low) | (s > high)]

            for i in indices_to_impute:
                lo = max(0, i - self.window)
                hi = min(n, i + self.window + 1)

                # Get window data using .loc, ensure numeric, and drop NaNs
                window_indices = X.index[lo:hi].drop(i, errors='ignore')
                w = pd.to_numeric(X.loc[window_indices, col], errors='coerce').dropna()

                impute_value = self.global_modes_.get(col, np.nan) # Default to global mode

                if not w.empty:
                    # Calculate mode of the window using pandas mode
                    mode_val = w.mode()
                    if not mode_val.empty: # Check if mode calculation returned anything
                        impute_value = mode_val.iat[0] # Use local mode if available

                # Use .loc for assignment
                X.loc[i, col] = impute_value

            # Final check: Ensure column remains numeric after imputation
            X.loc[:, col] = pd.to_numeric(X[col], errors='coerce')

        return X

# Drop Specified Columns
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Ensure columns to drop actually exist
        cols_to_drop = [col for col in self.columns if col in X.columns]
        return X.drop(columns=cols_to_drop, errors='ignore')




  df = pd.read_csv('train.csv')


In [2]:
# --- Pipeline Definition ---

# Define column groups based on pipeline flow
initial_numeric_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance"
]
# Static columns to fill using ffill/bfill per customer
# Ensure these are handled appropriately by cleaning/imputation steps afterwards
static_cols = ["Age","Occupation","Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card","Interest_Rate"]

# Categorical columns needing string cleaning
cat_clean_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Categorical columns to impute using local mode (AFTER cleaning)
cat_impute_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Categorical columns to label encode (AFTER cleaning and categorical imputation)
cat_encode_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Numeric columns to impute using local mode (AFTER cleaning, filling, and conversion)
# Include the newly created numeric credit history column and other numeric columns
num_impute_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance", 'Credit_History_Age_in_months' 
]


# Columns to finally drop (IDs, original text fields replaced by converted/encoded ones)
cols_to_drop = ['Name','Type_of_Loan','ID','SSN', 'Credit_History_Age', 'Customer_ID'] # Drop original history col

# Define the preprocessing pipeline
# Order matters!
full_preprocessor = Pipeline([
    # 1) Convert Credit History Age string to months (numeric)
    ('credit_age_months', CreditHistoryAgeToMonths(column='Credit_History_Age',
        new_column_name='Credit_History_Age_in_months')),

    # 2) Clean initial numeric columns (removes symbols, coerces to numeric)
    ('clean_numeric',     CleanNumeric(columns=initial_numeric_cols)),

    # 3) Forward/backward fill static fields per customer
    ('fill_static',       StaticFieldFiller(columns=static_cols, group_col='Customer_ID')),

    # 4) Fix Num_of_Loan outliers and NaNs using group-wise mode/IQR
    ('fix_loan',          FixLoanOutliers(loan_col="Num_of_Loan", group_col="Type_of_Loan")),

    # 5) Clean categorical strings (removes symbols, standardizes format)
    ('clean_cats',        CategoryCleaner(columns=cat_clean_cols)),

    # 6) Impute categorical NaNs and rare values using local mode (AFTER cleaning)
    ('impute_cat_mode',   LocalModeCatImputer(columns=cat_impute_cols)),

    # 7) Label encode your cleaned and imputed categoricals
    ('label_encode',      LabelEncodeColumns(columns=cat_encode_cols)),

    # 8) Drop original text/ID columns and the original credit history column
    ('drop_cols',         DropColumns(columns=cols_to_drop)),

    # 9) Impute numeric NaNs and outliers using local mode (AFTER cleaning, conversion, static fill, loan fix)
    ('impute_num_mode',   LocalModeNumImputer(columns=num_impute_cols)),

    # 10) Fill any remaining NaNs via KNN imputation (cat columns are now numeric after label encoding)
    #('knn_impute', KNNImputer(n_neighbors=5, weights='uniform')),

    # 11) Scale numeric features
    ('scaler',    StandardScaler()),
    # ('minmax_scaler', MinMaxScaler()), # Example if you prefer MinMaxScaler

    # 12) Apply PCA for dimensionality reduction
    ('pca',       PCA(n_components=0.95)), # Keep 95% of variance

])


In [3]:

# Build final model pipeline (including preprocessor and potentially a classifier)
# Currently only includes the preprocessor for testing cleaning
pipeline = Pipeline(steps=[
    ("preprocessor", full_preprocessor),
    # Uncomment and add your classifier here when ready, e.g.:
    # ("classifier", SVC(kernel="rbf", C=1.0, probability=True, random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42, stratify=y # Added stratify for classification
)

print("Starting pipeline fitting...")
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
print("Pipeline fitting finished.")

# Optional: Transform the training data and inspect the output
print("\nSample data after preprocessing (first 5 rows):")
# Apply the fitted preprocessor to the first 5 rows of the original X_train
X_train_processed_sample = pipeline.named_steps['preprocessor'].transform(X_train.head())
# Convert the output numpy array back to a DataFrame for easier inspection
# Need to get the column names after preprocessing - this can be tricky with custom transformers
# For a simple check, we can just print the numpy array or try to infer names if possible
# A more robust way is to fit/transform a small sample separately to get column names
try:
    # Attempt to get feature names after preprocessing (might not work with all custom transformers)
    # This is a common challenge with custom transformers that don't implement get_feature_names_out
    # For now, we'll just show the numpy array output
    print(X_train_processed_sample)
except Exception as e:
    print(f"Could not display as DataFrame: {e}")
    print(X_train_processed_sample) # Print the numpy array

# print("\nData types after preprocessing:")
# # This will also be tricky without column names.
# # You would typically check the dtype of the resulting numpy array or DataFrame
# if isinstance(X_train_processed_sample, np.ndarray):
#      print(f"Output is a numpy array with dtype: {X_train_processed_sample.dtype}")
# else:
#      print(X_train_processed_sample.info())


# # Predict & evaluate (uncomment when classifier is added and fitted)
# print("Starting prediction...")
# y_pred = pipeline.predict(X_test)
# print("Prediction finished.")

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))


Starting pipeline fitting...
Pipeline fitting finished.

Sample data after preprocessing (first 5 rows):
[[-2.20910995  1.80255344 -0.66134142  3.95526806 -0.56567646  0.46555824
  -0.22434103  0.9505057   2.96330489 -0.20619482  1.65302874  0.89825971
   0.04156496  0.99343532 -1.03906614 -0.77977052 -0.03921545 -0.62293253
  -0.29492692]
 [-2.26943078  1.87313254 -2.18589523  3.22932998  0.42444863 -0.81422706
  -1.44240647 -1.39447324  2.66679104 -0.71926089  1.86893154 -0.80914997
  -0.27478568  0.19108723  0.55400875 -0.23995233  0.18232405 -0.26125931
   0.10326096]
 [-3.56266589  0.17568498 -0.73253597  3.56917078  0.44913926 -0.77999332
  -1.30887888 -0.19836928  3.05490208  0.82699614  2.24919027  0.50982818
  -1.05512593  0.69558769 -0.04589288 -0.63268301  1.00900108 -0.11335743
   0.56686079]
 [-0.78490607  0.04470292 -0.70276326  0.30223621 -0.88485732  0.47852289
  -0.47758977  1.07628903  0.27775255 -0.40222024 -1.05453236  0.27608401
  -0.65288423  0.19156179 -0.5430233

In [4]:
X_train_processed_sample2 = pipeline.named_steps['preprocessor'].transform(X_train.head())

X_train_processed_sample_df = pd.DataFrame(X_train_processed_sample2)

# Define the filename for the CSV
output_filename = "X_train_processed_sample.csv"

# Save the DataFrame to a CSV file
# index=False prevents pandas from writing the DataFrame index as a column
X_train_processed_sample_df.to_csv(output_filename, index=False)


In [5]:
# Apply the preprocessor to some data (e.g., the training set)
X_train_transformed = pipeline.named_steps["preprocessor"].transform(X_train)

# Show number of rows and columns in the output
print("✅ Transformed Output Info:")
print(f"Number of rows: {X_train_transformed.shape[0]}")
print(f"Number of columns (features): {X_train_transformed.shape[1]}")


✅ Transformed Output Info:
Number of rows: 80000
Number of columns (features): 19
