In [None]:
# Cell 1: Core imports & column‐lists
import re
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Your column‐lists
numeric_strip_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance"
]
static_cols = ["Age", "Occupation", "Annual_Income", "Monthly_Inhand_Salary",
               "Num_Bank_Accounts", "Num_Credit_Card", "Interest_Rate"]
group_col = "Type_of_Loan"
loan_col  = "Num_of_Loan"
cat_clean_cols    = ['Credit_Mix','Payment_Behaviour','Occupation']
mode_impute_cols  = cat_clean_cols + ['Payment_of_Min_Amount','Month']
label_encode_cols = mode_impute_cols + ['Credit_History_Age_months']

# Load data
df = pd.read_csv('train.csv')
y = df.pop('Credit_Score')   


  df = pd.read_csv('train.csv')


In [20]:
# Cell 2: Define ALL custom Transformers

class StripAndNumeric(BaseEstimator, TransformerMixin):
    def __init__(self, cols): self.cols = cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c] = (
                X[c].astype(str)
                     .str.replace(r'[-_]', '', regex=True)
                     .pipe(pd.to_numeric, errors='coerce')
            )
        return X

class StaticFieldImputer(BaseEstimator, TransformerMixin):
    def __init__(self, id_col, cols):
        self.id_col, self.cols = id_col, cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c] = X.groupby(self.id_col)[c]\
                    .transform(lambda g: g.ffill().bfill().iloc[0])
        return X

class LoanModeImputer(BaseEstimator, TransformerMixin):
    """Fill NaNs & cap outliers to the group MODE, without dropping any rows."""
    def __init__(self, group_col, value_col, na_placeholder="__MISSING__"):
        self.group_col = group_col
        self.value_col = value_col
        self.na_placeholder = na_placeholder

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # ensure every row is in a group
        X[self.group_col] = X[self.group_col].fillna(self.na_placeholder)

        def clean_grp(g):
            s = g[self.value_col]
            try:
                m = mode(s.dropna())[0][0]
            except:
                m = np.nan
            q1, q3 = s.quantile([.25, .75])
            iqr = q3 - q1
            low, high = q1 - 1.5*iqr, q3 + 1.5*iqr
            # fill & cap
            s = s.fillna(m).apply(lambda x: m if x < low or x > high else x)
            g[self.value_col] = s.fillna(m)
            return g

        return X.groupby(self.group_col, group_keys=False).apply(clean_grp)

class CategoryCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, cols): self.cols = cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c] = (
                X[c].astype(str)
                    .str.replace(r'[^A-Za-z\s]', '', regex=True)
                    .str.strip()
                    .str.replace(r'\s+', '_', regex=True)
                    .str.lower()
            )
        return X

class LocalModeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols, window=5):
        self.cols, self.window = cols, window
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            vals = X[c].replace(r'^\s*$', np.nan, regex=True)
            counts = vals.value_counts(dropna=True)
            rare = set(counts[counts==1].index)
            arr = vals.copy()
            n = len(arr)
            for i in range(n):
                v = arr.iat[i]
                if pd.isna(v) or v in rare:
                    lo, hi = max(0, i-self.window), min(n, i+self.window+1)
                    w = arr.iloc[lo:hi].dropna()
                    if not w.empty:
                        arr.iat[i] = w.mode().iat[0]
            X[c] = arr
        return X

class CreditHistoryConverter(BaseEstimator, TransformerMixin):
    def __init__(self, col): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        def to_m(val):
            m = re.match(r'(\d+)\s+Years?\s+and\s+(\d+)\s+Months?', str(val))
            return int(m[1])*12 + int(m[2]) if m else np.nan
        X[self.col + '_months'] = X[self.col].map(to_m)
        return X

class LabelEncoderImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        self.encs = {}
    def fit(self, X, y=None):
        for c in self.cols:
            le = LabelEncoder()
            nonnull = X[c].dropna().astype(str)
            if not nonnull.empty:
                le.fit(nonnull)
                self.encs[c] = le
        return self
    def transform(self, X):
        X = X.copy()
        for c, le in self.encs.items():
            mask = X[c].notna()
            X.loc[mask, c] = le.transform(X.loc[mask, c].astype(str))
        return X


In [23]:
# Cell 3: Build `preprocessor` cleanly, excluding Credit_Score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 1) Make sure our intended lists do NOT contain the target:
mode_impute_cols  = ['Credit_Mix','Payment_Behaviour','Occupation',
                     'Payment_of_Min_Amount','Month']   # no Credit_Score here!
label_encode_cols = mode_impute_cols + ['Credit_History_Age_months']

# 2) Compute only-they-exist lists from df.columns:
num_cols = [c for c in numeric_strip_cols + ['Credit_History_Age_months']
            if c in df.columns]
cat_cols = [c for c in mode_impute_cols
            if c in df.columns]

print("Numeric features →", num_cols)
print("Categorical features →", cat_cols)

# 3) Two‐stage pipeline:

# Stage A: apply all custom transformers (creates the new '_months' col and label-encodes cats)
full_df_pipeline = Pipeline([
    ('strip',     StripAndNumeric(numeric_strip_cols)),
    ('static',    StaticFieldImputer('Customer_ID', static_cols)),
    ('loan_mode', LoanModeImputer(group_col, loan_col)),
    ('cred_age',  CreditHistoryConverter('Credit_History_Age')),
    ('cat_clean', CategoryCleaner(cat_clean_cols)),
    ('mode_imp',  LocalModeImputer(mode_impute_cols)),
    ('lbl_enc',   LabelEncoderImputer(label_encode_cols)),
])

# Stage B: select & impute-with-mode + scale
selector_and_scaler = ColumnTransformer([
    ('num', Pipeline([
        ('imp_mode', SimpleImputer(strategy='most_frequent')),
        ('std',      StandardScaler())
    ]), num_cols),
    ('cat', Pipeline([
        ('imp_mode', SimpleImputer(strategy='most_frequent')),
        ('mm',       MinMaxScaler())
    ]), cat_cols),
], remainder='drop')

# Final preprocessor
preprocessor = Pipeline([
    ('full_df', full_df_pipeline),
    ('select',  selector_and_scaler),
])


Numeric features → ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance']
Categorical features → ['Credit_Mix', 'Payment_Behaviour', 'Occupation', 'Payment_of_Min_Amount', 'Month']


In [None]:
# Cell 4: Model Pipeline, Train/Test & Eval

from sklearn.metrics import classification_report

model = Pipeline([
    ('preproc', preprocessor),
    ('svm',     SVC(kernel='rbf', C=1.0, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
