In [12]:
# Cell 1: Imports & load
import re
import numpy as np
import pandas as pd
# from scipy.stats import mode # No longer needed for categorical mode
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer

# Load the dataset
df = pd.read_csv('train.csv')
y = df.pop('Credit_Score')  # target variable


  df = pd.read_csv('train.csv')


In [19]:
print(df['Credit_Mix'].value_counts())
ordinal_features = ['Credit_Mix', 'Payment_of_Min_Amount']
nominal_features = ['Occupation', 'Payment_Behaviour', 'Month']
print(df['Occupation'].value_counts())
print(df['Payment_Behaviour'].value_counts())
print(df['Month'].value_counts())
print(df['Payment_of_Min_Amount'].value_counts())
print(df["Type_of_Loan"].value_counts())



Credit_Mix
Standard    36479
Good        24337
_           20195
Bad         18989
Name: count, dtype: int64
Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: count, dtype: int64
Payment_Behaviour
Low_spent_Small_value_payments      25513
High_spent_Medium_value_payments    17540
Low_spent_Medium_value_payments     13861
High_spent_Large_value_payments     13721
High_spent_Small_value_payments     11340
Low_spent_Large_value_payments      10425
!@9#%8                               7600
Name: count, dtype: int64
Month
January     12500
February    12500
March       12500
April       12500
May         12500
June        12500
July        12500
August      12500
Name: count, d

In [None]:

# --- Custom Transformers ---

# Convert Credit History Age String to Months
class CreditHistoryAgeToMonths(BaseEstimator, TransformerMixin):
    def __init__(self, column='Credit_History_Age', new_column_name='Credit_History_Age_in_months'):
        self.column = column
        self.new_column_name = new_column_name

    def fit(self, X, y=None):
        return self # No fitting needed

    def transform(self, X):
        X = X.copy()
        if self.column not in X.columns:
            # print(f"Warning: Column '{self.column}' not found in CreditHistoryAgeToMonths.") # Optional
            return X

        def to_months(x):
            if pd.isna(x):
                return np.nan
            try:
                s = str(x).strip()
                # More flexible regex to capture numbers, allowing for variations
                match = re.search(r'(\d+)\s*Years?\s*(?:and)?\s*(\d+)?\s*Months?', s, re.IGNORECASE)
                if match:
                    years = int(match.group(1)) if match.group(1) else 0
                    months = int(match.group(2)) if match.group(2) else 0
                    return years * 12 + months
                # Handle cases like "X Years" without months specified
                match_years_only = re.search(r'(\d+)\s*Years?', s, re.IGNORECASE)
                if match_years_only:
                    years = int(match_years_only.group(1))
                    return years * 12
                # Handle cases like "Y Months" without years specified (less common but possible)
                match_months_only = re.search(r'(\d+)\s*Months?', s, re.IGNORECASE)
                if match_months_only:
                    months = int(match_months_only.group(1))
                    return months

            except Exception: # Catch potential errors during conversion
                pass # Return NaN if any error occurs
            return np.nan # Return NaN if no match or error

        # Use .loc for safer assignment
        X.loc[:, self.new_column_name] = X[self.column].apply(to_months)
        return X


# Clean numeric columns: Remove '-' and '_' then convert to numeric
class CleanNumeric(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.columns:
            # Ensure column exists and is not entirely NaN/empty before processing
            if c in X.columns and not X[c].dropna().empty:
                # Convert to string first to apply string methods, handle potential non-string types
                X[c] = (
                    X[c].astype(str)
                        .str.replace(r'[-_]', '', regex=True)
                        .pipe(pd.to_numeric, errors='coerce') # Convert to numeric, invalid parsing becomes NaN
                )
            elif c in X.columns:
                # If column exists but is empty/all NaN, ensure it's numeric type if possible
                X[c] = pd.to_numeric(X[c], errors='coerce')
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{c}' not found in CleanNumeric.")
        return X

# Forward/backward fill by Customer_ID for static fields
class StaticFieldFiller(BaseEstimator, TransformerMixin):
    def __init__(self, columns, group_col='Customer_ID'):
        self.columns = columns
        self.group_col = group_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Check if group column exists
        if self.group_col not in X.columns:
            # print(f"Warning: Group column '{self.group_col}' not found in StaticFieldFiller. Skipping.") # Optional
            return X

        cols_present = [col for col in self.columns if col in X.columns]
        if cols_present:
            X.loc[:, cols_present] = X.groupby(self.group_col)[cols_present].transform(lambda g: g.ffill().bfill())
        # else: # Optional warning if no target columns found
        #      print(f"Warning: No target columns found in StaticFieldFiller.")
        return X

# Fix Num_of_Loan using mode + IQR clipping per group
class FixLoanCount(BaseEstimator, TransformerMixin):
    def __init__(self, loan_col="Num_of_Loan", type_col="Type_of_Loan"):
        self.loan_col = loan_col
        self.type_col = type_col

    def fit(self, X, y=None):
        # No fitting needed, purely stateless transformation
        return self

    def transform(self, X):
        X = X.copy()

        # Define the counting function
        def count_loans(loan_type):
            if pd.isna(loan_type) or loan_type.strip() == '':
                return 0
            # Use regex to count ',' and 'and'
            # Count commas (,) and 'and' as separators
            parts = re.split(r',|\band\b', loan_type) # \b for word or space
            # Filter out empty parts after split (in case of double commas, etc.)
            valid_parts = [p.strip() for p in parts if p.strip()]
            return len(valid_parts)

        # Apply the counting logic to the specified column
        if self.type_col in X.columns:
            X[self.loan_col] = X[self.type_col].apply(count_loans)

        return X

# Clean Category Strings
class CategoryCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.columns:
            if c in X.columns:
                # Ensure column is string type first
                X[c] = X[c].astype(str)
                # Apply cleaning steps
                X[c] = (
                    X[c]
                        .str.replace(r'[^A-Za-z\s]', '', regex=True) # Keep only letters and spaces
                        .str.strip()
                        .str.replace(r'\s+', '_', regex=True) # Replace spaces with underscore
                        .str.lower()
                        .replace(r'^_+$', np.nan, regex=True) # Handle cases that become only underscores
                        .replace(r'^\s*$', np.nan, regex=True) # Replace empty/whitespace-only with NaN
                        .replace('nan', np.nan) # Replace string 'nan' with NaN
                )
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{c}' not found in CategoryCleaner.")
        return X

# Impute Categorical using Local Mode (includes rare values)
class LocalModeCatImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns, window=5):
        self.columns = columns
        self.window = window
        # Store global modes as fallback
        self.global_modes_ = {}

    def fit(self, X, y=None):
        # Fit global modes for fallback
        for col in self.columns:
            if col in X.columns:
                # Calculate mode on non-NaN values using pandas mode
                mode_val = X[col].dropna().mode()
                self.global_modes_[col] = mode_val.iat[0] if not mode_val.empty else np.nan
            # else: # Optional warning if column not found
            #     print(f"Warning: Column '{col}' not found during fit in LocalModeCatImputer.")
        return self

    def transform(self, X):
        X = X.copy()
        n = len(X)
        for col in self.columns:
            if col not in X.columns or col not in self.global_modes_:
                # print(f"Warning: Column '{col}' not found or not fitted in LocalModeCatImputer. Skipping.") # Optional
                continue

            # Ensure NaNs are consistent (use pd.isna) and handle string 'nan'
            vals = X[col].copy().replace('nan', np.nan)

            # Identify rare values (consider if freq=1 is too strict)
            # Calculate counts on non-NaN values
            counts = vals.value_counts(dropna=True)
            # Define rare based on a threshold (e.g., count <= 1)
            rare = set(counts[counts <= 1].index)

            # Find indices needing imputation (NaN or rare)
            indices_to_impute = X.index[vals.isna() | vals.isin(rare)]

            for i in indices_to_impute:
                lo = max(0, i - self.window)
                hi = min(n, i + self.window + 1)

                # Get window data using .loc to handle potential index gaps
                window_indices = X.index[lo:hi].drop(i, errors='ignore')
                # Get non-NaN values from the window
                w = X.loc[window_indices, col].dropna()

                impute_value = self.global_modes_.get(col, np.nan) # Default to global mode

                if not w.empty:
                    # Calculate mode of the window using pandas mode
                    mode_val = w.mode()
                    if not mode_val.empty: # Check if mode calculation returned anything
                        impute_value = mode_val.iat[0] # Use local mode if available

                # Use .loc for assignment
                X.loc[i, col] = impute_value

        return X



class MixedCategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_features=None, nominal_features=None):
        self.ordinal_features = ordinal_features or []
        self.nominal_features = nominal_features or []
        self.label_encoders_ = {}
        self.fill_values_ = {}

    def fit(self, X, y=None):
        X = X.copy()

        # Fit LabelEncoders on ordinal features
        for col in self.ordinal_features:
            if col in X.columns:
                le = LabelEncoder()
                mode_val = X[col].dropna().mode().iloc[0] if not X[col].dropna().empty else 'UNKNOWN'
                self.fill_values_[col] = mode_val
                le.fit(X[col].dropna().astype(str).unique())
                self.label_encoders_[col] = le

        return self

    def transform(self, X):
        X = X.copy()

        # Apply Label Encoding to ordinal features
        for col in self.ordinal_features:
            if col in X.columns:
                le = self.label_encoders_[col]
                fill_val = self.fill_values_.get(col, 'UNKNOWN')
                filled_col = X[col].fillna(fill_val).astype(str)
                known_mask = filled_col.isin(le.classes_)
                encoded_col = pd.Series(index=X.index, dtype=float)
                encoded_col.loc[known_mask] = le.transform(filled_col[known_mask])
                encoded_col.loc[~known_mask] = -1  # For unseen values
                X[col] = encoded_col

        # Apply One-Hot Encoding to nominal features
        if self.nominal_features:
            X = pd.get_dummies(X, columns=self.nominal_features, dummy_na=True)

        return X


# Drop Specified Columns
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Ensure columns to drop actually exist
        cols_to_drop = [col for col in self.columns if col in X.columns]
        return X.drop(columns=cols_to_drop, errors='ignore')




In [None]:
# --- Pipeline Definition ---

# Define column groups based on pipeline flow
initial_numeric_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance"
]
# Static columns to fill using ffill/bfill per customer
# Ensure these are handled appropriately by cleaning/imputation steps afterwards
static_cols = ["Age","Occupation","Annual_Income","Monthly_Inhand_Salary","Num_Bank_Accounts","Num_Credit_Card","Interest_Rate"]

# Categorical columns needing string cleaning
cat_clean_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Categorical columns to impute using local mode (AFTER cleaning)
cat_impute_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Categorical columns to mixed encode (AFTER cleaning and categorical imputation)
cat_encode_cols = ['Credit_Mix','Payment_Behaviour','Occupation','Payment_of_Min_Amount','Month']

# Numeric columns to impute using local mode (AFTER cleaning, filling, and conversion)
# Include the newly created numeric credit history column and other numeric columns
num_impute_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance", 'Credit_History_Age_in_months' 
]
ordinal_features = ['Credit_Mix', 'Payment_of_Min_Amount']
nominal_features = ['Occupation', 'Payment_Behaviour', 'Month']


# Columns to finally drop (IDs, original text fields replaced by converted/encoded ones)
cols_to_drop = ['Name','Type_of_Loan','ID','SSN', 'Credit_History_Age', 'Customer_ID'] # Drop original history col

# Define the preprocessing pipeline
# Order matters!
full_preprocessor = Pipeline([
    # 1) Convert Credit History Age string to months (numeric)
    ('credit_age_months', CreditHistoryAgeToMonths(column='Credit_History_Age',
        new_column_name='Credit_History_Age_in_months')),

    # 2) Clean initial numeric columns (removes symbols, coerces to numeric)
    ('clean_numeric',     CleanNumeric(columns=initial_numeric_cols)),

    # 3) Forward/backward fill static fields per customer
    ('fill_static',       StaticFieldFiller(columns=static_cols, group_col='Customer_ID')),

    # 4) Fix Num_of_Loan outliers and NaNs using group-wise mode/IQR
    ('fix_loan',          FixLoanCount(loan_col="Num_of_Loan", group_col="Type_of_Loan")),

    # 5) Clean categorical strings (removes symbols, standardizes format)
    ('clean_cats',        CategoryCleaner(columns=cat_clean_cols)),

    # 6) Impute categorical NaNs and rare values using local mode (AFTER cleaning)
    ('impute_cat_mode',   LocalModeCatImputer(columns=cat_impute_cols)),

    # 7) Label encode your cleaned and imputed categoricals
    #('label_encode',      LabelEncodeColumns(columns=cat_encode_cols)),
    ('encode', MixedCategoricalEncoder(
    ordinal_features=['Credit_Mix', 'Payment_of_Min_Amount', 'Month'],
    nominal_features=['Occupation', 'Payment_Behaviour']
    )),
    # 8) Drop original text/ID columns and the original credit history column
    ('drop_cols',         DropColumns(columns=cols_to_drop)),

    # 9) Impute numeric NaNs and outliers using local mode (AFTER cleaning, conversion, static fill, loan fix)
    #('impute_num_mode',   LocalModeNumImputer(columns=num_impute_cols)),

    # 10) Fill any remaining NaNs via KNN imputation (cat columns are now numeric after label encoding)
    ('knn_impute', KNNImputer(n_neighbors=5, weights='uniform')),

    # 11) Scale numeric features
    #('scaler',    StandardScaler()),
    # ('minmax_scaler', MinMaxScaler()), # Example if you prefer MinMaxScaler

    # 12) Apply PCA for dimensionality reduction
    #('pca',       PCA(n_components=0.95)), # Keep 95% of variance

])


In [16]:

# Build final model pipeline (including preprocessor and potentially a classifier)
# Currently only includes the preprocessor for testing cleaning
pipeline = Pipeline(steps=[
    ("preprocessor", full_preprocessor),
    # Uncomment and add your classifier here when ready, e.g.:
    # ("classifier", SVC(kernel="rbf", C=1.0, probability=True, random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42, stratify=y # Added stratify for classification
)

print("Starting pipeline fitting...")
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
print("Pipeline fitting finished.")

# Optional: Transform the training data and inspect the output
print("\nSample data after preprocessing (first 5 rows):")
# Apply the fitted preprocessor to the first 5 rows of the original X_train
X_train_processed_sample = pipeline.named_steps['preprocessor'].transform(X_train)
# Convert the output numpy array back to a DataFrame for easier inspection
# Need to get the column names after preprocessing - this can be tricky with custom transformers
# For a simple check, we can just print the numpy array or try to infer names if possible
# A more robust way is to fit/transform a small sample separately to get column names
try:
    # Attempt to get feature names after preprocessing (might not work with all custom transformers)
    # This is a common challenge with custom transformers that don't implement get_feature_names_out
    # For now, we'll just show the numpy array output
    print(X_train_processed_sample)
except Exception as e:
    print(f"Could not display as DataFrame: {e}")
    print(X_train_processed_sample) # Print the numpy array

# print("\nData types after preprocessing:")
# # This will also be tricky without column names.
# # You would typically check the dtype of the resulting numpy array or DataFrame
# if isinstance(X_train_processed_sample, np.ndarray):
#      print(f"Output is a numpy array with dtype: {X_train_processed_sample.dtype}")
# else:
#      print(X_train_processed_sample.info())


# # Predict & evaluate (uncomment when classifier is added and fitted)
# print("Starting prediction...")
# y_pred = pipeline.predict(X_test)
# print("Prediction finished.")

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
# --> KNN <--


Starting pipeline fitting...
Pipeline fitting finished.

Sample data after preprocessing (first 5 rows):
[[7.0000000e+00 5.1000000e+01 1.0158348e+05 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [1.0000000e+00 2.3000000e+01 1.0192695e+05 ... 1.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [2.0000000e+00 4.9000000e+01 1.5887112e+05 ... 1.0000000e+00
  0.0000000e+00 0.0000000e+00]
 ...
 [1.0000000e+00 4.6000000e+01 3.5032660e+04 ... 0.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [7.0000000e+00 4.2000000e+01 1.2968028e+05 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [7.0000000e+00 5.3000000e+01 1.1285322e+05 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]]


In [None]:
X_train_processed_sample2 = pipeline.named_steps['preprocessor'].transform(X_train)

X_train_processed_sample_df = pd.DataFrame(X_train_processed_sample2)

# Define the filename for the CSV
output_filename = "X_train_processed_sample.csv"

# Save the DataFrame to a CSV file
# index=False prevents pandas from writing the DataFrame index as a column

X_train_processed_sample_df.to_csv(output_filename, index=False)
# Show number of rows and columns in the output
print("✅ Transformed Output Info:")
print(f"Number of rows: {X_train_processed_sample2.shape[0]}")
print(f"Number of columns (features): {X_train_processed_sample2.shape[1]}")


In [18]:
X_train_processed_sample_df.to_csv(output_filename, index=False)
# Show number of rows and columns in the output
print("✅ Transformed Output Info:")
print(f"Number of rows: {X_train_processed_sample2.shape[0]}")
print(f"Number of columns (features): {X_train_processed_sample2.shape[1]}")


✅ Transformed Output Info:
Number of rows: 80000
Number of columns (features): 43
