In [None]:
# Cell 1: Imports & Data Load
import re
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load data
# Read the training table, then separate the label from the features so that
# `df` holds feature columns only and `y` holds the target Series.
df = pd.read_csv('data/train.csv')
# ← adjust the column name below to your actual target column name
y = df['Credit_Score']
df = df.drop(columns=['Credit_Score'])


  df = pd.read_csv('train.csv')


KeyError: 'Target'

In [10]:
class LoanModeImputer(BaseEstimator, TransformerMixin):
    """Replace missing and outlying values of one column with its per-group mode.

    Rows are grouped by ``group_col``. Within each group, NaNs in ``value_col``
    and values lying outside the Tukey fences (more than 1.5 * IQR beyond the
    quartiles) are replaced by that group's most frequent value. No rows are
    dropped and the original row order is kept.

    Parameters
    ----------
    group_col : str
        Column whose values define the groups. Missing entries are replaced by
        ``na_placeholder`` in the returned frame so those rows form their own
        group instead of being discarded by ``groupby``.
    value_col : str
        Column to clean. It must support ``quantile`` (i.e. be numeric).
    na_placeholder : str, default "__MISSING__"
        Stand-in group key used for rows whose ``group_col`` is missing.

    Notes
    -----
    ``fit`` learns nothing: the mode and the fences are recomputed from
    whatever frame is handed to ``transform``.
    """

    def __init__(self, group_col, value_col, na_placeholder="__MISSING__"):
        self.group_col = group_col
        self.value_col = value_col
        self.na_placeholder = na_placeholder

    def fit(self, X, y=None):
        """No-op; present only to satisfy the scikit-learn transformer API."""
        return self

    @staticmethod
    def _clean_group(values):
        """Return one group's values with NaNs and IQR outliers set to the mode.

        A group that contains no non-missing value has no mode and is returned
        unchanged (still all-NaN).
        """
        # Series.mode() is used instead of scipy.stats.mode: the shape of the
        # SciPy result differs between versions, and indexing it the wrong way
        # used to be hidden by a bare `except`, which silently turned the
        # replacement value into NaN.
        modes = values.mode(dropna=True)
        if modes.empty:
            return values
        # Ties are resolved by taking the smallest modal value (mode() sorts).
        group_mode = modes.iloc[0]

        # Fences are derived from the observed (pre-imputation) values.
        q1, q3 = values.quantile([0.25, 0.75]).to_numpy()
        iqr = q3 - q1
        low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr

        filled = values.fillna(group_mode)
        is_outlier = (filled < low) | (filled > high)
        return filled.mask(is_outlier, group_mode)

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``group_col`` and ``value_col``.

        Returns
        -------
        pandas.DataFrame
            Same rows, order and columns as ``X``, with ``group_col`` NaNs
            replaced by the placeholder and ``value_col`` cleaned per group.
        """
        X = X.copy()
        # 1) fill missing group keys so groupby won't drop those rows
        X[self.group_col] = X[self.group_col].fillna(self.na_placeholder)
        if X.empty:
            return X

        # 2) clean the value column group by group. A column-level transform
        #    returns a result aligned with X's rows, and avoids mutating the
        #    group frames inside DataFrameGroupBy.apply.
        X[self.value_col] = (
            X.groupby(self.group_col, sort=False)[self.value_col]
            .transform(self._clean_group)
        )
        return X


In [11]:
# Cell 3: Full-DF → ColumnSelector+Scaler Preprocessor

# Columns handed to StripAndNumeric (and later standard-scaled as numeric features).
numeric_strip_cols = [
    "Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts",
    "Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date",
    "Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries",
    "Outstanding_Debt", "Total_EMI_per_month", "Amount_invested_monthly",
    "Monthly_Balance"
]
# Columns handed to StaticFieldImputer together with the 'Customer_ID' key.
static_cols = ["Age", "Occupation", "Annual_Income", "Monthly_Inhand_Salary",
               "Num_Bank_Accounts", "Num_Credit_Card", "Interest_Rate"]
# LoanModeImputer cleans `loan_col` within groups defined by `group_col`.
loan_col, group_col = "Num_of_Loan", "Type_of_Loan"
cat_clean_cols = ['Credit_Mix','Payment_Behaviour','Occupation']
# 'Credit_Score' is deliberately NOT listed here: it is the target and is popped
# out of `df` when the data is loaded, so it is not a column of the frame this
# pipeline receives. Listing it makes the column selection below fail on a
# missing column, and would leak the label into the features if it were present.
mode_impute_cols = cat_clean_cols + ['Payment_of_Min_Amount','Month']
# NOTE(review): 'Credit_History_Age_months' is label-encoded here and also
# standard-scaled as a numeric column below — confirm that both are intended.
label_encode_cols = mode_impute_cols + ['Credit_History_Age_months']

# Stage 1: run all custom transformers to completion on the full DF
full_df_pipeline = Pipeline([
    ('strip', StripAndNumeric(numeric_strip_cols)),
    ('static', StaticFieldImputer('Customer_ID', static_cols)),
    ('loan_mode', LoanModeImputer(group_col, loan_col)),
    ('cred_age', CreditHistoryConverter('Credit_History_Age')),
    ('cat_clean', CategoryCleaner(cat_clean_cols)),
    ('mode_imp', LocalModeImputer(mode_impute_cols)),
    ('lbl_enc', LabelEncoderImputer(label_encode_cols)),
])

# Stage 2: select & scale
# Numeric columns: median-impute, then standardise.
# Encoded categorical columns: fill remaining gaps with 0, then min-max scale.
# Every column not named in either list is dropped.
selector_and_scaler = ColumnTransformer([
    ('num_scale',
     Pipeline([('impute', SimpleImputer(strategy='median')),
               ('std', StandardScaler())]),
     numeric_strip_cols + ['Credit_History_Age_months']),
    ('cat_scale',
     Pipeline([('impute0', SimpleImputer(strategy='constant', fill_value=0)),
               ('minmax', MinMaxScaler())]),
     mode_impute_cols),
], remainder='drop')

# Full preprocessing: custom DataFrame-level cleaning followed by select & scale.
preprocessor = Pipeline([
    ('full_df', full_df_pipeline),
    ('sel_scale', selector_and_scaler)
])


# SVM

In [None]:
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, SVR, LinearSVC, OneClassSVM
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import os
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import seaborn as sns

In [None]:
# Hyperparameters for the support-vector classifier. They are kept in a dict
# so the exact same mapping can be passed to SVC and logged later.
params = dict(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    degree=3,
    coef0=0.0,
    probability=True,
    shrinking=True,
    tol=0.001,
    max_iter=-1,
    class_weight=None,
    decision_function_shape='ovr',
)

# Build and fit the classifier in one step (fit() returns the estimator itself).
# NOTE(review): X_train / y_train / X_test are not defined in this cell — they
# are expected to come from an earlier cell; confirm before Run All.
svc = SVC(**params).fit(X_train, y_train)

# Class predictions for the held-out features
y_pred = svc.predict(X_test)

In [None]:
# Create the artifact directory first: both files below are written into it,
# and open(..., "w") fails if the directory does not exist yet.
os.makedirs("outputs", exist_ok=True)

# Per-class and averaged metrics as a nested dict (also used for MLflow logging)
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Save the classification report (string form of the metrics dict)
with open("outputs/classification_report.txt", "w") as f:
    f.write(str(report_dict))

# Create and save confusion matrix plot.
# The true labels are `y_test`, the same array the report above is computed from.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
conf_matrix_path = "outputs/confusion_matrix.png"
fig.savefig(conf_matrix_path)
plt.close(fig)

# print() so the multi-line report renders as a table rather than as an
# escaped one-line string repr.
print(classification_report(y_test, y_pred))

In [None]:
# Point MLflow at the tracking server BEFORE selecting the experiment:
# set_experiment() looks the experiment up (or creates it) on whichever
# tracking URI is active at the time it is called.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
mlflow.set_experiment("SVM_Classifier")

with mlflow.start_run(run_name="SVM_run"):
    # Hyperparameters exactly as passed to SVC
    mlflow.log_params(params)
    # Headline metrics taken from the classification-report dict
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'weighted_avg_precision': report_dict['weighted avg']['precision'],
        'weighted_avg_recall': report_dict['weighted avg']['recall'],
        'weighted_avg_f1': report_dict['weighted avg']['f1-score']
    })
    # NOTE(review): the artifact path below is misspelled ("Supprt"); it is kept
    # as-is because existing runs/consumers may load the model by this path.
    mlflow.sklearn.log_model(svc, "Supprt Vector Classifier")
    # Files produced by the evaluation cell
    mlflow.log_artifact("outputs/classification_report.txt")
    mlflow.log_artifact(conf_matrix_path)
    mlflow.log_text("Support Vector Classifier primary trial.", artifact_file="outputs/evaluation_notes.txt")