In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import joblib  
from sklearn.metrics import accuracy_score, classification_report

# Load the labelled training set, the test set, and the submission template
# from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")

# Feature matrix: remove the target, the row identifier, and the listed
# categorical columns, one-hot encode any non-numeric columns that are left,
# then replace missing values with 0.
X = pd.get_dummies(
    train.drop(
        columns=[
            "Loan_Status",
            "Loan_ID",
            "Gender",
            "Dependents",
            "Education",
            "Self_Employed",
            "Property_Area",
        ]
    ),
    drop_first=True,
).fillna(0)
feature_cols = X.columns  # columns (in order) of the matrix the model is trained on
y = train["Loan_Status"]

# Stratified 70/30 split; the 30% part is only used for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# Model: standardise the features, then fit a class-balanced SVM. The scaler
# lives inside the pipeline, so it is fitted together with the SVM every time
# the pipeline is fitted.
svm_scaled_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svm", SVC(class_weight="balanced")),
    ]
)

# Hyper-parameter grid for the "svm" step: RBF kernel, four values of C,
# three values of gamma.
param_grid = dict(
    svm__kernel=["rbf"],
    svm__C=[0.1, 1, 10, 50],
    svm__gamma=["scale", 0.1, 0.01],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
gs = GridSearchCV(
    estimator=svm_scaled_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
)

# Only the training split is passed in, so the hold-out split plays no part
# in model selection.
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best CV accuracy (on train):", gs.best_score_)

# Evaluate the best estimator found by the grid search on the hold-out split.
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the hold-out accuracy once so that the printed value and the value
# stored in the artifact cannot drift apart.
test_accuracy = accuracy_score(y_test, y_pred)
print(" test accuracy:", test_accuracy)

# Hold-out metrics that are saved next to the model.
metrics = {
    "accuracy": test_accuracy,
    "report": classification_report(y_test, y_pred, output_dict=True),
}

# Dump the model, the training column list, and the metrics into one file.
# The column list lets a consumer line new data up with the training matrix.
joblib.dump(
    {"model": best_model, "feature_cols": feature_cols, "metrics": metrics},
    "loan_shark_with_metrics.joblib",
)


Best params: {'svm__C': 1, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
Best CV accuracy (on train): 0.7505335157318742
 test accuracy: 0.8054054054054054


['loan_shark_with_metrics.joblib']