In [1]:
# === 1. Import libraries ===
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import joblib

# === 2. Load dataset ===
# Fetch the dataset from the Hugging Face Hub, then materialise its "train"
# split as a pandas DataFrame for the rest of the notebook.
dataset = load_dataset("13nishit/LoanApprovalPrediction")
train_split = dataset["train"]
df = pd.DataFrame(train_split)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

dataset.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/614 [00:00<?, ? examples/s]

In [2]:

# === 3. Prepare target ===
# Encode the label: 'Y' -> 1, 'N' -> 0.
# The guard keeps this cell safe to re-run: mapping an already-encoded column
# a second time would turn every label into NaN, because 1 and 0 are not keys
# of the mapping.
if not df['Loan_Status'].isin([0, 1]).all():
    encoded_status = df['Loan_Status'].map({'Y': 1, 'N': 0})
    # Validate before assigning so a bad label never corrupts `df` silently.
    if encoded_status.isna().any():
        raise ValueError(
            "Loan_Status contains missing values or labels other than 'Y'/'N'; "
            "cannot encode the target."
        )
    df['Loan_Status'] = encoded_status

# === 4. Drop Loan_ID since it is unique ID and not useful as feature ===
# errors='ignore' makes the drop a no-op when the cell is executed again.
df = df.drop(columns=['Loan_ID'], errors='ignore')

# === 5. Define feature columns and target ===
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

# === 6. Identify categorical and numerical columns ===
# Note: a column whose dtype is neither int64/float64 nor object lands in
# neither list and is therefore not used by the preprocessing below.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

# === 7. Preprocessing pipeline ===
from sklearn.impute import SimpleImputer

# Numeric branch: replace missing values with the column median, then
# standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: replace missing values with the most frequent level,
# then one-hot encode. Levels unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its branch; numeric block first, then
# categorical.
column_branches = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# === 8. Oversampling is a bit tricky in pipeline, so do it after splitting ===
import numpy as np
from scipy import sparse

# Split the data first so the transformers and SMOTE only ever see training
# rows; the split is stratified on the target.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Preprocess train data for SMOTE (fit on the training rows only).
X_train_num = numeric_transformer.fit_transform(X_train_raw[num_cols])
X_train_cat = categorical_transformer.fit_transform(X_train_raw[cat_cols])

# Combine numeric and categorical after preprocessing: numeric block first,
# then categorical -- the same order the ColumnTransformer `preprocessor` uses.
if sparse.issparse(X_train_cat):
    # CSR (rather than hstack's default COO) so the matrix can be sliced.
    X_train_preprocessed = sparse.hstack([X_train_num, X_train_cat], format='csr')
else:
    # X_train_cat is already a dense array here; it has no .toarray() to call.
    X_train_preprocessed = np.hstack([X_train_num, X_train_cat])

# === 9. Apply SMOTE ===
# NOTE(review): SMOTE interpolates between samples, so the synthetic rows can
# carry fractional values in the one-hot columns -- confirm this is acceptable.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)


In [9]:
from sklearn.linear_model import LogisticRegression

# === 10. Now train model ===
# Train on the SMOTE-balanced, already-preprocessed training matrix.
model = LogisticRegression(max_iter=2000, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# === 11. Build a final pipeline for inference ===
# Fit ONLY the preprocessor on the raw training frame. Calling
# Pipeline.fit(X_train_raw, y_train) here would also refit `model` in place
# (a Pipeline does not clone its steps) on the unbalanced data, silently
# throwing away the SMOTE-balanced fit from step 10.
preprocessor.fit(X_train_raw)

# The classifier was trained on [numeric | categorical] columns; the
# preprocessor emits its 'num' and 'cat' blocks in that same order, so the
# feature counts must agree before the two are wired together.
n_pipeline_features = preprocessor.transform(X_train_raw.iloc[:1]).shape[1]
if n_pipeline_features != model.n_features_in_:
    raise ValueError(
        f"Preprocessor yields {n_pipeline_features} features but the classifier "
        f"was trained on {model.n_features_in_}."
    )

# Assemble from the already-fitted steps; do not call .fit() on the pipeline.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# === 12. Save pipeline ===
joblib.dump(final_pipeline, 'loan_approval_final_pipeline.pkl')

print("✅ Logistic Regression model pipeline saved successfully!")


✅ Logistic Regression model pipeline saved successfully!
