In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 1. Load dataset
df = pd.read_csv("loan.csv")

# Target variable
y = df["Loan_Status"]
X = df.drop(columns=["Loan_Status"])

# Encode target (Loan_Status: Y/N)
le = LabelEncoder()
y = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =========================
# Step 1: Missing Value Imputation (trf1)
# =========================
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

trf1 = ColumnTransformer([
    ("num_imputer", num_imputer, num_cols),
    ("cat_imputer", cat_imputer, cat_cols)
], remainder="passthrough")

# =========================
# Step 2: Feature Construction (trf2)
# =========================
def feature_construction(X):
    # Convert to DataFrame with proper column names
    df = pd.DataFrame(X, columns=num_cols.tolist() + cat_cols.tolist())
    
    # Ensure numeric columns are floats
    for col in ["LoanAmount", "ApplicantIncome", "CoapplicantIncome"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Create new features
    if "LoanAmount" in df.columns:
        df["LoanAmount_log"] = np.log1p(df["LoanAmount"].fillna(0))
    if "ApplicantIncome" in df.columns:
        df["TotalIncome"] = df["ApplicantIncome"] + df.get("CoapplicantIncome", 0)
    
    return df

trf2 = FunctionTransformer(feature_construction, validate=False)

# =========================
# Step 3: Handle Categorical Features (OneHotEncoding) (trf3)
# =========================
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

trf3 = ColumnTransformer([
    ("onehot", one_hot, cat_cols)
], remainder="passthrough")

# =========================
# Step 4: Scaling (trf4)
# =========================
trf4 = StandardScaler()

# =========================
# Step 5: Feature Selection (trf5)
# =========================
trf5 = SelectKBest(score_func=f_classif, k=10)

# =========================
# Step 6: Model (trf6)
# =========================
trf6 = RandomForestClassifier(random_state=42)

# =========================
# Pipeline
# =========================
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
    ('trf6', trf6)
])

# Fit pipeline
pipe.fit(X_train, y_train)

# Accuracy
print("Train Accuracy:", pipe.score(X_train, y_train))
print("Test Accuracy:", pipe.score(X_test, y_test))

Train Accuracy: 0.8228105906313645
Test Accuracy: 0.7886178861788617


0.7886178861788617

In [13]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [21]:
pipe.fit(x_train, y_train)
x_train_transformed = pipe.named_steps['trf6'].transform(...)  # last transformer


NameError: name 'x_train' is not defined

In [15]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64