In [52]:
# Import Required Libraries
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
%matplotlib inline 

In [53]:
import pickle

# Restore the artifacts persisted by the training notebook.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
def _load_artifact(path):
    """Deserialize and return the pickled object stored at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Trained model
model = _load_artifact("loan_model.pkl")

# Trained scaler
scaler = _load_artifact("scaler.pkl")

# Expected feature columns
expected_columns = _load_artifact("feature_columns.pkl")


In [56]:
def clean_data(df):
    """Return a copy of ``df`` with missing values imputed.

    The columns 'Gender', 'Married', 'Dependents', 'Self_Employed',
    'Credit_History' and 'Loan_Amount_Term' are filled with their most
    frequent value; 'LoanAmount' is filled with its mean. All statistics are
    computed from the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell stays idempotent.
    df = df.copy()

    # Fill missing values with the column mode (categorical and discrete columns)
    mode_filled_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                        'Credit_History', 'Loan_Amount_Term']
    for col in mode_filled_cols:
        modes = df[col].mode()
        if modes.empty:
            # Column has no non-missing values, so there is no mode to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    # Fill missing numeric values with mean
    df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())

    return df


In [58]:
def engineer_features_test(df, scaler, expected_columns):
    """Build the model-input feature frame for inference data.

    Steps: derive ``HasCoapplicant``, one-hot encode the categorical columns,
    cast ``Credit_History`` to bool, scale the numeric columns with the
    already-fitted ``scaler``, then align the columns to ``expected_columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned input frame containing the categorical columns, the numeric
        columns and 'Credit_History'. It is not modified.
    scaler : object
        Fitted transformer exposing ``transform``; applied to
        'ApplicantIncome', 'CoapplicantIncome' and 'LoanAmount'.
    expected_columns : sequence of str
        Column names, in order, that the returned frame must have.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly ``expected_columns``; columns that could not be
        derived from ``df`` are filled with 0.
    """
    categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']
    numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    # HasCoapplicant: True when a positive co-applicant income is present.
    # ``assign`` returns a new frame, so the caller's frame is not mutated.
    features = df.assign(HasCoapplicant=df['CoapplicantIncome'] > 0)

    # One-hot encode WITHOUT drop_first. With drop_first=True the dropped level
    # is whichever one sorts first in *this* frame, so inference data missing a
    # level (e.g. only 'Male' rows) would drop the real indicator column and
    # have it back-filled with 0. Surplus indicator columns are discarded by
    # the alignment step below.
    features = pd.get_dummies(features, columns=categorical_cols)

    # Convert to boolean. NaN casts to True, so missing values should be
    # imputed before calling this function.
    features['Credit_History'] = features['Credit_History'].astype(bool)

    # Scale numeric features using previously trained scaler
    features[numeric_cols] = scaler.transform(features[numeric_cols])

    # Align features with training set: keep only the expected columns, in the
    # expected order, adding any that are absent as all-zero columns.
    return features.reindex(columns=expected_columns, fill_value=0)


In [60]:
# Read the raw test set from test.csv (path is relative to the working directory).
df_test = pd.read_csv('test.csv')

In [62]:
# Impute missing values, then build the model-input features in one pass.
# NOTE: df_test is rebound to the engineered frame, so this cell expects the
# raw frame produced by the read_csv cell as its input.
df_test = engineer_features_test(clean_data(df_test), scaler, expected_columns)

# Predict, mapping the model's label 1 to 'Y' and every other label to 'N'.
predictions = ['N' if label != 1 else 'Y' for label in model.predict(df_test)]


In [66]:
# Display the Y/N predictions (the previous name `d` is not defined anywhere
# in this notebook and fails on a fresh kernel).
predictions

['Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'N',
 'Y',
 'Y',
 'Y',
 'N',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'Y',
 'N',
 'Y',
 'Y',
 'Y',
 'N',
 'N'