In [2]:
# Import the libraries we need
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the loan applications CSV into a pandas DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='loan_data.csv')

# Preview the first five rows as a quick sanity check that the load worked
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Report how many values are missing in each column before any imputation.
# NOTE: the fills below write back into `df`, so running this cell a second
# time will show zeros in this "before" listing as well.
print("Missing values before cleaning:")
print(df.isnull().sum())

# Columns treated as categories are filled with their most frequent value (mode).
# Credit_History holds numeric values in the preview but is mode-filled like the
# text columns.
for col in ('Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History'):
    df[col] = df[col].fillna(df[col].mode()[0])

# Continuous numeric columns are filled with their median.
for col in ('LoanAmount', 'Loan_Amount_Term'):
    df[col] = df[col].fillna(df[col].median())

# Report the per-column missing counts again to confirm the fills took effect
print("\nMissing values after cleaning:")
print(df.isnull().sum())

Missing values before cleaning:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Missing values after cleaning:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [5]:
# Import the tool for converting text to numbers
from sklearn.preprocessing import LabelEncoder

# Drop the Loan_ID column: it is an identifier, not a feature for the model
df = df.drop(columns='Loan_ID')

# List of text columns to convert
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']

# Fit ONE encoder PER column and keep every fitted encoder.
# Re-using a single LabelEncoder across the loop overwrites its learned
# classes on each iteration, so the text -> number mapping of every column
# except the last one would be lost and could not be re-applied to new data.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is the encoder fitted on the last column (Loan_Status);
# the mapping of any column is available as label_encoders[col].classes_.

# Check the first 5 rows to see the changes. Everything should be a number now!
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [8]:
# --- Step 4: Split the Data ---
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The target (y) is the Loan_Status column; every other column is a feature (X)
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# --- NEW: Scale the Data ---
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the testing sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- Step 5: Train the Model (using the scaled data) ---
# Logistic Regression, allowing up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

# Train on the scaled training features
model.fit(X_train_scaled, y_train)

print("Model trained successfully!")

Model trained successfully!


In [10]:
# Evaluation helpers
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out rows. The model was trained on scaled
# features, so it must be given the SCALED test set here as well.
y_pred = model.predict(X_test_scaled)

# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / f1, printed under its heading
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Model Accuracy: 78.86%

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



In [11]:
# Step 7: Predict on New Data

def _encode_applicant(applicant_df, encoding_map, feature_order):
    """Encode a raw applicant frame and align it with the training features.

    Args:
        applicant_df: DataFrame of raw applicant rows (categories still as text).
        encoding_map: ``{column: {category text: integer code}}`` for every
            text column that has to be converted.
        feature_order: Column names in the order the scaler/model were fitted on.

    Returns:
        A new DataFrame in which the text columns are replaced by their codes
        and the columns are arranged exactly as in ``feature_order``.

    Raises:
        ValueError: If the applicant's columns differ from ``feature_order``,
            or if a mapped column holds a value that has no code in
            ``encoding_map`` (including a missing value).
    """
    expected = list(feature_order)
    present = set(applicant_df.columns)
    if present != set(expected):
        missing = sorted(set(expected) - present)
        extra = sorted(present - set(expected))
        raise ValueError(
            f"Applicant columns do not match the training features: "
            f"missing={missing}, unexpected={extra}"
        )

    encoded = applicant_df.copy()
    for col, mapping in encoding_map.items():
        codes = encoded[col].map(mapping)
        # Series.map yields NaN for any value absent from the mapping; fail
        # loudly here instead of letting NaN flow into the scaler and model.
        unmapped = codes.isna()
        if unmapped.any():
            bad_values = encoded.loc[unmapped, col].tolist()
            raise ValueError(
                f"Column {col!r} has value(s) {bad_values} with no entry in the "
                f"encoding map; expected one of {sorted(mapping)}"
            )
        encoded[col] = codes

    # Put the columns in the order used at fit time rather than relying on
    # the key order of the input dictionary.
    return encoded[expected]


# We need the 'scaler', 'model' and 'X_train' created in the training cell.
# Let's create a sample of a new applicant's data.
new_applicant_data = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': '1',
    'Education': 'Graduate',
    'Self_Employed': 'No',
    'ApplicantIncome': 5800,
    'CoapplicantIncome': 0,
    'LoanAmount': 180,
    'Loan_Amount_Term': 360,
    'Credit_History': 1.0,
    'Property_Area': 'Semiurban'
}

# Convert this dictionary to a one-row pandas DataFrame
new_applicant_df = pd.DataFrame([new_applicant_data])

# IMPORTANT: We must apply the SAME preprocessing as the training data.

# 1. Encode the categorical features to numbers
# Note: For a real application, you would save and load the fitted encoder objects.
# For this project, the codes are written out by hand and must match the
# codes produced when the training data was encoded.
encoding_map = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Graduate': 0, 'Not Graduate': 1},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Urban': 2, 'Semiurban': 1, 'Rural': 0}
}
new_applicant_df = _encode_applicant(new_applicant_df, encoding_map, X_train.columns)

# 2. Scale the numerical features using the SAME scaler from before
# We only use .transform() here, not .fit_transform(), because we're not re-learning.
new_applicant_scaled = scaler.transform(new_applicant_df)

# 3. Make the final prediction
prediction = model.predict(new_applicant_scaled)
prediction_proba = model.predict_proba(new_applicant_scaled)

# 4. Interpret and print the result
# Label 1 corresponds to Loan_Status 'Y' (approved) and 0 to 'N' in the encoded data.
result = "Approved" if prediction[0] == 1 else "Not Approved"

print(f"The model's prediction for the new applicant is: {result}")
print(f"Probability of [Not Approved, Approved]: {prediction_proba}")

The model's prediction for the new applicant is: Approved
Probability of [Not Approved, Approved]: [[0.14893425 0.85106575]]
