# Import Libraries
- Pandas for data loading and analysis
- Numpy for numerical operations 
- Matplotlib for visualizations
- Seaborn for visualizations

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Dataset


In [30]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs on a machine where this exact file exists.
TRAIN_CSV_PATH = r"E:\Project\AI_Powered_Loan_Eligibility_Advisor\train.csv"

df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)

# Preview the first five rows to confirm the file parsed as expected.
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Check Data Info and Missing values
- df.info() -> shows the data type of each column and tells you where missing values exist.
- df.isnull().sum() -> counts the missing values in each column; we need this to decide how to clean the data.

In [31]:
# Print each column's dtype and non-null count.
df.info()

# Per-column count of missing values; the bare name on the last line makes it
# the cell's displayed result.
missing_per_column = df.isnull().sum()
missing_per_column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Handle Missing values
We must fill missing values because ML models cannot handle NaN.


- Categorical columns (string values)
    - Filled with the mode, because a missing gender, dependents count, or marital status cannot be averaged.

- Numerical columns (integers or floats)
    - Filled with the median, because income and loan amount may contain outliers (observations that lie abnormally far from the other values in the sample), and the median is less affected by them than the mean.

- Credit history
    - It is a binary value (0/1), so we use the mode (the most common value).

In [32]:
# Step 4: Handle missing values (safe method without warnings)

# Work on an independent copy so the assignments below never touch a view.
df = df.copy()

# Categorical columns, plus the binary Credit_History flag, are filled with
# their most frequent value (the first entry returned by mode()).
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

# Continuous loan columns are filled with the column median.
for column in ['LoanAmount', 'Loan_Amount_Term']:
    df[column] = df[column].fillna(df[column].median())


In [33]:
# Re-check after imputation: every column is expected to report 0.
remaining_missing = df.isnull().sum()
remaining_missing


Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# Drop the Loan_ID Column
- We never use Loan_ID for prediction. It has no meaning for eligibility.

In [34]:
# Remove the identifier column; it is never used as a predictor.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once Loan_ID is already gone is a
# no-op rather than a KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')

In [35]:
# Show the first five rows to confirm Loan_ID is no longer present.
df.head(5)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Encoding the Categorical Columns
- Before training the model, all text-based columns (like "Male", "Urban", "Graduate") must be converted into numbers because machine-learning models cannot understand text.

In [36]:
from sklearn.preprocessing import LabelEncoder

# Step 5: Convert categorical columns into numbers
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education',
                    'Self_Employed', 'Property_Area', 'Loan_Status']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# discard every column's label -> code mapping except the last one, and that
# mapping is exactly what is needed to encode new applicants consistently.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Human-readable lookup, e.g. label_mappings['Education'] maps each original
# label to its integer code. LabelEncoder assigns codes in sorted label order.
# NOTE: re-running this cell on the already-encoded frame refits the encoders
# on integers and loses the original labels; re-run from the loading cell.
label_mappings = {
    col: {label: code for code, label in enumerate(encoder.classes_)}
    for col, encoder in encoders.items()
}

# Backward compatibility: `le` used to end up fitted on the last column.
le = encoders['Loan_Status']


In [37]:
# Show the first five rows; the categorical columns now hold integer codes.
df.head(5)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,128.0,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


# Split the dataset (X, y)
- We separate the features (input variables) and the target (Loan_Status).


In [38]:
# Features: every column except the target. Target: the Loan_Status column.
x = df.drop(columns='Loan_Status')
y = df['Loan_Status']

# Preview both pieces; the tuple is the cell's displayed value.
(x.head(), y.head())


(   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
 0       1        0           0          0              0             5849   
 1       1        1           1          0              0             4583   
 2       1        1           0          0              1             3000   
 3       1        1           0          1              0             2583   
 4       1        0           0          0              0             6000   
 
    CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
 0                0.0       128.0             360.0             1.0   
 1             1508.0       128.0             360.0             1.0   
 2                0.0        66.0             360.0             1.0   
 3             2358.0       120.0             360.0             1.0   
 4                0.0       141.0             360.0             1.0   
 
    Property_Area  
 0              2  
 1              0  
 2              2  
 3              2  
 4

# Train/Test Split
We need to split the data into:
- Training data → used to teach the model
- Testing data → used to check how well the model learned
------------------------------------------------------------------
- train_test_split separates data randomly.
- test_size=0.2 → 20% is for testing.
- random_state=42 → keeps the split the same every time, for reproducibility.
----------------------------------------------------------
- 491 rows used for training
- 123 rows used for testing
- 11 columns (features)

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Shapes of the two feature matrices, shown as the cell's output.
(x_train.shape, x_test.shape)

((491, 11), (123, 11))

# Train the Machine Learning Model
We will use Logistic Regression, because:
- It works well for binary classification (Y/N).
- It is simple and accurate for this dataset.
- Easy to integrate with Flask.

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 8: Train the model
# NOTE(review): the recorded output of this cell still shows a convergence
# warning at max_iter=5000, and the warning itself suggests scaling the data.
# The features are deliberately left unscaled here to keep the fitted model
# unchanged.
model = LogisticRegression(max_iter=5000).fit(x_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(x_test)

# Fraction of held-out rows predicted correctly; displayed as the cell output.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7886178861788617

# Save the Model
- This creates model.pkl, the file the app will load to predict loan eligibility.

In [60]:
import pickle

MODEL_PATH = "model.pkl"

# Persist the model trained above so the app can load it later. Previously this
# cell only *loaded* model.pkl, which replaced the freshly trained model with
# whatever was on disk and failed on a fresh run if the file did not exist.
# The context manager guarantees the file handle is closed.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)

# Round-trip check: reload the file that was just written.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(MODEL_PATH, "rb") as model_file:
    model = pickle.load(model_file)

# Column names (and order) the model expects at prediction time.
print(model.feature_names_in_)



['Gender' 'Married' 'Dependents' 'Education' 'Self_Employed'
 'ApplicantIncome' 'CoapplicantIncome' 'LoanAmount' 'Loan_Amount_Term'
 'Credit_History' 'Property_Area']


In [45]:
# Replace these with actual user input or test values.
# Categorical fields must use the same integer codes as the training data.
# The codes below are read off the encoded preview shown earlier in this
# notebook (labels are numbered in sorted order).

# --- Applicant profile ---
Gender_value = 1            # Male=1, Female=0
Married_value = 0           # Yes=1, No=0
Dependents_value = 2        # encoded dependents category ('0'->0, '1'->1; presumably '2'->2, '3+'->3 - verify)
Education_value = 1         # Graduate=0, Not Graduate=1
Self_Employed_value = 0     # Yes=1, No=0

# --- Income and loan figures (same units as the training CSV) ---
ApplicantIncome_value = 5000
CoapplicantIncome_value = 2000
LoanAmount_value = 150
Loan_Amount_Term_value = 360

# --- Credit and property ---
Credit_History_value = 1
Property_Area_value = 2     # 0=Rural, 1=Semiurban, 2=Urban

# One value per training column, in the same order as the training columns.
features = [
    Gender_value,
    Married_value,
    Dependents_value,
    Education_value,
    Self_Employed_value,
    ApplicantIncome_value,
    CoapplicantIncome_value,
    LoanAmount_value,
    Loan_Amount_Term_value,
    Credit_History_value,
    Property_Area_value,
]


In [46]:
import pandas as pd

# Column names in the order used for training.
train_columns = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
    'Loan_Amount_Term', 'Credit_History', 'Property_Area',
]

# Wrapping `features` in a list yields a single-row frame (one applicant).
input_df = pd.DataFrame(data=[features], columns=train_columns)


# Fix Flask Prediction Code

# ensuring consistency

In [47]:
# The value list and the column list must be the same length.
for sequence in (features, train_columns):
    print(len(sequence))  # Should be 11


11
11


In [48]:
import pandas as pd

# Suppose these were your training columns
train_columns = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
    'Loan_Amount_Term', 'Credit_History', 'Property_Area',
]

# Build a one-row frame with those column names, then ask the model for a label.
input_df = pd.DataFrame(data=[features], columns=train_columns)
prediction = model.predict(input_df)


In [51]:
# The model was fitted on label-encoded integer columns (its feature names are
# plain 'Gender', 'Married', ...), so the input must NOT be one-hot encoded.
# Calling pd.get_dummies here renamed these columns (e.g. 'Gender' -> 'Gender_1'),
# and the old "add missing columns as 0" loop then silently replaced the real
# categorical values with zeros before predicting.
categorical_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

expected_columns = list(x_train.columns)

# Fail loudly instead of inventing values for columns that are absent.
missing_columns = [col for col in expected_columns if col not in input_df.columns]
if missing_columns:
    raise ValueError(f"input_df is missing training columns: {missing_columns}")

# Reorder columns to match training data
input_df = input_df[expected_columns]

prediction = model.predict(input_df)


In [52]:
# Number of coefficients per class row = number of features the model expects.
expected_feature_count = model.coef_.shape[1]
provided_feature_count = len(features)

print("Expected features:", expected_feature_count)
print("Input features:", provided_feature_count)


Expected features: 11
Input features: 11


In [53]:
# Run the model on the current single-row input frame and print the predicted
# label array (one entry per input row).
prediction = model.predict(input_df)
print("Prediction:", prediction)


Prediction: [1]


In [54]:
# Print the dtype of each input column as a quick check that all are numeric.
print(input_df.dtypes)


Gender               int64
Married              int64
Dependents           int64
Education            int64
Self_Employed        int64
ApplicantIncome      int64
CoapplicantIncome    int64
LoanAmount           int64
Loan_Amount_Term     int64
Credit_History       int64
Property_Area        int64
dtype: object


In [55]:
# Select the input columns in the same order as the training frame.
training_order = x_train.columns
input_df = input_df.loc[:, training_order]

# Predict for the aligned row and show the resulting label array.
prediction = model.predict(input_df)
print("Prediction:", prediction)


Prediction: [1]


In [58]:
import pandas as pd

# List of features your model was trained on
TRAIN_COLUMNS = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
                 'Credit_History', 'Property_Area']

def make_prediction(model, input_data: dict):
    """
    Predict a single label from one applicant's already-encoded feature values.

    model: trained scikit-learn model (anything with a ``predict`` method)
    input_data: dict containing the 11 features named in TRAIN_COLUMNS, each
        convertible to float. Key order does not matter; keys that are not in
        TRAIN_COLUMNS are ignored.

    Returns the first (only) element of ``model.predict`` for the one-row input.

    Raises ValueError if a required feature is missing or null, or if a value
    cannot be converted to float.
    """
    # Without this check a missing key would silently become a NaN column.
    missing = [name for name in TRAIN_COLUMNS if name not in input_data]
    if missing:
        raise ValueError(f"input_data is missing required features: {missing}")

    # Ensure DataFrame matches training columns (selection and order)
    input_df = pd.DataFrame([input_data], columns=TRAIN_COLUMNS)

    # Convert all to numeric; raises ValueError on non-numeric values
    input_df = input_df.astype(float)

    # None/NaN values survive the float conversion, so reject them explicitly.
    null_columns = [name for name in TRAIN_COLUMNS if input_df[name].isna().any()]
    if null_columns:
        raise ValueError(f"input_data has null values for features: {null_columns}")

    # Predict
    prediction = model.predict(input_df)
    return prediction[0]  # return single prediction


In [59]:
import streamlit as st

# Suppose these are values collected from Streamlit sidebar
user_input = dict(
    Gender=1,
    Married=0,
    Dependents=2,
    Education=1,
    Self_Employed=0,
    ApplicantIncome=5000,
    CoapplicantIncome=2000,
    LoanAmount=150,
    Loan_Amount_Term=360,
    Credit_History=1,
    Property_Area=2,
)

# Score the applicant with the helper defined earlier and render the result.
prediction = make_prediction(model, user_input)
st.write("Prediction:", prediction)


