<a href="https://colab.research.google.com/github/Althaf0097/Demo/blob/main/Loan_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Loan Eligibility for Dream Housing Finance company

#### Dream Housing Finance company deals in all kinds of home loans. They have a presence across all urban, semi-urban and rural areas. A customer first applies for a home loan, and the company then validates the customer's eligibility for the loan. The company wants to automate the loan eligibility process (in real time) based on the customer details provided while filling in the online application form. These details are Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History and others. To automate this process, they have provided a dataset to identify the customer segments that are eligible for the loan amount, so that they can specifically target these customers.

# Loading the Dataset

In [121]:
# Read the raw applicant CSV into a DataFrame; the following cells inspect it
# to decide which columns to drop or preprocess.

import pandas as pd

# Location of the CSV file to read
file_path = '/content/test_lAUu6dG.csv'

# Parse the CSV into the working frame used by the rest of the notebook
data = pd.read_csv(filepath_or_buffer=file_path)



In [122]:
# Display the first few rows of the dataset to understand its structure
# (column names and the kind of values each column holds)
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [123]:
# List each column's dtype and non-null count to spot columns with missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [124]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [125]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [126]:
# Drop the identifier column; it is not used as a model feature
data_clean = data.drop(columns=['Loan_ID'])

# Replacement value for missing entries, keyed by column name.
# Columns that are not listed here are left untouched.
fill_values = {
    'Credit_History': 0,
    'Self_Employed': 'No',
    'Dependents': 'other',
    'LoanAmount': 0,
    'Loan_Amount_Term': 0,
    'Gender': 'other',
    'Married': 'No',
}

# Apply all replacements in one pass
data_clean = data_clean.fillna(fill_values)


In [127]:
# Count the remaining missing values per column after filling
data_clean.isna().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [128]:
# Encoding categorical variables
# Encode the columns of data_clean (not the raw `data` frame): data_clean holds
# the missing-value replacements, whereas the raw columns still contain NaN.
label_enc = LabelEncoder()  # kept defined because later cells call label_enc directly
label_encoders = {}  # one fitted encoder per column, so integer codes can be mapped back to labels
for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']:
    label_encoders[col] = LabelEncoder()
    data_clean[col] = label_encoders[col].fit_transform(data_clean[col])

In [129]:
# Compute the quartiles and inter-quartile range used by the IQR outlier rule
# (for 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount')
iqr_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
iqr_frame = data_clean[iqr_columns]

Q1 = iqr_frame.quantile(0.25)  # 25th percentile of each column
Q3 = iqr_frame.quantile(0.75)  # 75th percentile of each column
IQR = Q3 - Q1                  # spread of the middle 50% of each column

In [130]:
# Remove outliers
# A row is an outlier when any of the three columns lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
outlier_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

# The test is evaluated on the raw values from `data`, restricted to the rows
# still present in data_clean. Selecting by data_clean.index keeps the mask
# aligned with the frame being filtered, so the cell can be re-run safely.
# Comparisons against NaN evaluate to False, so rows whose raw value was
# missing are never flagged here.
raw_values = data.loc[data_clean.index, outlier_columns]
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((raw_values < lower_fence) | (raw_values > upper_fence)).any(axis=1)

# .copy() makes the result an independent frame, so later column assignments
# on data_clean do not trigger pandas' SettingWithCopyWarning.
data_clean = data_clean[~is_outlier].copy()


In [131]:
# Feature Scaling
# Standardise the three continuous columns; the fitted scaler is kept in
# `scaler` so the same transformation can be reused.
scaler = StandardScaler()
scaled_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
scaled_values = scaler.fit_transform(data_clean[scaled_columns])
data_clean[scaled_columns] = scaled_values


In [132]:
# Display the cleaned, encoded and scaled frame
data_clean

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,1.277362,-0.962691,-0.334986,360.0,1.0,2
1,1,1,1,0,0,-0.496875,0.090624,0.054707,360.0,1.0,2
2,1,1,2,0,0,0.794211,0.301287,2.051884,360.0,1.0,2
3,1,1,2,0,0,-0.990763,0.825136,-0.578544,360.0,0.0,2
4,1,0,0,1,0,-0.362667,-0.962691,-1.114373,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
361,1,1,1,0,0,-1.038407,0.558998,-0.602900,360.0,1.0,1
362,1,1,3,1,1,0.129208,0.285136,-0.261919,360.0,1.0,2
363,1,1,0,0,0,0.229193,-0.464824,-0.213207,360.0,1.0,2
364,1,0,0,0,0,-0.380114,0.436814,0.054707,360.0,0.0,1


In [133]:
# Define features and target.
# NOTE: the loaded file has no 'Loan_Status' column, so a stand-in target is
# derived from 'Married'. 'Married' also remains among the features in X, so
# the target can be reproduced from that single column; metrics computed on
# this target do not measure real loan-eligibility performance.
# This call also re-fits label_enc on the 'Married' column of data_clean.
data_clean['Loan_Status'] = label_enc.fit_transform(data_clean['Married'])  # Temporary target simulation

# Target vector, then the feature matrix (every column except the target)
y = data_clean['Loan_Status']
X = data_clean.drop(columns='Loan_Status')

In [134]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [135]:
# Fit a random forest classifier on the training split.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [136]:
# Score the held-out split: predicted labels, overall accuracy, and the
# per-class text report (precision / recall / f1 / support).
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
classification_report_output = classification_report(y_true=y_test, y_pred=y_pred)

In [137]:
# Serialise the trained model to a .pkl file in the current working directory
model_filename = 'loan_eligibility_model.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(model, model_file)

In [138]:
# Show the test-split accuracy as a percentage
accuracy*100

100.0

In [139]:
# Print the per-class precision, recall, f1-score and support for the test split
print(classification_report_output)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        42

    accuracy                           1.00        65
   macro avg       1.00      1.00      1.00        65
weighted avg       1.00      1.00      1.00        65



In [140]:
# Show the file name the trained model was pickled to
model_filename

'loan_eligibility_model.pkl'

In [144]:
# Ensure 'Loan_Status' is not in the data_clean before predicting
# (errors='ignore' makes this a no-op when the column is already absent)
data_clean = data_clean.drop(columns='Loan_Status', errors='ignore')

# Make predictions using the trained model
loan_status_predictions = model.predict(data_clean)

# Rows were dropped during cleaning, so look up Loan_ID by the index labels of
# the rows that remain in data_clean
loan_ids_cleaned = data.loc[data_clean.index, 'Loan_ID']

# Map the encoded predictions (0/1) directly to 'No'/'Yes'.
# label_enc.inverse_transform cannot be used for this: label_enc was last
# fitted on the already-encoded 'Married' column, so it returns 0/1 rather
# than 'Y'/'N', and comparing those with 'Y' labelled every row 'N'.
loan_status_mapped = ['Yes' if status == 1 else 'No' for status in loan_status_predictions]

# Create a DataFrame with Loan_ID and Loan_Status
output_df = pd.DataFrame({
    'Loan_ID': loan_ids_cleaned,  # Loan_IDs corresponding to the cleaned dataset
    'Loan_Status': loan_status_mapped
})

# Save the result to a CSV file
output_file_path = 'loan_eligibility_predictions.csv'
output_df.to_csv(output_file_path, index=False)

# Print confirmation
print(f"CSV file with Loan_ID and Loan_Status saved at: {output_file_path}")


CSV file with Loan_ID and Loan_Status saved at: loan_eligibility_predictions.csv
