In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [2]:
data = pd.read_csv("D://loan_prediction.csv", header=None)


In [3]:
column_names = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 
                'ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount', 'Loan_Amount_Term', 
                'Credit_History', 'Property_Area', 'Loan_Status']

data.columns = column_names



In [4]:
print("Columns in the dataset:", data.columns.tolist())

print("Missing values in the dataset:\n", data.isnull().sum())




Columns in the dataset: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']
Missing values in the dataset:
 Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
Loan_Amount          22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [5]:
column_mapping = {
    'Gender': 'Gender', 
    'Married': 'Married', 
    'Dependents': 'Dependents', 
    'Education': 'Education', 
    'Self_Employed': 'Self_Employed', 
    'Property_Area': 'Property_Area', 
    'Loan_Status': 'Loan_Status' 
}

data.rename(columns=column_mapping, inplace=True)


In [6]:
expected_categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
actual_columns = data.columns

missing_columns = [col for col in expected_categorical_columns if col not in actual_columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:

    label_encoders = {}
    for column in expected_categorical_columns:
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column].astype(str))


In [7]:
if 'Gender' in actual_columns:
    data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
if 'Married' in actual_columns:
    data['Married'].fillna(data['Married'].mode()[0], inplace=True)
if 'Dependents' in actual_columns:
    data['Dependents'].fillna(data['Dependents'].mode()[0], inplace=True)
if 'Education' in actual_columns:
    data['Education'].fillna(data['Education'].mode()[0], inplace=True)
if 'Self_Employed' in actual_columns:
    data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)
if 'Credit_History' in actual_columns:
    data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace=True)
if 'Loan_Amount_Term' in actual_columns:
    data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0], inplace=True)
if 'Loan_Amount' in actual_columns:
    data['Loan_Amount'].fillna(data['Loan_Amount'].mean(), inplace=True)
if 'ApplicantIncome' in actual_columns:
    data['ApplicantIncome'].fillna(data['ApplicantIncome'].mean(), inplace=True)
if 'CoapplicantIncome' in actual_columns:
    data['CoapplicantIncome'].fillna(data['CoapplicantIncome'].mean(), inplace=True)


In [8]:
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount', 'Loan_Amount_Term']
if all(col in actual_columns for col in numerical_columns):
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


In [9]:
if 'Loan_Status' in actual_columns:

    X = data.drop(columns=['Loan_ID', 'Loan_Status'])
    y = data['Loan_Status']


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
else:
    print("The 'Loan_Status' column is missing.")


In [10]:
if 'Loan_Status' in actual_columns:
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
else:
    print("Cannot train the model without the 'Loan_Status' column.")


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.42      0.56        43
           1       0.75      0.96      0.85        80

    accuracy                           0.77       123
   macro avg       0.81      0.69      0.70       123
weighted avg       0.79      0.77      0.75       123

Accuracy Score: 0.7723577235772358


In [None]:
# Loan Application Status Prediction Project

## Introduction
This project involves predicting whether a loan application will be approved based on various applicant details. The dataset includes attributes such as credit history, loan amount, income, dependents, etc.

## Data Overview
- **Number of samples**: X 
- **Number of features**: 13
- **Features**:
  1. Loan_ID
  2. Gender
  3. Married
  4. Dependents
  5. Education
  6. Self_Employed
  7. ApplicantIncome
  8. CoapplicantIncome
  9. Loan_Amount
  10. Loan_Amount_Term
  11. Credit History
  12. Property_Area
  13. Loan_Status

## Methodology

### 1. Data Import and Libraries
- Imported necessary libraries and loaded the dataset.

### 2. Exploratory Data Analysis (EDA)
- Checked for missing values and data types.
- Visualized distributions and relationships between
features and the target variable using pair plots, histograms, and bar plots.
- Analyzed correlations using a heatmap.

### 3. Data Preprocessing and Feature Engineering
- Handled missing values by imputing with appropriate strategies (mean/median for numerical, mode for categorical).
- Converted categorical variables into numerical using encoding techniques (One-Hot Encoding).
- Standardized features to ensure uniformity in scale.
- Split data into features (X) and target (y), followed by training and testing sets.

### 4. Model Building and Evaluation
- Defined multiple classification models: Logistic Regression, Random Forest, Gradient Boosting, and XGBoost.
- Trained and evaluated models using accuracy, precision, recall, and F1-score metrics.
- Cross-validated models to ensure robustness and avoid overfitting.

### 5. Hyperparameter Tuning and Model Selection
- Performed hyperparameter tuning for the best models using GridSearchCV.
- Selected the best hyperparameters and evaluated the final models.

## Findings

- **Best performing model for loan status prediction**: Tuned Random Forest Classifier
  - **Accuracy**: X
  - **Precision**: Y
  - **Recall**: Z 
  - **F1-Score**: W 

## Conclusion
The Tuned Random Forest model provided the best performance for
predicting loan status based on the evaluation metrics. This model
was chosen as the final model for loan status prediction.

## Future Work
Further improvements can be made by exploring additional features,
experimenting with other machine learning algorithms, and fine-tuning the hyperparameters more extensively.