In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split 

In [2]:
# Load the dataset
df = pd.read_csv("C:\\Users\\asaha\\Downloads\\archive\\Training Dataset.csv")

In [3]:
# Initial inspection
print(df.head())

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None


In [5]:
print(df.describe())

       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         614.000000  592.000000         600.00000   
mean       5403.459283        1621.245798  146.412162         342.00000   
std        6109.041673        2926.248369   85.587325          65.12041   
min         150.000000           0.000000    9.000000          12.00000   
25%        2877.500000           0.000000  100.000000         360.00000   
50%        3812.500000        1188.500000  128.000000         360.00000   
75%        5795.000000        2297.250000  168.000000         360.00000   
max       81000.000000       41667.000000  700.000000         480.00000   

       Credit_History  
count      564.000000  
mean         0.842199  
std          0.364878  
min          0.000000  
25%          1.000000  
50%          1.000000  
75%          1.000000  
max          1.000000  


In [6]:
# Handling missing values
# For categorical features: Use mode (most frequent value)
cat_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
imputer_cat = SimpleImputer(strategy='most_frequent')
df[cat_features] = imputer_cat.fit_transform(df[cat_features])

In [7]:
# For numerical features: Use median
num_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
imputer_num = SimpleImputer(strategy='median')
df[num_features] = imputer_num.fit_transform(df[num_features])

In [8]:
# Encoding categorical variables
# Label Encoding for binary categorical variables
label_enc = LabelEncoder()
df['Gender'] = label_enc.fit_transform(df['Gender'])
df['Married'] = label_enc.fit_transform(df['Married'])
df['Self_Employed'] = label_enc.fit_transform(df['Self_Employed'])

In [9]:
# One-Hot Encoding for multi-class categorical variables
df = pd.get_dummies(df, columns=['Dependents', 'Education', 'Property_Area'])

In [10]:
# Feature Scaling
# Scaling numerical features
scaler = StandardScaler()
df[num_features] = scaler.fit_transform(df[num_features])

In [11]:
# Feature Engineering
# Creating new features
# Check which columns exist after one-hot encoding
print(df.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Self_Employed', 'ApplicantIncome',
       'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Loan_Status', 'Dependents_0', 'Dependents_1', 'Dependents_2',
       'Dependents_3+', 'Education_Graduate', 'Education_Not Graduate',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')


In [12]:
# Handle missing dependents columns if they are not present
dependents_columns = ['Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3']
for col in dependents_columns:
    if col not in df.columns:
        df[col] = 0

In [13]:
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df['IncomePerPerson'] = df['TotalIncome'] / (df['Dependents_0'] + df['Dependents_1'] + df['Dependents_2'] + df['Dependents_3'] + 1)

In [14]:
# Drop the old or less useful features
df = df.drop(columns=['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'TotalIncome'])

In [15]:
# # Final inspection
print(df.head())

   Gender  Married  Self_Employed  LoanAmount  Loan_Amount_Term  \
0       1        0              0   -0.211241          0.273231   
1       1        1              0   -0.211241          0.273231   
2       1        1              1   -0.948996          0.273231   
3       1        1              0   -0.306435          0.273231   
4       1        0              0   -0.056551          0.273231   

   Credit_History Loan_Status  Dependents_0  Dependents_1  Dependents_2  \
0        0.411733           Y          True         False         False   
1        0.411733           N         False          True         False   
2        0.411733           Y          True         False         False   
3        0.411733           Y          True         False         False   
4        0.411733           Y          True         False         False   

   Dependents_3+  Education_Graduate  Education_Not Graduate  \
0          False                True                   False   
1          False  

In [16]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   614 non-null    int32  
 1   Married                  614 non-null    int32  
 2   Self_Employed            614 non-null    int32  
 3   LoanAmount               614 non-null    float64
 4   Loan_Amount_Term         614 non-null    float64
 5   Credit_History           614 non-null    float64
 6   Loan_Status              614 non-null    object 
 7   Dependents_0             614 non-null    bool   
 8   Dependents_1             614 non-null    bool   
 9   Dependents_2             614 non-null    bool   
 10  Dependents_3+            614 non-null    bool   
 11  Education_Graduate       614 non-null    bool   
 12  Education_Not Graduate   614 non-null    bool   
 13  Property_Area_Rural      614 non-null    bool   
 14  Property_Area_Semiurban  6

In [17]:
# Train-Test Split
X = df.drop(columns=['Loan_Status'])
y = df['Loan_Status']

In [18]:
# Convert target variable to binary
y = label_enc.fit_transform(y)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print("Train features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
print("Train target shape:", y_train.shape)
print("Test target shape:", y_test.shape)

Train features shape: (491, 17)
Test features shape: (123, 17)
Train target shape: (491,)
Test target shape: (123,)
