In [900]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [901]:
df=pd.read_csv("loan_data_1.csv")

In [902]:
df.dtypes

Unnamed: 0             int64
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [903]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         381 non-null    int64  
 1   Loan_ID            381 non-null    object 
 2   Gender             376 non-null    object 
 3   Married            381 non-null    object 
 4   Dependents         373 non-null    object 
 5   Education          375 non-null    object 
 6   Self_Employed      360 non-null    object 
 7   ApplicantIncome    369 non-null    float64
 8   CoapplicantIncome  363 non-null    float64
 9   LoanAmount         373 non-null    float64
 10  Loan_Amount_Term   370 non-null    float64
 11  Credit_History     351 non-null    float64
 12  Property_Area      381 non-null    object 
 13  Loan_Status        381 non-null    object 
dtypes: float64(5), int64(1), object(8)
memory usage: 41.8+ KB


In [904]:
df.isnull().sum()

Unnamed: 0            0
Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             6
Self_Employed        21
ApplicantIncome      12
CoapplicantIncome    18
LoanAmount            8
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [905]:
# Convert the object columns to numeric, forcing errors to NaNs
df['Dependents'] = pd.to_numeric(df['Dependents'], errors='coerce')


In [906]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,381.0,345.0,369.0,363.0,373.0,370.0,351.0
mean,190.0,0.492754,3563.422764,1267.005289,104.914209,340.864865,0.837607
std,110.129469,0.770518,1427.371257,2388.048316,28.484822,68.549257,0.369338
min,0.0,0.0,150.0,0.0,9.0,12.0,0.0
25%,95.0,0.0,2583.0,0.0,90.0,360.0,1.0
50%,190.0,0.0,3326.0,830.0,110.0,360.0,1.0
75%,285.0,1.0,4226.0,2008.0,127.0,360.0,1.0
max,380.0,2.0,9703.0,33837.0,150.0,480.0,1.0


In [907]:
# Select numerical columns
numerical_cols = df.select_dtypes(include=[np.number])
# Compute the mean of each numerical column
means = numerical_cols.mean()
# Replace null values in each numerical column with its mean
df[numerical_cols.columns] = numerical_cols.apply(lambda x: x.fillna(x.mean()))

In [908]:
# Replace null values with mode for each column(catagorical data)
for column in df.columns:
    mode_value = df[column].mode()[0]  # Calculate mode for the column
    df[column].fillna(mode_value, inplace=True)  # Fill NaN values with mode

In [909]:
df.isnull().sum()

Unnamed: 0           0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [910]:
# Convert categorical variables to numeric
label_encoders = {}
for column in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Handle 'Dependents' separately if it has '3+' as a category
df['Dependents'] = df['Dependents'].replace('3+', 3).astype(int)


In [911]:

# Select numerical columns for outlier fixing
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

# Calculate IQR for each column
Q1 = df[numerical_cols].quantile(0.25)
Q3 = df[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Define outliers using IQR rule
outliers_lower = df[numerical_cols] < (Q1 - 1.5 * IQR)
outliers_upper = df[numerical_cols] > (Q3 + 1.5 * IQR)

# Fix outliers by replacing with median
for col in numerical_cols:
    median = df[col].median()
    df.loc[outliers_lower[col], col] = median
    df.loc[outliers_upper[col], col] = median

# Display updated data with fixed outliers
print("Data after fixing outliers with median:")
df[numerical_cols].describe()

Data after fixing outliers with median:


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
count,381.0,381.0,381.0,381.0
mean,3400.105179,1053.052533,106.51001,360.0
std,1073.465086,1133.778069,25.848411,0.0
min,645.0,0.0,35.0,360.0
25%,2600.0,0.0,94.0,360.0
50%,3357.0,1041.0,110.0,360.0
75%,3941.0,1857.0,127.0,360.0
max,6500.0,4490.0,150.0,360.0


In [912]:
df

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP001003,1,1,1,0,0,4583.0,1508.000000,128.0,360.0,1.0,0,0
1,1,LP001005,1,1,0,0,1,3000.0,0.000000,66.0,360.0,1.0,2,1
2,2,LP001006,1,1,0,1,0,2583.0,2358.000000,120.0,360.0,1.0,2,1
3,3,LP001008,1,0,0,0,0,6000.0,0.000000,141.0,360.0,1.0,2,1
4,4,LP001013,1,1,0,1,0,2333.0,1516.000000,95.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,376,LP002953,1,1,0,0,0,5703.0,0.000000,128.0,360.0,1.0,2,1
377,377,LP002974,1,1,0,0,0,3232.0,1267.005289,108.0,360.0,1.0,0,1
378,378,LP002978,0,0,0,0,0,2900.0,0.000000,71.0,360.0,1.0,0,1
379,379,LP002979,1,1,0,0,0,4106.0,0.000000,40.0,360.0,1.0,0,1


In [913]:
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
y = df['Loan_Status']


In [914]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [915]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [916]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [917]:
y_pred = model.predict(X_test)

In [918]:
print("Accuracy:", accuracy_score(y_test, y_pred))
#print("Classification Report:\n", classification_report(y_test, y_pred))
#print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8173913043478261


In [919]:

# Convert predictions back to original encoding
y_pred_labels = label_encoders['Loan_Status'].inverse_transform(y_pred)

# Create a DataFrame for predictions
predicted_df = df.iloc[y_test.index].copy()
predicted_df['Predicted_Loan_Status'] = y_pred_labels

In [920]:
predicted_df

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Predicted_Loan_Status
266,266,LP002348,1,1,0,0,0,5829.0,0.0,138.0,360.0,1.0,0,1,Y
192,192,LP001977,1,1,1,0,0,1625.0,1803.0,96.0,360.0,1.0,2,1,Y
46,46,LP001206,1,1,0,0,0,3029.0,0.0,99.0,360.0,1.0,2,1,Y
55,55,LP001259,1,1,1,0,1,1000.0,3022.0,110.0,360.0,1.0,2,0,Y
57,57,LP001265,0,0,0,0,0,3846.0,0.0,111.0,360.0,1.0,1,1,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,17,LP001068,1,1,0,0,0,2799.0,2253.0,122.0,360.0,1.0,1,1,Y
157,157,LP001836,0,0,2,0,0,3427.0,0.0,138.0,360.0,1.0,2,0,Y
24,24,LP001109,1,1,0,0,0,1828.0,1330.0,100.0,360.0,0.0,2,0,N
116,116,LP001658,1,0,0,0,0,3858.0,0.0,76.0,360.0,1.0,1,1,Y


In [921]:
# Convert back to categorical variables
for column in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
    predicted_df[column] = label_encoders[column].inverse_transform(predicted_df[column])

In [922]:
predicted_df

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Predicted_Loan_Status
266,266,LP002348,Male,Yes,0,Graduate,No,5829.0,0.0,138.0,360.0,1.0,Rural,Y,Y
192,192,LP001977,Male,Yes,1,Graduate,No,1625.0,1803.0,96.0,360.0,1.0,Urban,Y,Y
46,46,LP001206,Male,Yes,0,Graduate,No,3029.0,0.0,99.0,360.0,1.0,Urban,Y,Y
55,55,LP001259,Male,Yes,1,Graduate,Yes,1000.0,3022.0,110.0,360.0,1.0,Urban,N,Y
57,57,LP001265,Female,No,0,Graduate,No,3846.0,0.0,111.0,360.0,1.0,Semiurban,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,17,LP001068,Male,Yes,0,Graduate,No,2799.0,2253.0,122.0,360.0,1.0,Semiurban,Y,Y
157,157,LP001836,Female,No,2,Graduate,No,3427.0,0.0,138.0,360.0,1.0,Urban,N,Y
24,24,LP001109,Male,Yes,0,Graduate,No,1828.0,1330.0,100.0,360.0,0.0,Urban,N,N
116,116,LP001658,Male,No,0,Graduate,No,3858.0,0.0,76.0,360.0,1.0,Semiurban,Y,Y


In [923]:
# Feature engineering
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

# Select features and target
features = ['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History']
X = df[features]
y = df['LoanAmount']  # Target variable is LoanAmount

# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the entire dataset
loan_amount_predictions = model.predict(X)

# Add predictions to the original dataframe
df['loan_amount_prediction'] = loan_amount_predictions

In [924]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,loan_amount_prediction
0,0,LP001003,1,1,1,0,0,4583.0,1508.0,128.0,360.0,1.0,0,0,6091.0,120.483333
1,1,LP001005,1,1,0,0,1,3000.0,0.0,66.0,360.0,1.0,2,1,3000.0,68.564853
2,2,LP001006,1,1,0,1,0,2583.0,2358.0,120.0,360.0,1.0,2,1,4941.0,124.26
3,3,LP001008,1,0,0,0,0,6000.0,0.0,141.0,360.0,1.0,2,1,6000.0,135.52
4,4,LP001013,1,1,0,1,0,2333.0,1516.0,95.0,360.0,1.0,2,1,3849.0,102.64
5,5,LP001024,1,1,2,0,0,3200.0,700.0,70.0,360.0,1.0,2,1,3900.0,94.9
6,6,LP001027,1,1,2,0,0,2500.0,1840.0,109.0,360.0,1.0,2,1,4340.0,114.04
7,7,LP001029,1,0,0,0,0,1853.0,2840.0,114.0,360.0,1.0,0,0,4693.0,113.04
8,8,LP001030,1,1,2,0,0,1299.0,1086.0,110.0,360.0,1.0,2,1,2385.0,99.41
9,9,LP001032,1,0,0,0,0,4950.0,0.0,125.0,360.0,1.0,2,1,4950.0,128.68


In [931]:
final_df = pd.concat([predicted_df,df[['TotalIncome','loan_amount_prediction']]], axis=1)

In [939]:
final_df.head(20)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Predicted_Loan_Status,TotalIncome,loan_amount_prediction
266,266.0,LP002348,Male,Yes,0.0,Graduate,No,5829.0,0.0,138.0,360.0,1.0,Rural,Y,Y,5829.0,140.766667
192,192.0,LP001977,Male,Yes,1.0,Graduate,No,1625.0,1803.0,96.0,360.0,1.0,Urban,Y,Y,3428.0,92.97
46,46.0,LP001206,Male,Yes,0.0,Graduate,No,3029.0,0.0,99.0,360.0,1.0,Urban,Y,Y,3029.0,70.679142
55,55.0,LP001259,Male,Yes,1.0,Graduate,Yes,1000.0,3022.0,110.0,360.0,1.0,Urban,N,Y,4022.0,112.67
57,57.0,LP001265,Female,No,0.0,Graduate,No,3846.0,0.0,111.0,360.0,1.0,Semiurban,Y,Y,3846.0,91.33
39,39.0,LP001164,Female,No,0.0,Graduate,No,4230.0,0.0,112.0,360.0,1.0,Semiurban,N,Y,4230.0,115.132279
163,163.0,LP001871,Female,No,0.0,Graduate,No,3357.0,0.0,120.0,360.0,1.0,Rural,Y,Y,3357.0,121.73789
364,364.0,LP002893,Male,No,0.0,Graduate,No,1836.0,1041.0,90.0,360.0,1.0,Urban,N,Y,2877.0,88.95
210,210.0,LP002097,Male,No,1.0,Graduate,No,4384.0,1793.0,117.0,360.0,1.0,Urban,Y,Y,6177.0,126.94
94,94.0,LP001532,Male,Yes,2.0,Not Graduate,No,2281.0,0.0,113.0,360.0,1.0,Rural,N,Y,2281.0,61.2925
