In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("data/LoanApprovalPrediction.csv")

## Duplicate display

In [4]:
# Find and display the number of duplicates based on all columns
duplicate_counts = df.duplicated().sum()
print(f'Number of duplicate rows: {duplicate_counts}')

# Display the duplicate rows in a new DataFrame
duplicates_df = df[df.duplicated(keep=False)]
print('\nDuplicate Rows:')
print(duplicates_df)

Number of duplicate rows: 3

Duplicate Rows:
     Loan_ID Gender Married  Dependents Education Self_Employed  \
0   LP001002   Male      No         0.0  Graduate            No   
1   LP001002   Male      No         0.0  Graduate            No   
5   LP001011   Male     Yes         2.0  Graduate           Yes   
8   LP001011   Male     Yes         2.0  Graduate           Yes   
15  LP001032   Male      No         0.0  Graduate            No   
16  LP001032   Male      No         0.0  Graduate            No   

    ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0              5849                0.0         NaN             360.0   
1              5849                0.0         NaN             360.0   
5              5417             4196.0       267.0             360.0   
8              5417             4196.0       267.0             360.0   
15             4950                0.0       125.0             360.0   
16             4950                0.0       125.0   

In [4]:
import numpy as np
from numbers import Number
import datetime


## Inconsistent values

In [5]:
columns = list(df.columns)

def get_col_counts(x):
    numeric_counts = 0
    object_counts = 0
    date_types = 0

    for val in x:
        if isinstance(val, Number):
            numeric_counts+=1
        elif isinstance(val, str):
            object_counts+=1
        elif isinstance(val, datetime.date):
            date_types+=1

    return numeric_counts, object_counts, date_types

In [6]:
numeric_counts_ = []
object_counts_ = []
date_types_ = []
inconsistencies_ = []
for col in columns:
    numeric_counts, object_counts, date_types = get_col_counts(df[col])    
    inconsistency = 100-(max(numeric_counts, object_counts, date_types)/len(df))*100    
    numeric_counts_.append(numeric_counts)
    object_counts_.append(object_counts)
    date_types_.append(date_types)
    inconsistencies_.append(inconsistency)

inconsistency_df = pd.DataFrame(columns=['column', 'numeric_vals', 'object_vals', 'date_vals', 'inconsistency_percentage'])
inconsistency_df['column'] = columns
inconsistency_df['numeric_vals'] = numeric_counts_
inconsistency_df['object_vals'] = object_counts_
inconsistency_df['date_vals'] = date_types_
inconsistency_df['inconsistency_percentage'] = inconsistencies_

In [8]:
inconsistency_df

Unnamed: 0,column,numeric_vals,object_vals,date_vals,inconsistency_percentage
0,Loan_ID,0,598,0,0.0
1,Gender,0,598,0,0.0
2,Married,0,598,0,0.0
3,Dependents,598,0,0,0.0
4,Education,0,598,0,0.0
5,Self_Employed,0,598,0,0.0
6,ApplicantIncome,598,0,0,0.0
7,CoapplicantIncome,598,0,0,0.0
8,LoanAmount,598,0,0,0.0
9,Loan_Amount_Term,598,0,0,0.0


In [9]:
import plotly.express as px

In [10]:
numeric_columns = df.select_dtypes(include=['number']).columns

In [5]:
df.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [17]:
df[numeric_columns].describe()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,586.0,598.0,598.0,577.0,584.0,549.0
mean,0.755973,5292.252508,1631.499866,144.968804,341.917808,0.843352
std,1.007751,5807.265364,2953.315785,82.704182,65.205994,0.3638
min,0.0,150.0,0.0,9.0,12.0,0.0
25%,0.0,2877.5,0.0,100.0,360.0,1.0
50%,0.0,3806.0,1211.5,127.0,360.0,1.0
75%,1.75,5746.0,2324.0,167.0,360.0,1.0
max,3.0,81000.0,41667.0,650.0,480.0,1.0


In [7]:
import plotly.express as px

In [10]:
df = df.iloc[:,1:]

In [12]:
duplicate_action = 'keep'
null_action_radios = ['drop', 'impute', 'drop']
incon_action_radios = ['drop', 'impute']

In [15]:
columns = list(df.columns)

In [26]:
msg = ""
if duplicate_action == "keep":
    msg+="Keeping duplicate columns\n"
else:
    msg+="Dropping duplicate columns\n"

msg += "Null value handling operations performed"    
for index, action in enumerate(null_action_radios):
    msg += "%sing column - %s\n "%(action, columns[index])

msg += "Inconsistency value handling operations performed"    
for index, action in enumerate(incon_action_radios):
    msg += "%sing column - %s\n "%(action, columns[index])


In [27]:
msg

'Keeping duplicate columns\nNull value handling operations performeddroping column - Gender\n imputeing column - Married\n droping column - Dependents\n Inconsistency value handling operations performeddroping column - Gender\n imputeing column - Married\n '

In [13]:
for i, col_name in df.columns:

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')