In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas_profiling
%matplotlib inline

In [2]:
# Use seaborn's 'whitegrid' theme for every plot drawn after this cell.
sns.set_style('whitegrid')
# Remap matplotlib's shorthand color codes ('b', 'g', 'r', ...) to seaborn's 'dark' palette.
sns.set_color_codes('dark')

In [3]:
# Load the train and test CSVs; both paths are relative to the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
# Work on a copy so that train_df keeps the data exactly as it was loaded.
df = train_df.copy()

### Head

In [5]:
# Preview the first five rows of the working copy.
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#### From the preview above we can see that the Loan_ID feature is only an identifier and will not help us any further, so we will drop it.

In [6]:
# Drop the Loan_ID column. errors='ignore' makes the cell idempotent: re-running
# it after the column is already gone is a no-op instead of raising a KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

In [7]:
# Preview again to confirm that Loan_ID is no longer among the columns.
df.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [8]:
# Optional: build an HTML profiling report of df and write it to 'report.html'.
# Left disabled (commented out); uncomment both lines to generate the report.
# report = df.profile_report()
# report.to_file('report.html')

In [9]:
# Show each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


#### There are 5 features of numeric datatype and 7 features of string (object) datatype.
#### Also, there are missing values present in the dataset.
#### About 44% of the values in the CoapplicantIncome feature are 0.

### Five Number Summary

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


#### The ApplicantIncome, CoapplicantIncome and LoanAmount features are right-skewed (mean > median), while Loan_Amount_Term and Credit_History are left-skewed (mean < median); hence the dataset is skewed.
#### Outliers are present in the ApplicantIncome, CoapplicantIncome and LoanAmount features.

## EDA

### Checking for missing values

In [11]:
def remove_missing_values(df):
    """Impute the missing values of ``df`` in place, one column at a time.

    Text-like columns (object, string or categorical dtype) are filled with
    their most frequent value; every other column is filled with its median.
    A message is printed for each column that contains missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    None
    """
    for column in df.columns:
        series = df[column]
        if series.isnull().sum() == 0:
            continue

        if series.notnull().sum() == 0:
            # Nothing observed, so there is no mode/median to impute from.
            print("Skipping feature -> {} (no non-missing values)".format(column))
            continue

        print("Removing missing values from feature -> {}".format(column))

        # Check the dtype family rather than comparing to 'object' only, so
        # string and categorical columns are not sent to median().
        is_text_like = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.api.types.CategoricalDtype)
        )
        fill_value = series.mode().iloc[0] if is_text_like else series.median()

        # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
        # the chained in-place form acts on a temporary object and does not
        # update `df` when pandas Copy-on-Write is active.
        df[column] = series.fillna(fill_value)

In [12]:
# Total number of null cells across every column of the frame.
missing_values = df.isnull().sum().sum()

# Impute only when at least one value is actually missing.
if missing_values == 0:
    print('There is no missing values present in the dataset')
else:
    remove_missing_values(df)

Removing missing values from feature -> Gender
Removing missing values from feature -> Married
Removing missing values from feature -> Dependents
Removing missing values from feature -> Self_Employed
Removing missing values from feature -> LoanAmount
Removing missing values from feature -> Loan_Amount_Term
Removing missing values from feature -> Credit_History


### Checking for duplicate values

In [13]:
def remove_duplicate_values(df):
    """Drop fully duplicated rows from ``df`` in place, keeping the first occurrence.

    Duplicates are judged on whole rows, not per column: values repeated
    within a single column are left alone. The index labels of the surviving
    rows are kept as they were.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate. It is modified in place.

    Returns
    -------
    None
    """
    n_duplicates = int(df.duplicated().sum())
    if n_duplicates == 0:
        return

    print("Removing {} duplicated row(s) from the dataset".format(n_duplicates))
    # inplace=True on the frame itself, so the caller's object is updated; the
    # previous per-column `df[i].drop_duplicates()` discarded its result.
    df.drop_duplicates(inplace=True)

In [14]:
# Number of rows that exactly repeat an earlier row.
duplicate_values = df.duplicated().sum()

# De-duplicate only when at least one repeated row exists.
if duplicate_values == 0:
    print('There is no duplicate values present in the dataset')
else:
    remove_duplicate_values(df)

There is no duplicate values present in the dataset
