In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('loan_prediction_train.csv')

### Missing values

In [3]:
missing_cat_value = [feature for feature in df.columns if df[feature].isna().sum() > 1 and df[feature].dtype == 'O']
for feature in missing_cat_value:
    print(feature, df[feature].isna().mean().round(4) * 100, '% missing values' )

Gender 2.12 % missing values
Married 0.49 % missing values
Dependents 2.44 % missing values
Self_Employed 5.21 % missing values


In [4]:
# Handling missing values in gender column by evenly filling with male and female
from random import choice

mask = df["Gender"].isna()

df.loc[mask,["Gender"]] = df.loc[mask,["Gender"]].applymap(lambda _: choice(["Male","Female"]))

In [5]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [6]:
# Replacing missing values with a new label
def replace_cat_features(dataset, missing_cat_value):
    data = df.copy()
    
    data[missing_cat_value] = data[missing_cat_value].fillna('Missing')
    return data

df = replace_cat_features(df, missing_cat_value)
df[missing_cat_value].isnull().sum()

Gender           0
Married          0
Dependents       0
Self_Employed    0
dtype: int64

In [7]:
# Dealing the 0 in coapplicantIncome
df['CoapplicantIncome'].value_counts()

0.0       273
2500.0      5
2083.0      5
1666.0      5
2250.0      3
         ... 
2791.0      1
1010.0      1
1695.0      1
2598.0      1
240.0       1
Name: CoapplicantIncome, Length: 287, dtype: int64

In [8]:
# Dropping coapplicantIncome columns
df.drop(['CoapplicantIncome'], inplace = True,  axis=1)

In [9]:
# Numerical variable that contains missing values

numerical_with_na = [feature for feature in df.columns if df[feature].isnull().sum() > 1 and  df[feature].dtype != 'O']

for feature in numerical_with_na:
    print('{} : {}% missing value'.format(feature, np.round(df[feature].isnull().mean(), 4)))

LoanAmount : 0.0358% missing value
Loan_Amount_Term : 0.0228% missing value
Credit_History : 0.0814% missing value


In [10]:
# Replacing the numerical missing values

for feature in numerical_with_na:
    median_value = df[feature].median()
    
    # create a new feature to capture nan value
    df[feature+'nan'] = np.where(df[feature].isnull(), 1, 0)
    df[feature].fillna(median_value, inplace = True)
    
df[numerical_with_na].isnull().sum()

LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
dtype: int64

In [11]:
df.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmountnan,Loan_Amount_Termnan,Credit_Historynan
0,LP001002,Male,No,0,Graduate,No,5849,128.0,360.0,1.0,Urban,Y,1,0,0
1,LP001003,Male,Yes,1,Graduate,No,4583,128.0,360.0,1.0,Rural,N,0,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,66.0,360.0,1.0,Urban,Y,0,0,0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,120.0,360.0,1.0,Urban,Y,0,0,0
4,LP001008,Male,No,0,Graduate,No,6000,141.0,360.0,1.0,Urban,Y,0,0,0
5,LP001011,Male,Yes,2,Graduate,Yes,5417,267.0,360.0,1.0,Urban,Y,0,0,0
6,LP001013,Male,Yes,0,Not Graduate,No,2333,95.0,360.0,1.0,Urban,Y,0,0,0
7,LP001014,Male,Yes,3+,Graduate,No,3036,158.0,360.0,0.0,Semiurban,N,0,0,0
8,LP001018,Male,Yes,2,Graduate,No,4006,168.0,360.0,1.0,Urban,Y,0,0,0
9,LP001020,Male,Yes,1,Graduate,No,12841,349.0,360.0,1.0,Semiurban,N,0,0,0


In [12]:
# logarithmic transformation for LoanAmount feature

df['LoanAmount'] = np.log(df['LoanAmount'])

In [13]:
df.head(50)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,LoanAmountnan,Loan_Amount_Termnan,Credit_Historynan
0,LP001002,Male,No,0,Graduate,No,5849,4.85203,360.0,1.0,Urban,Y,1,0,0
1,LP001003,Male,Yes,1,Graduate,No,4583,4.85203,360.0,1.0,Rural,N,0,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,4.189655,360.0,1.0,Urban,Y,0,0,0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,4.787492,360.0,1.0,Urban,Y,0,0,0
4,LP001008,Male,No,0,Graduate,No,6000,4.94876,360.0,1.0,Urban,Y,0,0,0
5,LP001011,Male,Yes,2,Graduate,Yes,5417,5.587249,360.0,1.0,Urban,Y,0,0,0
6,LP001013,Male,Yes,0,Not Graduate,No,2333,4.553877,360.0,1.0,Urban,Y,0,0,0
7,LP001014,Male,Yes,3+,Graduate,No,3036,5.062595,360.0,0.0,Semiurban,N,0,0,0
8,LP001018,Male,Yes,2,Graduate,No,4006,5.123964,360.0,1.0,Urban,Y,0,0,0
9,LP001020,Male,Yes,1,Graduate,No,12841,5.855072,360.0,1.0,Semiurban,N,0,0,0


In [14]:
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
categorical_features

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Property_Area',
 'Loan_Status']

In [15]:
for feature in categorical_features:
    temp = df.groupby(feature)['Loan_Status'].count()/len(df)
    temp_df = temp[temp>0.01].index
    df[feature] = np.where(df[feature].isin(temp_df), df[feature], 'Rare_var')

In [21]:
for feature in categorical_features:
    labels_ordered=df.groupby([feature])['Loan_Status'].index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    df[feature]=df[feature].map(labels_ordered)

AttributeError: 'SeriesGroupBy' object has no attribute 'index'

In [20]:
## Feature scaling
feature_scale = [feature for feature in df.columns if feature not in ['Loan_ID' , 'SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df[feature_scale])

ValueError: could not convert string to float: 'Male'