In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
dftrain = pd.read_csv('trainloan.csv')
dftest = pd.read_csv('testloan.csv')
df = pd.concat([dftrain, dftest], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
df.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,Credit_History,Dependents,Education,Gender,LoanAmount,Loan_Amount_Term,Loan_ID,Loan_Status,Married,Property_Area,Self_Employed
0,5849,0.0,1.0,0,Graduate,Male,,360.0,LP001002,Y,No,Urban,No
1,4583,1508.0,1.0,1,Graduate,Male,128.0,360.0,LP001003,N,Yes,Rural,No
2,3000,0.0,1.0,0,Graduate,Male,66.0,360.0,LP001005,Y,Yes,Urban,Yes
3,2583,2358.0,1.0,0,Not Graduate,Male,120.0,360.0,LP001006,Y,Yes,Urban,No
4,6000,0.0,1.0,0,Graduate,Male,141.0,360.0,LP001008,Y,No,Urban,No


In [4]:
df['ApplicantIncome'] = df['ApplicantIncome'].astype(np.int32)
df['CoapplicantIncome'] = df['CoapplicantIncome'] .astype(np.int32)
df['Credit_History'] = df['Credit_History'].astype('category')
df['Credit_History'] = df['Credit_History'].astype('category')
df['Dependents'] = df['Dependents'].astype('category')
df['Education'] = df['Education'].astype('category')
df['Gender'] = df['Gender'].astype('category')
df['LoanAmount'] = df['LoanAmount'].astype(np.float16)
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype('category')
df['Married'] = df['Married'].astype('category')
df['Property_Area'] = df['Property_Area'].astype('category')
df['Self_Employed'] = df['Self_Employed'].astype('category')
df.drop('Loan_ID', inplace=True, axis=1)
df['Loan_Status'] = (df['Loan_Status'] == 'Y').astype(np.int8)

In [5]:
df.isnull().sum()

ApplicantIncome       0
CoapplicantIncome     0
Credit_History       79
Dependents           25
Education             0
Gender               24
LoanAmount           27
Loan_Amount_Term     20
Loan_Status           0
Married               3
Property_Area         0
Self_Employed        55
dtype: int64

In [6]:
df = df.copy()

In [7]:
df.loc[df['Loan_Status'] == 1, 'Credit_History'] = df.loc[df['Loan_Status'] == 1, 'Credit_History'].fillna(1)
df.loc[df['Loan_Status'] == 0, 'Credit_History'] = df.loc[df['Loan_Status'] == 0, 'Credit_History'].fillna(0)
df.loc[df['Married'] == 'No', 'Dependents'] = df.loc[df['Married'] == 'No', 'Dependents'].fillna('0')
df.loc[df['Credit_History'] == 1, 'Dependents'] = df.loc[df['Credit_History'] == 1, 'Dependents'].fillna('0')
df.loc[:,'Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df.loc[(df['Married'] == 'Yes') & (df['ApplicantIncome'] > 20000), 'Gender'] = df.loc[(df['Married'] == 'Yes') & (df['ApplicantIncome'] > 20000), 'Gender'].fillna('Male')
df.loc[(df['Education'] == 'Graduate') & (df['ApplicantIncome'] > 10000), 'Gender'] = df.loc[(df['Education'] == 'Graduate') & (df['ApplicantIncome'] > 10000), 'Gender'].fillna('Male')
df.loc[:,'Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df.loc[:,'LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df.loc[:,'Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df.loc[(df['Dependents'] == 'Graduate') & (df['ApplicantIncome'] > 10000), 'Gender'] = df.loc[(df['Dependents'] == 'Graduate') & (df['ApplicantIncome'] > 10000), 'Gender'].fillna('Male')
df.loc[:,'Married'] = df['Married'].fillna(df['Married'].mode()[0])
df.loc[:,'Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 12 columns):
ApplicantIncome      981 non-null int32
CoapplicantIncome    981 non-null int32
Credit_History       981 non-null category
Dependents           981 non-null category
Education            981 non-null category
Gender               981 non-null category
LoanAmount           981 non-null float16
Loan_Amount_Term     981 non-null category
Loan_Status          981 non-null int8
Married              981 non-null category
Property_Area        981 non-null category
Self_Employed        981 non-null category
dtypes: category(8), float16(1), int32(2), int8(1)
memory usage: 19.4 KB


In [10]:
x_train = df.iloc[:dftrain.shape[0]][:].drop(columns='Loan_Status')
y_train = df.iloc[:dftrain.shape[0]]['Loan_Status']
x_test = df.iloc[dftrain.shape[0]:][:].drop(columns='Loan_Status')
x_train = pd.get_dummies(x_train, drop_first=True)
x_test = pd.get_dummies(x_test, drop_first=True)

In [16]:
logit = LogisticRegression(solver='liblinear')

In [17]:
cv = cross_val_score(logit, x_train, y_train, cv=3)
print(cv.mean()*100)
print(cv)

82.90371433126097
[0.82439024 0.80487805 0.85784314]


In [18]:
param = {'C': np.linspace(0.01,1,10),
        'penalty': ['l1', 'l2'],
        'class_weight': ['balanced', None]}

In [19]:
grid = GridSearchCV(logit, param_grid=param, scoring='accuracy', cv=3)
grid.fit(x_train, y_train)
print(grid.best_estimator_)

LogisticRegression(C=0.89, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)


In [21]:
bagg = BaggingClassifier(grid.best_estimator_, 400, max_features=20, max_samples=300)
cv = cross_val_score(bagg, x_train, y_train, cv=3)
print(cv.mean()*100)
print(cv)
bagg.fit(x_train, y_train)

82.90371433126097
[0.82439024 0.80487805 0.85784314]


BaggingClassifier(base_estimator=LogisticRegression(C=0.89, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=20,
         max_samples=300, n_estimators=400, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [None]:
pred_test = bagg.predict(x_test)
pred_test = pd.Series(pred_test).map({1: 'Y', 0: 'N'})

In [None]:
submit = pd.DataFrame({'Loan_ID': dftest['Loan_ID'], 'Loan_Status': pred_test})
submit.to_csv('submit.csv', index=False)