# Predict Loan Eligibility for a Finance company

### The loan eligibility process is a time-consuming process, so automation is required for faster and better outputs. The customer first applies for a home loan, and after that, the company validates the customer's eligibility for a loan. The company wants to automate the loan eligibility process (in real-time) based on the customer details provided while filling out the form, like gender, marriage, dependents, education, self-employed, loan term, credit history, applicant income, co-applicant income, and loan amount. We built a classification model to target customers who are eligible for loan amounts.

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [10]:
train = pd.read_csv(r"D:\Data Science\1st Trimester (Aug-Oct,2023)\Machine Learning\ML Project\Loan Eligibilty Prediction\train.csv")
train.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [11]:
test = pd.read_csv(r"D:\Data Science\1st Trimester (Aug-Oct,2023)\Machine Learning\ML Project\Loan Eligibilty Prediction\test.csv")
test.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [12]:
print(train.shape)

(614, 13)


In [13]:
print(test.shape)

(367, 12)


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### Missing value analysis

In [15]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [16]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

Studying the input variables

In [17]:
test['Gender'].value_counts()

Male      286
Female     70
Name: Gender, dtype: int64

In [18]:
train['LoanAmount'].value_counts()

120.0    20
110.0    17
100.0    15
160.0    12
187.0    12
         ..
240.0     1
214.0     1
59.0      1
166.0     1
253.0     1
Name: LoanAmount, Length: 203, dtype: int64

looks like, LoanAmount has outliers, we will standardize it later

In [19]:
test['Dependents'].value_counts()

0     200
2      59
1      58
3+     40
Name: Dependents, dtype: int64

In [20]:
test['Credit_History'].value_counts()

1.0    279
0.0     59
Name: Credit_History, dtype: int64

In [21]:
train['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [22]:
train['Loan_Amount_Term'].value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

### IMPUTING FOR CATEGORICAL VARIABLES

In [23]:
train['Gender'] = train['Gender'].fillna(train['Gender'].mode()[0]) #imputing missing values for gender
test['Gender'] = test['Gender'].fillna(test['Gender'].mode()[0])

In [24]:
train['Married'] = train['Married'].fillna(train['Married'].mode()[0]) #imputing missing values for married
test['Married'] = test['Married'].fillna(test['Married'].mode()[0]) 

In [25]:
train['Dependents'] = train['Dependents'].fillna(train['Dependents'].mode()[0]) #imputing missing values for Dependents
test['Dependents'] = test['Dependents'].fillna(test['Dependents'].mode()[0]) 

In [26]:
train['Self_Employed'].fillna('No',inplace=True)
test['Self_Employed'].fillna('No',inplace=True)

In [27]:
train['Credit_History'] = train['Credit_History'].fillna(train['Credit_History'].mode()[0])
test['Credit_History'] = test['Credit_History'].fillna(test['Credit_History'].mode()[0])

### IMPUTING FOR CONTINUOUS VARIABLE

In [28]:
train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())
test['LoanAmount'] = test['LoanAmount'].fillna(test['LoanAmount'].median())

In [29]:
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].median())
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(test['Loan_Amount_Term'].median())

In [30]:
train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [31]:
test.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [32]:
#Credit History 
train["Credit_History"]=train["Credit_History"].astype("object")
test["Credit_History"]=test["Credit_History"].astype("object")

In [33]:
train['Loan_Amount_Term']=train['Loan_Amount_Term'].astype(int)

### chi-square analysis(to undersand the dependency between input categorical variables)

In [34]:
df_chi=train.copy()

In [35]:
df_chi.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360,1.0,Urban,Y


In [36]:
#Assigning levels to the categories
lis = []
for i in range(0, df_chi.shape[1]):
    if(df_chi.iloc[:,i].dtypes == 'object'):
        df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
        df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes 
        lis.append(df_chi.columns[i])

  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes
  df_chi.iloc[:,i] = pd.Categorical(df_chi.iloc[:,i])
  df_chi.iloc[:,i] = df_chi.iloc[:,i].cat.codes


In [70]:
cat_var=["Gender","Married","Dependents","Education","Self_Employed","Credit_History","Loan_Status"] 
catdf=df_chi[cat_var]

In [71]:
catdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Gender          614 non-null    int8 
 1   Married         614 non-null    int8 
 2   Dependents      614 non-null    int8 
 3   Education       614 non-null    int8 
 4   Self_Employed   614 non-null    int8 
 5   Credit_History  614 non-null    int8 
 6   Loan_Status     614 non-null    int8 
dtypes: int8(7)
memory usage: 4.3 KB


In [39]:
from sklearn.feature_selection import chi2
n= 7
for i in range(0,6):
    X=catdf.iloc[:,i+1:n]
    y=catdf.iloc[:,i]
    chi_scores = chi2(X,y)
    p_values = pd.Series(chi_scores[1],index = X.columns)
    print("for",i)
    print(p_values)
    for j in range (0, len(p_values)):
        if (p_values[j]<0.05):
            print(p_values[j])

for 0
Married           1.033668e-07
Dependents        5.440191e-07
Education         3.202845e-01
Self_Employed     9.903387e-01
Credit_History    9.310624e-01
Loan_Status       8.031823e-01
dtype: float64
1.0336676344223253e-07
5.440190932725145e-07
for 1
Dependents        3.535914e-22
Education         7.874868e-01
Self_Employed     9.175394e-01
Credit_History    9.178124e-01
Loan_Status       2.049573e-01
dtype: float64
3.5359141698071287e-22
for 2
Education         0.602805
Self_Employed     0.159554
Credit_History    0.955121
Loan_Status       0.804772
dtype: float64
for 3
Self_Employed     0.810728
Credit_History    0.487127
Loan_Status       0.234030
dtype: float64
for 4
Credit_History    0.988330
Loan_Status       0.959107
dtype: float64
for 5
Loan_Status    6.878727e-14
dtype: float64
6.878726741563634e-14


will drop dependents and gender as they are highly correlated with 2 other variables

In [72]:
#Loan ID is dropped because it is not required
train=train.drop(["Gender"],axis=1)
test=test.drop(["Gender"],axis=1)

KeyError: "['Gender'] not found in axis"

In [None]:
#gender is correlated with married and dependent
train=train.drop(["Loan_ID"],axis=1)
test=test.drop(["Loan_ID"],axis=1)

In [None]:
train=train.drop(["Dependents"],axis=1)
test=test.drop(["Dependents"],axis=1)

In [None]:
train.info()

### dummy variable for categorical variables

In [42]:
df_final= pd.get_dummies(train[["Married","Education","Self_Employed","Credit_History","Property_Area"]], drop_first=True, dtype=bool)

  df_final= pd.get_dummies(train[["Married","Education","Self_Employed","Credit_History","Property_Area"]], drop_first=True, dtype=bool)


In [43]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Married_Yes              614 non-null    bool 
 1   Education_Not Graduate   614 non-null    bool 
 2   Self_Employed_Yes        614 non-null    bool 
 3   Credit_History_1.0       614 non-null    bool 
 4   Property_Area_Semiurban  614 non-null    bool 
 5   Property_Area_Urban      614 non-null    bool 
dtypes: bool(6)
memory usage: 3.7 KB


In [44]:
train=train.drop(["Married","Education","Self_Employed","Credit_History","Property_Area"],axis=1)

In [45]:
mergedDf = train.merge(df_final, left_index=True, right_index=True)

In [46]:
#Standardizing the numerical variables
mergedDf["ApplicantIncome"]= (mergedDf["ApplicantIncome"] - mergedDf["ApplicantIncome"].mean())/mergedDf["ApplicantIncome"].std()
mergedDf["CoapplicantIncome"]= (mergedDf["CoapplicantIncome"] - mergedDf["CoapplicantIncome"].mean())/mergedDf["CoapplicantIncome"].std()
mergedDf["LoanAmount"]= (mergedDf["LoanAmount"] - mergedDf["LoanAmount"].mean())/mergedDf["LoanAmount"].std()
mergedDf["Loan_Amount_Term"]= (mergedDf["Loan_Amount_Term"] - mergedDf["Loan_Amount_Term"].mean())/mergedDf["Loan_Amount_Term"].std()


In [47]:
lis = []
for i in range(0, mergedDf.shape[1]):
    if(mergedDf.iloc[:,i].dtypes == 'object'):
        mergedDf.iloc[:,i] = pd.Categorical(mergedDf.iloc[:,i])
        mergedDf.iloc[:,i] = mergedDf.iloc[:,i].cat.codes 
        lis.append(mergedDf.columns[i])

  mergedDf.iloc[:,i] = pd.Categorical(mergedDf.iloc[:,i])
  mergedDf.iloc[:,i] = mergedDf.iloc[:,i].cat.codes
  mergedDf.iloc[:,i] = pd.Categorical(mergedDf.iloc[:,i])
  mergedDf.iloc[:,i] = mergedDf.iloc[:,i].cat.codes
  mergedDf.iloc[:,i] = pd.Categorical(mergedDf.iloc[:,i])
  mergedDf.iloc[:,i] = mergedDf.iloc[:,i].cat.codes


In [48]:
mergedDf.tail(5)

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Loan_Status,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Credit_History_1.0,Property_Area_Semiurban,Property_Area_Urban
609,609,0,-0.409796,-0.554036,-0.888775,0.273009,1,False,False,False,True,False,False
610,610,3,-0.212383,-0.554036,-1.257353,-2.52078,1,True,False,False,True,False,False
611,611,1,0.436818,-0.472019,1.275129,0.273009,1,True,False,False,True,False,True
612,612,2,0.356773,-0.554036,0.490416,0.273009,1,True,False,False,True,False,True
613,613,0,-0.134302,-0.554036,-0.151621,0.273009,0,False,False,True,False,True,False


In [49]:
mergedDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  614 non-null    int16  
 1   Dependents               614 non-null    int8   
 2   ApplicantIncome          614 non-null    float64
 3   CoapplicantIncome        614 non-null    float64
 4   LoanAmount               614 non-null    float64
 5   Loan_Amount_Term         614 non-null    float64
 6   Loan_Status              614 non-null    int8   
 7   Married_Yes              614 non-null    bool   
 8   Education_Not Graduate   614 non-null    bool   
 9   Self_Employed_Yes        614 non-null    bool   
 10  Credit_History_1.0       614 non-null    bool   
 11  Property_Area_Semiurban  614 non-null    bool   
 12  Property_Area_Urban      614 non-null    bool   
dtypes: bool(6), float64(4), int16(1), int8(2)
memory usage: 25.3 KB


### Preparing target and feature variables

In [50]:
X=mergedDf.drop(["Loan_Status"],axis=1)
Y=mergedDf["Loan_Status"]
Y=Y.astype(int)
x=np.array(X)
y=np.array(Y)

In [51]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

## **Checking for the f1 scores for Naive Bayes**

In [52]:
#Naive Bayes
parameters = {'priors':[[0.01, 0.99],[0.1, 0.9], [0.2, 0.8], [0.25, 0.75], [0.3, 0.7],[0.35, 0.65], [0.4, 0.6],[0.45,0.55],[0.5,0.5],[0.55,0.45],[0.6,0.4]]}
nb = GridSearchCV(GaussianNB(), parameters, scoring = 'f1', n_jobs=-1)
nb.fit(x, y)
scores = cross_val_score(nb, x, y, cv=5,scoring = 'f1')
print("Cross validation score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))

Cross validation score: 87.23% (+/- 4.16%)


## Checking for the f1 scores for Decision trees

In [53]:
from sklearn import tree
from sklearn.pipeline import Pipeline
decisiontree = tree.DecisionTreeClassifier()
pipe = Pipeline(steps=[('decisiontree', decisiontree)])
criterion = ['gini', 'entropy']
max_depth = list(range(1,20))
parameters = dict(decisiontree__criterion=criterion,decisiontree__max_depth=max_depth)
dt = GridSearchCV(pipe, parameters,cv=5,scoring="f1", n_jobs=-1)
dt.fit(x,y)
scores = cross_val_score(dt, x, y,scoring="f1", cv=5)
print("DT Cross validation f1 score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))

DT Cross validation f1 score: 87.67% (+/- 3.22%)


## Checking for the f1 scores for knn

In [54]:
number_of_neighbors = range(1,20)
params = {'n_neighbors':number_of_neighbors}
knn = KNeighborsClassifier()
knnmodel = GridSearchCV(knn, params, cv=5,scoring="f1", n_jobs=-1)
knnmodel.fit(x,y)
scores = cross_val_score(knnmodel, x, y,scoring="f1", cv=5)
print("KNN Cross validation f1 score: {0:.2%} (+/- {1:.2%})".format(np.mean(scores), np.std(scores)*2))

KNN Cross validation f1 score: 76.17% (+/- 20.91%)


### working on the test data

In [55]:
df_cat= pd.get_dummies(test[["Married","Education","Self_Employed","Credit_History","Property_Area"]], drop_first=True, dtype=bool)

  df_cat= pd.get_dummies(test[["Married","Education","Self_Employed","Credit_History","Property_Area"]], drop_first=True, dtype=bool)


In [56]:
df_cat.head()

Unnamed: 0,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Credit_History_1.0,Property_Area_Semiurban,Property_Area_Urban
0,True,False,False,True,False,True
1,True,False,False,True,False,True
2,True,False,False,True,False,True
3,True,False,False,True,False,True
4,False,True,False,True,False,True


In [57]:
test=test.drop(["Married","Education","Self_Employed","Credit_History","Property_Area"],axis=1)

In [58]:
finaldDf = test.merge(df_cat, left_index=True, right_index=True)

In [59]:
finaldDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Loan_ID                  367 non-null    object 
 1   Dependents               367 non-null    object 
 2   ApplicantIncome          367 non-null    int64  
 3   CoapplicantIncome        367 non-null    int64  
 4   LoanAmount               367 non-null    float64
 5   Loan_Amount_Term         367 non-null    float64
 6   Married_Yes              367 non-null    bool   
 7   Education_Not Graduate   367 non-null    bool   
 8   Self_Employed_Yes        367 non-null    bool   
 9   Credit_History_1.0       367 non-null    bool   
 10  Property_Area_Semiurban  367 non-null    bool   
 11  Property_Area_Urban      367 non-null    bool   
dtypes: bool(6), float64(2), int64(2), object(2)
memory usage: 19.5+ KB


In [60]:
finaldDf['Loan_Amount_Term']=finaldDf['Loan_Amount_Term'].astype(int)

In [61]:
finaldDf["ApplicantIncome"]= (finaldDf["ApplicantIncome"] - finaldDf["ApplicantIncome"].mean())/finaldDf["ApplicantIncome"].std()
finaldDf["CoapplicantIncome"]= (finaldDf["CoapplicantIncome"] - finaldDf["CoapplicantIncome"].mean())/finaldDf["CoapplicantIncome"].std()
finaldDf["LoanAmount"]= (finaldDf["LoanAmount"] - finaldDf["LoanAmount"].mean())/finaldDf["LoanAmount"].std()
finaldDf["Loan_Amount_Term"]= (finaldDf["Loan_Amount_Term"] - finaldDf["Loan_Amount_Term"].mean())/finaldDf["Loan_Amount_Term"].std()

In [62]:
finaldDf.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Credit_History_1.0,Property_Area_Semiurban,Property_Area_Urban
0,LP001015,0,0.186206,-0.672417,-0.426198,0.265659,True,False,False,True,False,True
1,LP001022,1,-0.352211,-0.029808,-0.16373,0.265659,True,False,False,True,False,True
2,LP001031,2,0.039587,0.098714,1.18142,0.265659,True,False,False,True,False,True
3,LP001035,2,-0.502089,0.418306,-0.590241,0.265659,True,False,False,True,False,True
4,LP001051,0,-0.311484,-0.672417,-0.951135,0.265659,False,True,False,True,False,True


In [63]:
x=np.array(finaldDf)

In [64]:
#Prediction for ouput variable
ypred=dt.predict(x)

ValueError: could not convert string to float: 'LP001015'

In [None]:
test=pd.read_csv("../input/analytics-vidhya-loan-prediction/test.csv")
test["Loan_Status"]=ypred

In [None]:
dict = {1: 'Y', 0: 'N'} 

In [None]:
test['Loan_Status']= test['Loan_Status'].map(dict) 

In [65]:
test.columns

Index(['Loan_ID', 'Dependents', 'ApplicantIncome', 'CoapplicantIncome',
       'LoanAmount', 'Loan_Amount_Term'],
      dtype='object')

In [69]:
test=test.drop(['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],axis=1)

KeyError: "['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area'] not found in axis"

In [67]:
test.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,LP001015,0,5720,0,110.0,360.0
1,LP001022,1,3076,1500,126.0,360.0
2,LP001031,2,5000,1800,208.0,360.0
3,LP001035,2,2340,2546,100.0,360.0
4,LP001051,0,3276,0,78.0,360.0


In [68]:
test.to_csv("submission.csv")

# So we get the best f1 score of approx 87.67% with DT!!!