In [5]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.impute import SimpleImputer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [6]:
test = pd.read_csv('TEST.csv')
train = pd.read_csv('TRAIN.csv')
submission = pd.read_csv('sample_submission.csv')

In [7]:
print("Test data CSV")
print(test.head())
print("Tain data CSV")
print(train.head())

Test data CSV
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001015   Male     Yes          0      Graduate            No   
1  LP001022   Male     Yes          1      Graduate            No   
2  LP001031   Male     Yes          2      Graduate            No   
3  LP001035   Male     Yes          2      Graduate            No   
4  LP001051   Male      No          0  Not Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5720                  0       110.0             360.0   
1             3076               1500       126.0             360.0   
2             5000               1800       208.0             360.0   
3             2340               2546       100.0             360.0   
4             3276                  0        78.0             360.0   

   Credit_History Property_Area  
0             1.0         Urban  
1             1.0         Urban  
2             1.0         Urban  
3       

In [8]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [9]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
train['is_train'] = 1
test['is_train'] = 0 
test['Loan_Status'] = np.nan
data = pd.concat([train,test],sort = False)


In [11]:
imputer = SimpleImputer(strategy = 'most_frequent')
cols_to_impute = ['Gender','Married','Dependents','Self_Employed',
                  'Credit_History','Loan_Amount_Term']
data[cols_to_impute]=imputer.fit_transform(data[cols_to_impute])
data['LoanAmount'].fillna(data['LoanAmount'].median())

0      126.0
1      128.0
2       66.0
3      120.0
4      141.0
       ...  
362    113.0
363    115.0
364    126.0
365    158.0
366     98.0
Name: LoanAmount, Length: 981, dtype: float64

In [12]:
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].median())


In [13]:
for col in ['Gender','Married','Education','Self_Employed',
'Property_Area','Dependents']:
    data[col] = LabelEncoder().fit_transform(data[col].astype(str))


In [14]:
data ['Loan_Status'] = data['Loan_Status'].map({'Y':1,'N':0})

In [15]:
data['Total_Income'] = data['ApplicantIncome']+data['CoapplicantIncome']
data['Income_by_Loan'] = data['Total_Income'] /(data['LoanAmount'] + 1)

In [16]:
train = data[data['is_train'] == 1].drop(['Loan_ID','is_train'],axis = 1)
test = data[data['is_train'] == 0].drop(['Loan_ID','Loan_Status','is_train'], axis = 1)

In [17]:
x = train.drop('Loan_Status',axis= 1)
y = train['Loan_Status']

In [18]:
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size = 0.2,random_state = 42)
model = RandomForestClassifier(n_estimators = 100,random_state = 42)
model.fit(x_train,y_train)

In [19]:
y_pred = model.predict(x_val)
print("Validation Accuracy:" ,accuracy_score(y_val,y_pred))

Validation Accuracy: 0.7886178861788617


In [20]:
submission = pd.DataFrame()
submission['Loan_ID'] = data[data['is_train'] == 0]['Loan_ID'].values
final_preds = model.predict(test)
submission ['Loan_Status'] = np.where(final_preds ==1,'Y','N')
submission.to_csv('Vidhya Loan Solved.csv',index = False) 


In [21]:
pd.read_csv('Vidhya Loan Solved.csv')

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y


In [22]:
loans_approved = (submission['Loan_Status'] == 'N').sum()
print("Total Approved",loans_approved)

Total Approved 77
