In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Load the training split. The path is relative, so train.csv must sit in the
# notebook's working directory.
dataset = pd.read_csv('train.csv')

In [3]:
# Summarise dtypes and non-null counts to see which columns need imputation.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# Display the frame for a visual sanity check (pandas truncates the middle rows).
dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# Gender

In [5]:
# Disabled exploratory plot, kept for reference only; nothing below depends on it.
# sns.pairplot(dataset, hue='Gender')

In [6]:
# Fill missing Gender values with 'Male' (presumably the most frequent category;
# no value_counts is shown for this column -- TODO confirm).
# The result is assigned back rather than using inplace=True on the
# dataset['Gender'] selection: with pandas Copy-on-Write the chained inplace call
# only updates a temporary Series and the NaNs stay in the DataFrame.
dataset['Gender'] = dataset['Gender'].replace(np.nan, 'Male')

# Married

In [7]:
# Inspect the Married distribution; the majority value is used to fill the gaps below.
dataset['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [8]:
# Fill missing Married values with the majority class 'Yes'.
# Assign the result back instead of calling an inplace method on the column
# selection: with pandas Copy-on-Write the chained inplace form only changes a
# temporary Series and leaves the DataFrame untouched.
dataset['Married'] = dataset['Married'].replace(np.nan, 'Yes')

# Dependents

In [9]:
# Inspect the Dependents distribution; the majority value is used to fill the gaps below.
dataset['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [10]:
# Fill missing Dependents values with the majority class '0'. The value is a
# string because the column is stored as object and contains entries like '3+'.
# Assign the result back instead of calling an inplace method on the column
# selection: with pandas Copy-on-Write the chained inplace form only changes a
# temporary Series and leaves the DataFrame untouched.
dataset['Dependents'] = dataset['Dependents'].replace(np.nan, '0')

# Self_Employed

In [11]:
# Inspect the Self_Employed distribution; the majority value is used to fill the gaps below.
dataset['Self_Employed'].value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [12]:
# Fill missing Self_Employed values with the majority class 'No'.
# Assign the result back instead of calling an inplace method on the column
# selection: with pandas Copy-on-Write the chained inplace form only changes a
# temporary Series and leaves the DataFrame untouched.
dataset['Self_Employed'] = dataset['Self_Employed'].replace(np.nan, 'No')

# Credit_History

In [13]:
# Inspect the Credit_History distribution; the majority value is used to fill the gaps below.
dataset['Credit_History'].value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [14]:
# Fill missing Credit_History values with the majority class 1.
# Assign the result back instead of calling an inplace method on the column
# selection: with pandas Copy-on-Write the chained inplace form only changes a
# temporary Series and leaves the DataFrame untouched.
dataset['Credit_History'] = dataset['Credit_History'].replace(np.nan, 1)

# Data Preprocessing

In [15]:
# Split into a feature matrix and a target vector, dropping Loan_ID (first
# column) and using Loan_Status (last column) as the target.
# Resulting positional layout of X_train, which the later cells index by number:
#   0 Gender, 1 Married, 2 Dependents, 3 Education, 4 Self_Employed,
#   5 ApplicantIncome, 6 CoapplicantIncome, 7 LoanAmount,
#   8 Loan_Amount_Term, 9 Credit_History, 10 Property_Area
X_train = dataset.iloc[:, 1:-1].values
y_train = dataset.iloc[:, -1].values

In [16]:
from sklearn.impute import SimpleImputer

# Mean-impute the two numeric columns that still contain NaNs:
# index 7 (LoanAmount) and index 8 (Loan_Amount_Term).
# The fitted imputer is kept so the training means can be reused on the test set.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X_train[:, 7:9] = imputer.fit_transform(X_train[:, 7:9])

In [17]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode two feature columns in place. Each fitted encoder is kept so
# the identical mapping can be applied to the test features later.
# NOTE(review): index 1 of X_train is the Married column (index 0 is Gender), so
# `gender_encoder` actually encodes Married. Gender itself is one-hot encoded by
# the ColumnTransformer cell. The name is kept because later cells reference it.
gender_encoder = LabelEncoder()
X_train[:, 1] = gender_encoder.fit_transform(X_train[:, 1])

# Index 4 is Self_Employed.
self_employed_encoder = LabelEncoder()
X_train[:, 4] = self_employed_encoder.fit_transform(X_train[:, 4])

# Integer-encode the Loan_Status target; `encoder` is reused at prediction time
# to map predicted codes back to the original labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the remaining categorical columns by position:
# 0 Gender, 2 Dependents, 3 Education, 10 Property_Area.
# All other columns are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0, 2, 3, 10]),
    ],
    remainder='passthrough',
)
X_train = transformer.fit_transform(X_train)

# Model

In [19]:
from imblearn.combine import SMOTETomek

# Rebalance the training classes with SMOTE over-sampling plus Tomek-link cleaning.
# The resampling is randomised, so random_state is pinned: a fresh
# Restart & Run All then produces the same resampled training set.
smk = SMOTETomek(random_state=42)

X_train, y_train = smk.fit_resample(X_train, y_train)

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the resampled data. random_state is pinned so the
# fitted model, and therefore the submitted predictions, are reproducible
# across kernel restarts.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier()

# Testing

In [21]:
# Load the test split. The path is relative, so test.csv must sit in the
# notebook's working directory.
test_dataset = pd.read_csv('test.csv')

In [22]:
# Inspect dtypes and missing values in the test data before imputation.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [23]:
# Apply the same fill values that were used on the training data.
# Each result is assigned back instead of calling an inplace method on the
# column selection: with pandas Copy-on-Write the chained inplace form only
# changes a temporary Series and leaves the DataFrame untouched.
test_dataset['Gender'] = test_dataset['Gender'].replace(np.nan, 'Male')
test_dataset['Married'] = test_dataset['Married'].replace(np.nan, 'Yes')
test_dataset['Dependents'] = test_dataset['Dependents'].replace(np.nan, '0')
test_dataset['Self_Employed'] = test_dataset['Self_Employed'].replace(np.nan, 'No')
test_dataset['Credit_History'] = test_dataset['Credit_History'].replace(np.nan, 1)

In [24]:
# Re-check non-null counts after the fills above. LoanAmount and Loan_Amount_Term
# are expected to still contain NaNs here; they are handled later by the
# imputer fitted on the training data.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             367 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         367 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      367 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     367 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [25]:
# Display the test frame for a visual sanity check (pandas truncates the middle rows).
test_dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,1.0,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,1.0,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [26]:
# Drop Loan_ID (first column). The test file has no Loan_Status column, so the
# remaining 11 columns line up positionally with X_train (0 Gender ... 10 Property_Area).
X_test = test_dataset.iloc[:, 1:].values

In [27]:
# Fill LoanAmount (7) and Loan_Amount_Term (8) using the imputer fitted on the
# training data; transform() only, so nothing is re-fitted on the test set.
X_test[:, 7:9] = imputer.transform(X_test[:, 7:9])

In [28]:
# Encode the test features with the encoders fitted on the training data.
# NOTE(review): index 1 is the Married column (index 0 is Gender), so
# `gender_encoder` is effectively the Married encoder despite its name.
X_test[:, 1] = gender_encoder.transform(X_test[:, 1])
# Index 4 is Self_Employed.
X_test[:, 4] = self_employed_encoder.transform(X_test[:, 4])

In [29]:
# Apply the ColumnTransformer fitted on the training features (transform only, no re-fit).
X_test = transformer.transform(X_test)

In [30]:
# Predict on the test features, then map the integer class codes back to the
# original Loan_Status labels using the target encoder fitted during training.
y_pred = encoder.inverse_transform(classifier.predict(X_test))

# Creating Sample

In [31]:
# Re-read test.csv; only its first column (Loan_ID) is used below to build the output file.
sample_dataset = pd.read_csv('test.csv')

In [33]:
# Empty frame with the two output columns; the values are filled in by the next cell.
sample = pd.DataFrame(columns=['Loan_ID', 'Loan_Status'])

In [34]:
# Populate the output frame: IDs come from the first column of the re-read
# test file, labels from the model predictions (both in test-row order).
sample['Loan_ID'] = sample_dataset.iloc[:, 0].values
sample['Loan_Status'] = y_pred

In [35]:
# Write the predictions to sample.csv in the working directory, omitting the row index.
sample.to_csv('sample.csv', index=False)