##**Step-1: Install/ Import the required Python Packages/ Libraries, Mount the Google Drive and read and check the Data and Customer files**

**1) Install/ Import the required Python Packages/ Libraries**

In [None]:
#Import required python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 27.2 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 27.0 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 11.6 MB/s eta 0:00:01[K     |████████████████▎               | 40 kB 9.0 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████▍       | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 3.5 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


**2) Mounting the Google Drive**

In [None]:
# Mount the Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


**3) Read the Data file and Customer file and check**

In [None]:
# Read the loan-status training data and the customer (to-be-scored) data from .csv files,
# then check each frame's shape (number of rows and columns).
# The path is relative, so it resolves against the current working directory; the drive
# was mounted at /content/gdrive in the cell above.
DATA_DIR = 'gdrive/My Drive/SRM-MLP-Internship-2021/Projects/Classification/01-Loan-Status-Prediction/Data-Files/'
train_df = pd.read_csv(DATA_DIR + 'Train_Loan_Status.csv')
test_df = pd.read_csv(DATA_DIR + 'Customer_Loan_Status.csv')
print(train_df.shape)
print(test_df.shape)

(614, 13)
(367, 12)


##**Step-2: Combine the Train and Test File**

In [None]:
# Tag each frame with its own marker column so the rows can be told apart again after
# the two frames are concatenated: training rows get train == 1, customer rows get test == 0.
# Each marker exists in only one frame, so after the concat it is NaN for the other frame's rows.
train_df['train']=1
test_df['test'] = 0

In [None]:
print(train_df.shape)
print(test_df.shape)

(614, 14)
(367, 13)


In [None]:
train_df.info()
print()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
 13  train              614 non-null    int64  
dtypes: float64(4), int64(2), object(8)
memory usage: 67.3+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 

In [None]:
combined_df  = pd.concat([train_df, test_df])
combined_df.shape

(981, 15)

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            981 non-null    object 
 1   Gender             957 non-null    object 
 2   Married            978 non-null    object 
 3   Dependents         956 non-null    object 
 4   Education          981 non-null    object 
 5   Self_Employed      926 non-null    object 
 6   ApplicantIncome    981 non-null    int64  
 7   CoapplicantIncome  981 non-null    float64
 8   LoanAmount         954 non-null    float64
 9   Loan_Amount_Term   961 non-null    float64
 10  Credit_History     902 non-null    float64
 11  Property_Area      981 non-null    object 
 12  Loan_Status        614 non-null    object 
 13  train              614 non-null    float64
 14  test               367 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 122.6+ KB


##**Step-3: Check the Data Types of the Columns as well as Missing Data**

**1) Execute the "info()" command and check datatypes of the Columns and Missing Data**

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            981 non-null    object 
 1   Gender             957 non-null    object 
 2   Married            978 non-null    object 
 3   Dependents         956 non-null    object 
 4   Education          981 non-null    object 
 5   Self_Employed      926 non-null    object 
 6   ApplicantIncome    981 non-null    int64  
 7   CoapplicantIncome  981 non-null    float64
 8   LoanAmount         954 non-null    float64
 9   Loan_Amount_Term   961 non-null    float64
 10  Credit_History     902 non-null    float64
 11  Property_Area      981 non-null    object 
 12  Loan_Status        614 non-null    object 
 13  train              614 non-null    float64
 14  test               367 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 122.6+ KB


**2) Summarize the columnwise Missing Data**

In [None]:
combined_df.isnull().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
train                367
test                 614
dtype: int64

**Observations:**
* **a) We have the missing data, hence we need to handle this.**

##**Step-4: Check on Data Preprocessing applicability (Initial)**


###**1) Checking the Missing Values and its Handling**

**a) Check the Missing Values, if any**

In [None]:
combined_df.isnull().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
train                367
test                 614
dtype: int64

**b) Checking the total number of rows having the missing Values**

In [None]:
combined_df[combined_df.isnull().any(axis=1)]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,train,test
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1.0,
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,1.0,
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1.0,
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1.0,
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,,,0.0
363,LP002975,Male,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,,,0.0
364,LP002980,Male,No,0,Graduate,No,3250,1993.0,126.0,360.0,,Semiurban,,,0.0
365,LP002986,Male,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,,,0.0


**c) Observations, Decisions and Actions**

**Observations:**
* a) Here, the data values of 7 columns are missing
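* b) All 981 rows of the combined dataset (981 rows in total) contain at least one missing value. Note that this count is inflated by construction: the `train`/`test` flag columns are each empty for the other file's rows, and `Loan_Status` is empty for every customer row.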
###**So, we cannot use the option of dropping the rows having missing values.**

**Decision and Actions:**

###**Fill the missing values of the columns with that of the most_frequent values of the respective columns.**

**d) Imputation of Missing Values using the "fillna" command and checking**

In [None]:
# Fill every gap with the column's most frequent value (its mode), as decided above.
# The filled Series is assigned back to the column instead of calling
# `combined_df[col].fillna(..., inplace=True)`: that chained in-place form works on an
# intermediate Series, is deprecated in pandas 2.x, and leaves the frame unchanged once
# Copy-on-Write is active.
# NOTE(review): `Loan_Status` is kept in this list because the original cell filled it too.
# The customer rows carry no label, so the value written there is only a placeholder and
# any Loan_Status counts taken on the combined frame include those placeholders.
mode_imputed_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Loan_Status',
]
for column_name in mode_imputed_columns:
    most_frequent_value = combined_df[column_name].mode().iloc[0]
    combined_df[column_name] = combined_df[column_name].fillna(most_frequent_value)

In [None]:
combined_df.isnull().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents             0
Education              0
Self_Employed          0
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount             0
Loan_Amount_Term       0
Credit_History         0
Property_Area          0
Loan_Status            0
train                367
test                 614
dtype: int64

###**2) Check the unique Values of each column and observe the following:**
* **a) Wrong Data in the columns, if any** 
* **b) Wrong format of the data in the columns, if any**
* **c) Identify the columns which need to be categorically converted to numeric values by using Nominal method/ Ordinal Method**


###**Column-1: Loan_ID**

In [None]:
combined_df['Loan_ID'].value_counts()

LP002443    1
LP002059    1
LP001448    1
LP002018    1
LP001131    1
           ..
LP001900    1
LP002837    1
LP001650    1
LP001185    1
LP001669    1
Name: Loan_ID, Length: 981, dtype: int64

**Observations:**
* a) Data in this column will not contribute to the prediction of the Dependent variable.

**Decision:**

**We will be dropping this column**

**Action:**

In [None]:
combined_df.drop(['Loan_ID'], axis = 1, inplace = True)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             981 non-null    object 
 1   Married            981 non-null    object 
 2   Dependents         981 non-null    object 
 3   Education          981 non-null    object 
 4   Self_Employed      981 non-null    object 
 5   ApplicantIncome    981 non-null    int64  
 6   CoapplicantIncome  981 non-null    float64
 7   LoanAmount         981 non-null    float64
 8   Loan_Amount_Term   981 non-null    float64
 9   Credit_History     981 non-null    float64
 10  Property_Area      981 non-null    object 
 11  Loan_Status        981 non-null    object 
 12  train              614 non-null    float64
 13  test               367 non-null    float64
dtypes: float64(6), int64(1), object(7)
memory usage: 115.0+ KB


###**Column-2: Gender**

In [None]:
combined_df['Gender'].value_counts()

Male      799
Female    182
Name: Gender, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Nominal" Type.

**Decision:**

**We will be converting the data in this column into Numerical values using Nominal Type method "pd.get_dummies".**

**Action:**

In [None]:
# One-hot encode Gender. drop_first=True drops the first category level, leaving a
# single indicator column (Gender_Male).
# dtype is pinned to uint8 so the indicator stays 0/1 as shown below; from pandas 2.0
# get_dummies would otherwise return booleans.
gender = pd.DataFrame(combined_df['Gender'])
gender_encoded = pd.get_dummies(data=gender, drop_first=True, dtype='uint8')
gender_encoded

Unnamed: 0,Gender_Male
0,1
1,1
2,1
3,1
4,1
...,...
362,1
363,1
364,1
365,1


###**Column-3: Married**

In [None]:
combined_df['Married'].value_counts()

Yes    634
No     347
Name: Married, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Nominal" Type.

**Decision:**

**We will be converting the data in this column into Numerical values using Nominal Type method "pd.get_dummies".**

**Action:**

In [None]:
# One-hot encode Married. drop_first=True drops the first category level, leaving a
# single indicator column (Married_Yes).
# dtype is pinned to uint8 so the indicator stays 0/1 as shown below; from pandas 2.0
# get_dummies would otherwise return booleans.
married = pd.DataFrame(combined_df['Married'])
married_encoded = pd.get_dummies(data=married, drop_first=True, dtype='uint8')
married_encoded

Unnamed: 0,Married_Yes
0,0
1,1
2,1
3,1
4,0
...,...
362,1
363,1
364,0
365,1


###**Column-4: Dependents**

In [None]:
combined_df['Dependents'].value_counts()

0     570
2     160
1     160
3+     91
Name: Dependents, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. But the data elements are in integer format.
* b) We have one set of values "3+" which is in "Wrong Data Format"

**Decision and Actions to be taken:**

* a) Replace the values "3+" with "3"
* b) Convert the data type of the column to "Integer" Type.


**Action:**

In [None]:
# Normalise the "3+" bucket to "3" so the column can be cast to an integer type next.
# The replacement is limited to the Dependents column; a frame-wide
# combined_df.replace("3+", "3") would also rewrite that string in any other column.
combined_df['Dependents'] = combined_df['Dependents'].replace("3+", "3")
combined_df['Dependents'].value_counts()

0    570
2    160
1    160
3     91
Name: Dependents, dtype: int64

In [None]:
combined_df['Dependents']=combined_df['Dependents'].astype(int)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             981 non-null    object 
 1   Married            981 non-null    object 
 2   Dependents         981 non-null    int64  
 3   Education          981 non-null    object 
 4   Self_Employed      981 non-null    object 
 5   ApplicantIncome    981 non-null    int64  
 6   CoapplicantIncome  981 non-null    float64
 7   LoanAmount         981 non-null    float64
 8   Loan_Amount_Term   981 non-null    float64
 9   Credit_History     981 non-null    float64
 10  Property_Area      981 non-null    object 
 11  Loan_Status        981 non-null    object 
 12  train              614 non-null    float64
 13  test               367 non-null    float64
dtypes: float64(6), int64(2), object(6)
memory usage: 115.0+ KB


###**Column-5: Education**

In [None]:
combined_df['Education'].value_counts()

Graduate        763
Not Graduate    218
Name: Education, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Ordinal" Type.

**Decision:**

**We will be converting the data in this column into Numerical values using Ordinal Type method "preprocessing.LabelEncoder()".**

**Action:**

In [None]:
le = preprocessing.LabelEncoder()
combined_df['Education'] = le.fit_transform(combined_df.Education.values)
combined_df['Education'].value_counts()

0    763
1    218
Name: Education, dtype: int64

###**Column-6: Self_Employed**

In [None]:
combined_df['Self_Employed'].value_counts()

No     862
Yes    119
Name: Self_Employed, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Nominal" Type.

**Decision:**

**We will be converting the data in this column into Numerical values using Nominal Type method "pd.get_dummies".**

**Action:**

In [None]:
# One-hot encode Self_Employed. drop_first=True drops the first category level, leaving
# a single indicator column (Self_Employed_Yes).
# dtype is pinned to uint8 so the indicator stays 0/1 as shown below; from pandas 2.0
# get_dummies would otherwise return booleans.
self_employed = pd.DataFrame(combined_df['Self_Employed'])
self_employed_encoded = pd.get_dummies(data=self_employed, drop_first=True, dtype='uint8')
self_employed_encoded

Unnamed: 0,Self_Employed_Yes
0,0
1,0
2,1
3,0
4,0
...,...
362,1
363,0
364,0
365,0


###**Column-7 to 11 : ApplicantIncome,	CoapplicantIncome,	LoanAmount,	Loan_Amount_Term and Credit_History**

In [None]:
combined_df.describe()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,train,test
count,981.0,981.0,981.0,981.0,981.0,981.0,981.0,614.0,367.0
mean,0.767584,0.222222,5179.795107,1601.91633,141.891947,342.56473,0.849134,1.0,0.0
std,1.030657,0.415952,5695.104533,2718.772806,76.436639,64.482011,0.358101,0.0,0.0
min,0.0,0.0,0.0,0.0,9.0,6.0,0.0,1.0,0.0
25%,0.0,0.0,2875.0,0.0,101.0,360.0,1.0,1.0,0.0
50%,0.0,0.0,3800.0,1110.0,125.0,360.0,1.0,1.0,0.0
75%,2.0,0.0,5516.0,2365.0,160.0,360.0,1.0,1.0,0.0
max,3.0,1.0,81000.0,41667.0,700.0,480.0,1.0,1.0,0.0


**Observations:**
* a) Here, all the Integer and float Column values are described.
* b) Each column has got a Standard Deviation, Min and Max Values.
* c) We can assume that there is no wrong data and wrong data format.
* **d) But we need to do Scaling**

###**Column-12: Property_Area**

In [None]:
combined_df['Property_Area'].value_counts()

Semiurban    349
Urban        342
Rural        290
Name: Property_Area, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Ordinal" Type.

**Decision:**

**We will be converting the data in this column into Numerical values using Ordinal Type method "preprocessing.LabelEncoder()".**

**Action:**

In [None]:
le = preprocessing.LabelEncoder()
combined_df['Property_Area'] = le.fit_transform(combined_df.Property_Area.values)
combined_df['Property_Area'].value_counts()

1    349
2    342
0    290
Name: Property_Area, dtype: int64

###**Column-13: Loan_Status**

In [None]:
combined_df['Loan_Status'].value_counts()

Y    789
N    192
Name: Loan_Status, dtype: int64

**Observations:**
* a) Data in this column is of "Object" or "String" datatype. Also, the data levels are "Ordinal" Type [Dependent Variable Column]

**Decision:**

**We will be converting the data in this column into Numerical values using Ordinal Type method "preprocessing.LabelEncoder()".**

**Action:**

In [None]:
le = preprocessing.LabelEncoder()
combined_df['Loan_Status'] = le.fit_transform(combined_df.Loan_Status.values)
combined_df['Loan_Status'].value_counts()

1    789
0    192
Name: Loan_Status, dtype: int64

##**Step-6: Drop the columns which are to be categorically converted and include their respective converted Numeric Values**

In [None]:
combined_df.drop(['Gender', 'Married','Self_Employed',], axis = 1, inplace = True)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         981 non-null    int64  
 1   Education          981 non-null    int64  
 2   ApplicantIncome    981 non-null    int64  
 3   CoapplicantIncome  981 non-null    float64
 4   LoanAmount         981 non-null    float64
 5   Loan_Amount_Term   981 non-null    float64
 6   Credit_History     981 non-null    float64
 7   Property_Area      981 non-null    int64  
 8   Loan_Status        981 non-null    int64  
 9   train              614 non-null    float64
 10  test               367 non-null    float64
dtypes: float64(6), int64(5)
memory usage: 92.0 KB


In [None]:
combined_df = pd.concat([combined_df,gender_encoded, married_encoded,self_employed_encoded], axis=1)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 981 entries, 0 to 366
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         981 non-null    int64  
 1   Education          981 non-null    int64  
 2   ApplicantIncome    981 non-null    int64  
 3   CoapplicantIncome  981 non-null    float64
 4   LoanAmount         981 non-null    float64
 5   Loan_Amount_Term   981 non-null    float64
 6   Credit_History     981 non-null    float64
 7   Property_Area      981 non-null    int64  
 8   Loan_Status        981 non-null    int64  
 9   train              614 non-null    float64
 10  test               367 non-null    float64
 11  Gender_Male        981 non-null    uint8  
 12  Married_Yes        981 non-null    uint8  
 13  Self_Employed_Yes  981 non-null    uint8  
dtypes: float64(6), int64(5), uint8(3)
memory usage: 94.8 KB


##**Step-5: Segregate the Train and Test Data**

In [None]:
# Split the combined frame back into training rows (train == 1) and customer rows
# (test == 0), then remove the helper marker columns. The customer frame also loses
# Loan_Status, which is the target to be predicted for those rows.
# drop() is used without inplace=True: dropping in place on a boolean-filtered slice of
# combined_df raises SettingWithCopyWarning (hidden here by the global warnings filter).
is_train_row = combined_df["train"] == 1
is_customer_row = combined_df["test"] == 0
train_df1 = combined_df[is_train_row].drop(columns=["train", "test"])
test_df1 = combined_df[is_customer_row].drop(columns=["test", "train", "Loan_Status"])

In [None]:
train_df1.shape

(614, 12)

In [None]:
test_df1.shape

(367, 11)

##**Step-5: Slice X and y Values**

In [None]:
X = train_df1.drop(['Loan_Status'], axis = 1)
y = train_df1['Loan_Status']
X.head()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Gender_Male,Married_Yes,Self_Employed_Yes
0,0,0,5849,0.0,120.0,360.0,1.0,2,1,0,0
1,1,0,4583,1508.0,128.0,360.0,1.0,0,1,1,0
2,0,0,3000,0.0,66.0,360.0,1.0,2,1,1,1
3,0,1,2583,2358.0,120.0,360.0,1.0,2,1,1,0
4,0,0,6000,0.0,141.0,360.0,1.0,2,1,0,0


In [None]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: int64

In [None]:
# Feature names, taken from X itself so the list cannot drift out of sync with the
# actual feature columns (same names and order as the X.head() output above).
columnNames = X.columns.tolist()

In [None]:
# Rescale every feature column of X with a MinMaxScaler and rebuild a labelled
# DataFrame from the scaled array (the scaler returns a plain NumPy array).
min_max_scaler_object = preprocessing.MinMaxScaler()
scaled_features = min_max_scaler_object.fit(X).transform(X)
X1 = pd.DataFrame(data=scaled_features, columns=columnNames)
X1.head()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Gender_Male,Married_Yes,Self_Employed_Yes
0,0.0,0.0,0.070489,0.0,0.160637,0.74359,1.0,1.0,1.0,0.0,0.0
1,0.333333,0.0,0.05483,0.036192,0.172214,0.74359,1.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.03525,0.0,0.082489,0.74359,1.0,1.0,1.0,1.0,1.0
3,0.0,1.0,0.030093,0.056592,0.160637,0.74359,1.0,1.0,1.0,1.0,0.0
4,0.0,0.0,0.072356,0.0,0.191027,0.74359,1.0,1.0,1.0,0.0,0.0


##**Step-6: Execute Train-Test-Split Command and Verify**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.2, random_state = 66)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(491, 11)
(491,)
(123, 11)
(123,)


##**Step-7: Learn the Data and Predict the dependent Variable values for the "X_test"data using "SVC()" algorithm**

In [None]:
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'rbf', random_state = 0)
svc_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
#predictions
y_pred = svc_clf.predict(X_test)

In [None]:
svc_Train_acc=svc_clf.score(X_train,y_train)
svc_Test_acc=svc_clf.score(X_test,y_test)

##**Step-8: Calculate the Accuracy of the Model**

In [None]:
# Report the accuracies computed in the previous cell (svc_Train_acc / svc_Test_acc).
# The earlier names kernal_SVM_Train_acc / kernal_SVM_Test_acc are not defined anywhere
# in this notebook and raise NameError on a fresh kernel.
print('Accuracy on training set:',svc_Train_acc)
print('Accuracy on test set:',svc_Test_acc)

Accuracy on training set: 0.8065173116089613
Accuracy on test set: 0.8292682926829268


##**Step-9: Display the Confusion Matrix and Classification Report of the Model**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[15 20]
 [ 1 87]]
              precision    recall  f1-score   support

           0       0.94      0.43      0.59        35
           1       0.81      0.99      0.89        88

    accuracy                           0.83       123
   macro avg       0.88      0.71      0.74       123
weighted avg       0.85      0.83      0.81       123



##**Step-10: SVC Algorithm Parameters Fine Tuning using GridSearch CV Method**

In [None]:
model_params = {
     'svc': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
 }

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X1, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svc,0.809463,"{'C': 1, 'kernel': 'rbf'}"


In [None]:
#SV Classifier
svc_grid_acc = cross_val_score(SVC(C=1, kernel='rbf', gamma = 'auto'),X1, y, cv=5)
print("svc_grid_acc (CV_based) :", svc_grid_acc)
svc_grid_acc_avg=np.average(svc_grid_acc)
print()
print("svc_grid_acc_avg : ", svc_grid_acc_avg)

svc_grid_acc (CV_based) : [0.81300813 0.7804878  0.7804878  0.85365854 0.81967213]

svc_grid_acc_avg :  0.809462881514061


In [None]:
# Scale the customer features with the scaler already fitted on the training features
# (min_max_scaler_object, fitted where X1 was built), calling transform() only.
# Fitting a fresh MinMaxScaler on the customer data would use that file's own min/max,
# so the same raw value would reach the classifier as a different number than it saw
# during training. Scaled customer values may therefore fall slightly outside [0, 1]
# when the customer data exceeds the training range.
test_df2 = min_max_scaler_object.transform(test_df1)
test_df3 = pd.DataFrame(test_df2 , columns = columnNames)
test_df3.head()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Gender_Male,Married_Yes,Self_Employed_Yes
0,0.0,0.0,0.078865,0.0,0.157088,0.746835,1.0,1.0,1.0,1.0,0.0
1,0.333333,0.0,0.042411,0.0625,0.187739,0.746835,1.0,1.0,1.0,1.0,0.0
2,0.666667,0.0,0.068938,0.075,0.344828,0.746835,1.0,1.0,1.0,1.0,0.0
3,0.666667,0.0,0.032263,0.106083,0.137931,0.746835,1.0,1.0,1.0,1.0,0.0
4,0.0,1.0,0.045168,0.0,0.095785,0.746835,1.0,1.0,1.0,0.0,0.0


In [None]:
#predictions for Customer Data
cust_data_pred = svc_clf.predict(test_df3)

In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
 12  test               367 non-null    int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 37.4+ KB


In [None]:
test_df.drop(["test"], axis=1, inplace=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [None]:
test_df["Predicted_Loan_Status"]=cust_data_pred
print(test_df.shape)
test_df.head()

(367, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,1
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,1
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,1
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban,1
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,1


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Loan_ID                367 non-null    object 
 1   Gender                 356 non-null    object 
 2   Married                367 non-null    object 
 3   Dependents             357 non-null    object 
 4   Education              367 non-null    object 
 5   Self_Employed          344 non-null    object 
 6   ApplicantIncome        367 non-null    int64  
 7   CoapplicantIncome      367 non-null    int64  
 8   LoanAmount             362 non-null    float64
 9   Loan_Amount_Term       361 non-null    float64
 10  Credit_History         338 non-null    float64
 11  Property_Area          367 non-null    object 
 12  Predicted_Loan_Status  367 non-null    int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 37.4+ KB


In [None]:
test_df['Predicted_Loan_Status']=test_df['Predicted_Loan_Status'].astype(str)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Loan_ID                367 non-null    object 
 1   Gender                 356 non-null    object 
 2   Married                367 non-null    object 
 3   Dependents             357 non-null    object 
 4   Education              367 non-null    object 
 5   Self_Employed          344 non-null    object 
 6   ApplicantIncome        367 non-null    int64  
 7   CoapplicantIncome      367 non-null    int64  
 8   LoanAmount             362 non-null    float64
 9   Loan_Amount_Term       361 non-null    float64
 10  Credit_History         338 non-null    float64
 11  Property_Area          367 non-null    object 
 12  Predicted_Loan_Status  367 non-null    object 
dtypes: float64(3), int64(2), object(8)
memory usage: 37.4+ KB


In [None]:
# Map the stringified class codes back to the original labels: "1" -> "Y", "0" -> "N"
# (the encoded Loan_Status counts above show 1 pairing with Y and 0 with N).
# The result is assigned back to the column; the chained
# test_df[col].replace(..., inplace=True) form acts on an intermediate Series and has no
# effect on the frame once pandas Copy-on-Write is active.
test_df['Predicted_Loan_Status'] = test_df['Predicted_Loan_Status'].replace({"1": "Y", "0": "N"})

In [None]:
test_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Predicted_Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,Y
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,Y


In [None]:
from google.colab import files
test_df.to_csv("gdrive/My Drive/SRM-MLP-Internship-2021/Projects/Classification/01-Loan-Status-Prediction/Output-Files/Customer_Loan_Data_with_Predicted_Status_Values.csv", index = False)