# Support Vector Machines

Let's create the same fake income / age clustered data that we used for our K-Means clustering example:

In [99]:
import numpy as np
import pandas as pd
import io
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


from google.colab import files                  #Needed to import a file that is stored on the local drive
uploaded = files.upload()
loan_df = pd.read_csv(io.BytesIO(uploaded['Loan_approval.csv']), header = 0) #The io.BytesIO optimises the dataset to work faster as it is now stored in RAM. Also it enables the uploaded file which is stored in a variable to be used as a file object

Saving Loan_approval.csv to Loan_approval (3).csv


In [100]:
loan_df.head()          # Column and row preview of the dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128,360,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66,360,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120,360,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141,360,1.0,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267,360,1.0,Urban,Y


In [101]:
loan_df.shape                    # check the shape of the dataset (rows, columns)
loan_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents             int64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount             int64
Loan_Amount_Term       int64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [94]:
loan_df.isnull().sum()      # check to see if there are null values in the data (if so we will need to remove)

AttributeError: ignored

In [93]:
loan_df = loan_df.dropna()     # drops the rows with null values, in this case 149 in total
loan_df.shape                  # check the new shape of the dataset (was 614 now 480 - 34 rows deleted)
loan_df.dtypes


AttributeError: ignored

In [91]:
loan_df['Gender'].to_numeric(float, downcast="integer").astype("integer")


AttributeError: ignored

- For the SVM to work it needs to have all the input values as numbers
- Firstly we need to create variables to replace the string values with numbers
- Secondly we then map those variable values to the dataframe using the map() function

In [85]:
gender = {'Male': 1, 'Female': 0}                                       # Variables to replace the text with numbers (6 of the features have text so we need to do it for all 6 of them)
married = {'Yes': 1, 'No': 0} 
education = {'Graduate': 1, 'Not Graduate': 0} 
self_employed = {'Yes': 1, 'No': 0} 
property_area = {'Rural': 0, 'Semiurban': 1, 'Urban': 2}
loan_status = {'Y': 1, 'N': 0} 

loan_df['Gender'] = loan_df['Gender'].map(gender)                               # map() function used to exchange the string values with the above numbers and assign those values to loan_df dataframe
loan_df['Married'] = loan_df['Married'].map(married)
loan_df['Education'] = loan_df['Education'].map(education)
loan_df['Self_Employed'] = loan_df['Self_Employed'].map(self_employed)
loan_df['Property_Area'] = loan_df['Property_Area'].map(property_area)
loan_df['Loan_Status'] = loan_df['Loan_Status'].map(loan_status)


loan_df.head()



Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Gender.1
1,LP001003,,,1,,,4583,1508.0,128.0,360.0,1.0,,,
2,LP001005,,,0,,,3000,0.0,66.0,360.0,1.0,,,
3,LP001006,,,0,,,2583,2358.0,120.0,360.0,1.0,,,
4,LP001008,,,0,,,6000,0.0,141.0,360.0,1.0,,,
5,LP001011,,,2,,,5417,4196.0,267.0,360.0,1.0,,,
