### Example Using Iris Data

In [None]:
from sklearn.datasets import load_iris

# return_X_y=True yields the feature matrix and the label vector as a pair.
data, target = load_iris(return_X_y=True)
# Preview the first 20 rows of the feature matrix.
data[:20]

### Split Data Into Training and Testing

In [None]:
import random

# Fraction of rows assigned to the training set; the rest become the test set.
train_rate = 0.8
# Fixed seed so the shuffle (and the accuracy reported later) is reproducible
# across kernel restarts. A private Random instance is used so the global
# `random` state is left untouched.
split_rng = random.Random(42)

num_rows = data.shape[0]
num_train = int(num_rows * train_rate)

# A full-length sample without replacement is a random permutation of the row indices.
shuffled_index = split_rng.sample(range(num_rows), num_rows)
train_index = shuffled_index[:num_train]
test_index = shuffled_index[num_train:]

# Index the feature rows and labels with the same index lists so they stay aligned.
data_train = data[train_index, :]
target_train = target[train_index]

data_test = data[test_index, :]
target_test = target[test_index]

### Train SVM Model

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier with C=0.1.
# Try other values of C and/or the 'rbf' kernel to see the impact.
clf = svm.SVC(kernel='linear', C=0.1)
# fit() returns the fitted estimator, so svm_model refers to the trained clf.
svm_model = clf.fit(data_train, target_train)

### Predict Testing Data

In [None]:
# Predict labels for the held-out rows and report the share predicted correctly.
target_test_pred = svm_model.predict(data_test)
num_correct = float((target_test_pred == target_test).sum())
# Accuracy as a percentage, rounded to two decimals.
accuracy = round(num_correct / len(target_test) * 100, 2)
print("Accuracy=%.2f%%" % accuracy)

## Try on Adult Income Dataset

### Read Adult Income Data

In [1]:
import os

import pandas as pd

# Directory holding the Adult Income CSV files. Defaults to the original
# location; set the ADULT_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get("ADULT_DATA_DIR", "C:\\UW\\data")
trainFile = os.path.join(DATA_DIR, "adult_train.csv")
testFile = os.path.join(DATA_DIR, "adult_test.csv")

# Both files are comma-separated with a header row.
trainData = pd.read_csv(trainFile, sep=",", header=0)
testData = pd.read_csv(testFile, sep=",", header=0)
trainData.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### One Hot Encoding Categorical Variables

In [2]:
# Features are the first 14 columns; the label is taken from position 14.
# NOTE(review): the head() output rendered in this notebook lists a leading
# "Unnamed: 0" column. If that column really exists in the CSV (rather than
# being a display artifact), position 14 is native_country, not income, and
# the index column is being used as a feature -- confirm against the files.
X1 = trainData.iloc[:, 0:14]
X2 = testData.iloc[:, 0:14]
Y1 = trainData.iloc[:, 14]
Y2 = testData.iloc[:, 14]

# Encode train and test together so both end up with the same one-hot columns.
frames = [X1, X2]
X = pd.concat(frames)

import category_encoders as ce
le =  ce.OneHotEncoder(return_df=False,impute_missing=False,handle_unknown="ignore")
X_encoded = le.fit_transform(X)

from sklearn import preprocessing
# Learn the standardisation statistics from the training rows only, then apply
# them to every row. Scaling the combined matrix directly would let test-set
# means/variances leak into the features the model is trained on.
num_train_rows = X1.shape[0]
scaler = preprocessing.StandardScaler().fit(X_encoded[:num_train_rows, :])
X_encoded = scaler.transform(X_encoded)

# The concatenation put the training rows first, so split back by position.
X_encoded_train = X_encoded[:num_train_rows, :]
X_encoded_test = X_encoded[num_train_rows:, :]



### Only Use 10000 Observations to Train SVM Model (to Save Time)

In [7]:
from sklearn import svm
import random

# Upper bound on the number of training rows, to keep SVC fitting time manageable.
MAX_TRAIN_SAMPLES = 10000

# Never ask for more rows than exist; random.sample raises ValueError otherwise.
num_available = X_encoded_train.shape[0]
sample_size = min(MAX_TRAIN_SAMPLES, num_available)

# Seeded private RNG so the subsample (and the reported accuracy) is reproducible.
sample_rng = random.Random(42)
sampled_train_index = sample_rng.sample(range(num_available), sample_size)

clf = svm.SVC(kernel='linear')
# sampled_train_index holds row positions, so the labels are selected with
# .iloc; plain Y1[...] is label-based and only matches while Y1 keeps its
# default integer index.
svm_model = clf.fit(X_encoded_train[sampled_train_index, :], Y1.iloc[sampled_train_index])


### Test Accuracy on Testing Data

In [8]:
# Predict labels for the test rows and report the share predicted correctly.
test_pred = svm_model.predict(X_encoded_test)
num_correct = float((test_pred == Y2).sum())
# Accuracy as a percentage, rounded to two decimals.
accuracy = round(num_correct / len(Y2) * 100, 2)
print("Accuracy=%.2f%%" % accuracy)

Accuracy=85.25%
