# SVM Tree Worksheet


In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC # "Support vector classifier"
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

#### Load pima indians diabetes dataset below

In [0]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
# your code here
data = pd.read_csv(url, names=names)

In [144]:
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


#### Split data into test and train set with the test_size being 25 %

In [0]:
# your code here
X = data.iloc[:,:-1].as_matrix()
Y = data.iloc[:,-1].as_matrix()
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.25, random_state = None)

In [146]:
print(X_train.shape)
print(y_test.shape)

(576, 8)
(192,)


#### Create a model with sklearn's SVC and a high gamma value (perhaps 1.0)

In [147]:
# your code here
model = SVC( gamma=1)
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Print accuracy of test data and training data

In [148]:
# your code here
y_pred= model.predict(X_test)
print("Accuracy is ", accuracy_score(y_test,y_pred)*100)
y_train_pred= model.predict(X_train)
print("Accuracy is ", accuracy_score(y_train_pred,y_train)*100)

Accuracy is  66.66666666666666
Accuracy is  100.0


#### Create a model with sklearn's SVC and a high low value (perhaps .001)

In [149]:
# your code here
model = SVC( gamma=.001)
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Print accuracy of training data and test data

In [150]:
# your code here
y_pred= model.predict(X_test)
print("Accuracy is ", accuracy_score(y_test,y_pred)*100)
y_train_pred= model.predict(X_train)
print("Accuracy is ", accuracy_score(y_train_pred,y_train)*100)

Accuracy is  71.875
Accuracy is  84.54861111111111


Explain the accuracy results below. 

Your answer here <br>
high gamma value overfit on the training data <br>
but low gamma value performed better on test set

## GridSearchCV

There are several parameters to tune. Instead of tuning the parameters one by one, GridSearchCV does an exhaustive search over provided parameters. <br><br>

###Use gamma, C and decision_function_shape as parameters and GridSearchCV to find the best parameters with kernel='rbf'<br>
Don't know what decision_function_shape is, look at the SVC documentation <br>
Don't know how to use GridSearchCV, google it!<br><br>

In [151]:
# your code here
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [ 0.00001, 0.0001, 0.001, 0.01, 0.1, 1], 'C': [.1, 1, 10, 100, 1000], 'decision_function_shape':['ovo', 'ovr'] }]
grid_search = GridSearchCV(SVC(), tuned_parameters, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'C': 100, 'decision_function_shape': 'ovo', 'gamma': 1e-05, 'kernel': 'rbf'}

### Using the optimal parameters you found, print the accuracy

In [152]:
rbf_tuned_model = SVC(kernel='rbf', C=10 ,gamma=.00001, decision_function_shape='ovo', probability=True)
rbf_tuned_model.fit(X_train, y_train)
y_pred= rbf_tuned_model.predict(X_test)
print("Accuracy is ", accuracy_score(y_test,y_pred)*100)

Accuracy is  77.60416666666666


### Create an ensemble that includes svm and random forest (use your code from the decision trees notebook)
### Use predict_proba to get probabilities and decide a method to comebine the predictions 

In [0]:
# your code here
probs_svm = rbf_tuned_model.predict_proba(X_test)

In [0]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
probs_forest = random_forest.predict_proba(X_test)

In [156]:
y_pred_ensemble = []
huh = 0
total = 0
for i in range(0, len(probs_svm)):
  if probs_svm[i].argmax() == probs_forest[i].argmax():
    y_pred_ensemble.append(probs_svm[i].argmax())
    continue
  else:
    svm_highest_val_idx = probs_svm[i].argmax()
    forest_highest_val_idx = probs_forest[i].argmax()
    svm_high_val = probs_svm[i][svm_highest_val_idx]
    forest_high_val = probs_forest[i][forest_highest_val_idx]
    if svm_high_val > forest_high_val:
      y_pred_ensemble.append(probs_svm[i].argmax())
    else:
      y_pred_ensemble.append(probs_forest[i].argmax())
print("Accuracy is ", accuracy_score(y_test,y_pred_ensemble)*100)

Accuracy is  78.64583333333334
