In [1]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
import numpy as np

In [3]:
# Read the pre-cleaned dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/clean_data.csv')

# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,Gender,Height,Weight,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,...,Publisher_South Park,Publisher_Star Trek,Publisher_SyFy,Publisher_Team Epic TV,Publisher_Titan Books,Publisher_Universal Studios,Publisher_Wildstorm,Alignment_bad,Alignment_good,Alignment_neutral
0,0,203.0,441.0,False,True,False,False,False,True,False,...,0,0,0,0,0,0,0,0,1,0
1,0,191.0,65.0,True,True,False,False,True,True,False,...,0,0,0,0,0,0,0,0,1,0
2,0,185.0,90.0,False,False,True,False,False,False,False,...,0,0,0,0,0,0,0,0,1,0
3,0,203.0,441.0,False,True,False,False,False,False,False,...,0,0,0,0,0,0,0,1,0,0
4,0,183.0,81.0,False,False,False,True,False,False,False,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Compare non-nested and nested cross-validation scores for an RBF-kernel
# SVC that predicts the 'Accelerated Healing' column from the other columns.
# For each trial, the same grid search is scored two ways:
#   * non-nested: best_score_ of a grid search fitted on all rows;
#   * nested: the grid search itself is cross-validated on outer folds.
# The mean and standard deviation of the per-trial difference are printed.

# Number of random trials
NUM_TRIALS = 30

# Target column
y = data['Accelerated Healing']

# Feature matrix: every remaining column except 'Unnamed: 0', which appears to
# be the row index saved with the CSV (0, 1, 2, ... in data.head()) rather
# than a real feature. errors='ignore' keeps this working when the CSV has no
# such column; a missing target column still raises on the line above.
X = data.drop(columns=['Accelerated Healing', 'Unnamed: 0'], errors='ignore')
# NOTE(review): Height/Weight are on a much larger scale than the boolean and
# 0/1 columns; an RBF kernel is scale-sensitive, so consider standardising
# the features before the SVC -- confirm against the intended modelling setup.

# Set up possible values of parameters to optimize over
p_grid = {"C": [1, 10, 100],
          "gamma": [.01, .1]}

# We will use a Support Vector Classifier with "rbf" kernel
svm = SVC(kernel="rbf")

# Arrays to store scores, one entry per trial
non_nested_scores = np.zeros(NUM_TRIALS)
nested_scores = np.zeros(NUM_TRIALS)

# Loop for each trial
for i in range(NUM_TRIALS):

    # Cross-validation splitters for the inner (hyper-parameter search) and
    # outer (score estimation) loops. Seeding with the trial number makes
    # every trial reproducible while still varying the splits across trials.
    # Other splitters (e.g. "GroupKFold", "LeaveOneOut", "LeaveOneGroupOut")
    # can be substituted here.
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)

    # Non-nested parameter search and scoring.
    # The former `iid=False` argument is intentionally not passed: it was
    # removed from GridSearchCV in scikit-learn 0.24 (passing it raises
    # TypeError), and averaging scores across folds is now the only behaviour.
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)
    clf.fit(X, y)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization: cross_val_score refits the grid
    # search on each outer training split and scores it on the held-out fold.
    nested_score = cross_val_score(clf, X=X, y=y, cv=outer_cv)
    nested_scores[i] = nested_score.mean()

# Per-trial gap between the non-nested and the nested estimate
score_difference = non_nested_scores - nested_scores

print("Average difference of {:6f} with std. dev. of {:6f}."
      .format(score_difference.mean(), score_difference.std()))