This problem involves the OJ data set.

## Preprocessing

In [0]:
import numpy as np
import pandas as pd

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# load and preprocess data
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/OJ.csv"
OJ = spark.read.option("header", "true").csv(url).toPandas()
OJ.set_index("SlNo", inplace=True)

str_cols = ["Purchase", "Store7"]
float_cols = ["PriceCH", "PriceMM", "DiscCH", "DiscMM", "LoyalCH", "SalePriceMM", "SalePriceCH", "PriceDiff", "PctDiscMM", "PctDiscCH", "ListPriceDiff"]
int_cols = list(set(OJ.columns)-set(str_cols)-set(float_cols))
OJ[str_cols] = OJ[str_cols].astype(str)
OJ[float_cols] = OJ[float_cols].astype(float)
OJ[int_cols] = OJ[int_cols].astype(int)

In [0]:
OJ.head()

In [0]:
OJ.Purchase = pd.factorize(OJ.Purchase)[0]
OJ.Store7 = pd.factorize(OJ.Store7)[0]
OJ.head()

In [0]:
OJ.info()

**a. Create a training set containing a random sample of 800
observations, and a test set containing the remaining
observations.**

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X = OJ.drop(columns='Purchase')
y = OJ.Purchase

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.53271028037, random_state=42)

In [0]:
X_train.shape

In [0]:
y_train.shape

**b. Fit a support vector classifer to the training data using
`C = 0.01`, with `Purchase` as the response and the other variables
as predictors. How many support points are there?**

In [0]:
from sklearn.svm import SVC

In [0]:
svmfit = SVC(kernel='linear', C=0.01).fit(X_train, y_train)

In [0]:
svmfit.support_vectors_

In [0]:
svmfit.classes_

In [0]:
svmfit.coef_

In [0]:
svmfit.get_params()

**c. What are the training and test error rates?**

In [0]:
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
svmpred_train = svmfit.predict(X_train)

In [0]:
conf_mat_train = pd.DataFrame(confusion_matrix(y_train, svmpred_train).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_train

In [0]:
class_mat_train  = classification_report(y_train, svmpred_train)
print(class_mat_train)

In [0]:
svmpred_test = svmfit.predict(X_test)

In [0]:
conf_mat_test = pd.DataFrame(confusion_matrix(y_test, svmpred_test).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_test

In [0]:
class_mat_test  = classification_report(y_test, svmpred_test)
print(class_mat_test)

**d. Use cross-validation to select an optimal C. Consider values in
the range 0.01 to 10.**

In [0]:
cost_range = np.linspace(0.01, 10, 50)

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    svmfit = SVC(kernel='linear', C=k, degree=1).fit(X_train, y_train)
    svmpred_train = svmfit.predict(X_train)
    conf_mat_train = confusion_matrix(y_train, svmpred_train)
    NoNo = pd.DataFrame([conf_mat_train[0][0]])
    YesYes = pd.DataFrame([conf_mat_train[1][1]])
    errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf.index = np.round(cost_range, 2)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on training data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    svmfit = SVC(kernel='linear', C=k, degree=1).fit(X_train, y_train)
    svmpred_test = svmfit.predict(X_test)
    conf_mat_test = confusion_matrix(y_test, svmpred_test)
    NoNo = pd.DataFrame([conf_mat_test[0][0]])
    YesYes = pd.DataFrame([conf_mat_test[1][1]])
    errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf.index = np.round(cost_range, 2)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on test data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

Both training and test data sets provide least error for $cost$=0.01.

**e. Compute the training and test error rates using this new value
for C.**

In [0]:
svmpred001_train = svmfit.predict(X_train)
svmpred001_test = svmfit.predict(X_test)

In [0]:
conf_mat_train = pd.DataFrame(confusion_matrix(y_train, svmpred_train), index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_test = pd.DataFrame(confusion_matrix(y_test, svmpred_test), index=svmfit.classes_, columns=svmfit.classes_)

In [0]:
conf_mat_train

In [0]:
class_mat_train = classification_report(y_train, svmpred_train)
print(class_mat_train)

In [0]:
conf_mat_test

In [0]:
class_mat_test = classification_report(y_test, svmpred_test)
print(class_mat_test)

**f. Repeat parts (b) through (e) using a support vector machine
with a radial kernel. Use the default value for gamma.**

In [0]:
svmfit = SVC(kernel='rbf', C=0.01, degree=3, gamma=0.01).fit(X_train, y_train)

In [0]:
svmfit.support_vectors_

In [0]:
svmfit.classes_

In [0]:
svmfit.get_params()

In [0]:
svmpred_train = svmfit.predict(X_train)

In [0]:
conf_mat_train = pd.DataFrame(confusion_matrix(y_train, svmpred_train).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_train

In [0]:
import warnings
warnings.filterwarnings('ignore')

class_mat_train  = classification_report(y_train, svmpred_train)
print(class_mat_train)

In [0]:
svmpred_test = svmfit.predict(X_test)

In [0]:
conf_mat_test = pd.DataFrame(confusion_matrix(y_test, svmpred_test).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_test

In [0]:
class_mat_test  = classification_report(y_test, svmpred_test)
print(class_mat_test)

In [0]:
cost_range = np.linspace(0.01, 10, 50)
gamma = np.linspace(0.1, 0.5, 5)

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    for j in gamma:
        svmfit = SVC(kernel='rbf', C=k, degree=3).fit(X_train, y_train)
        svmpred_train = svmfit.predict(X_train)
        conf_mat_train = confusion_matrix(y_train, svmpred_train)
        NoNo = pd.DataFrame([conf_mat_train[0][0]])
        YesYes = pd.DataFrame([conf_mat_train[1][1]])
        errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on training data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    svmfit = SVC(kernel='rbf', C=k, degree=3).fit(X_train, y_train)
    svmpred_test = svmfit.predict(X_test)
    conf_mat_test = confusion_matrix(y_test, svmpred_test)
    NoNo = pd.DataFrame([conf_mat_test[0][0]])
    YesYes = pd.DataFrame([conf_mat_test[1][1]])
    errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf.index = np.round(cost_range, 2)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on test data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

Both training and test data sets provide least error for $cost$=0.01.

**g. Repeat parts (b) through (e) using a support vector machine
with a polynomial kernel. Set degree = 2.**

In [0]:
svmfit = SVC(kernel='rbf', C=0.01, degree=2, gamma=0.01).fit(X_train, y_train)

In [0]:
svmfit.support_vectors_

In [0]:
svmfit.classes_

In [0]:
svmfit.get_params()

In [0]:
svmpred_train = svmfit.predict(X_train)

In [0]:
conf_mat_train = pd.DataFrame(confusion_matrix(y_train, svmpred_train).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_train

In [0]:
import warnings
warnings.filterwarnings('ignore')

class_mat_train  = classification_report(y_train, svmpred_train)
print(class_mat_train)

In [0]:
svmpred_test = svmfit.predict(X_test)

In [0]:
conf_mat_test = pd.DataFrame(confusion_matrix(y_test, svmpred_test).T, index=svmfit.classes_, columns=svmfit.classes_)
conf_mat_test

In [0]:
class_mat_test  = classification_report(y_test, svmpred_test)
print(class_mat_test)

In [0]:
cost_range = np.linspace(0.01, 10, 50)
gamma = np.linspace(0.1, 0.5, 5)

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    for j in gamma:
        svmfit = SVC(kernel='rbf', C=k, degree=2).fit(X_train, y_train)
        svmpred_train = svmfit.predict(X_train)
        conf_mat_train = confusion_matrix(y_train, svmpred_train)
        NoNo = pd.DataFrame([conf_mat_train[0][0]])
        YesYes = pd.DataFrame([conf_mat_train[1][1]])
        errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on training data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

In [0]:
errdf = pd.DataFrame()
for k in cost_range:
    svmfit = SVC(kernel='rbf', C=k, degree=2).fit(X_train, y_train)
    svmpred_test = svmfit.predict(X_test)
    conf_mat_test = confusion_matrix(y_test, svmpred_test)
    NoNo = pd.DataFrame([conf_mat_test[0][0]])
    YesYes = pd.DataFrame([conf_mat_test[1][1]])
    errdf = errdf.append((NoNo + YesYes) / 500)

errdf.columns = ['error']
errdf.reset_index(drop=True, inplace=True)
errdf.index = np.round(cost_range, 2)
errdf

In [0]:
plt.xkcd()
plt.figure(figsize=(25, 10))
plt.plot(errdf.index, errdf.error, c='g', ls='-.', marker='o', markerfacecolor='orange')
plt.title('error rates for svc prediction on test data set', fontsize=30, color='m')
plt.xlabel('cost index (for actual costs, see code)', fontsize=20, color='c')
plt.ylabel('error', fontsize=20, color='c')

Both training and test data sets provide least error for $cost$=0.01.

**h.  Overall, which approach seems to give the best results on this
data?**

Overall, radial kernel seems to provide least error on both training and test data set.