## Assignment 7
### Author: Emily McAfee
### Age of Abalone

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import sklearn.preprocessing

In [2]:
# Read in data
filename = "https://library.startlearninglabs.uw.edu/DATASCI420/2019/Datasets/Abalone.csv"
adf = pd.read_csv(filename)
adf.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
adf.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Whole Weight      float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
Rings               int64
dtype: object

In [4]:
# Change categorical to numerical (one-hot encoding)
adf = pd.get_dummies(adf, drop_first = True)
adf.head()

Unnamed: 0,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Rings,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,1,0


In [5]:
# Move Rings column to end
move = adf.pop('Rings')
adf['Rings'] = move
adf.head()

Unnamed: 0,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,1,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,1,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0,0,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,1,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,1,0,7


### 1. Convert the continous output value from continuous to binary (0,1) and build an SVC

In [6]:
# Make new df for classifier
adf_c = adf.copy()

In [7]:
# Change Rings to binary
adf_c['Rings'] = sklearn.preprocessing.binarize(adf_c[['Rings']].values, threshold = 11)

In [8]:
# SVC
# Split the data into features and target
x = adf_c.iloc[:, :-1]
y = adf_c.iloc[:,-1]

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state = 1)

### 4. Using your best guess for hyperparameters and kernel, what is the percentage of correctly classified results?

In [9]:
# Set hyperparameters
# penalty parameter of the error term, 
# softening the margin from 1.0 (default)
cost = .8

# defines the influence of input vectors on the margins,
# soften by increasing to 4
gamma = 3

In [10]:
# Create a Linear SVC
from sklearn import svm, metrics
from sklearn.metrics import classification_report

clf1 = svm.LinearSVC(C = cost).fit(x_train, y_train)
clf1.predict(x_test)
print("LinearSVC")
print(classification_report(clf1.predict(x_test), y_test))

LinearSVC
              precision    recall  f1-score   support

           0       0.96      0.83      0.89       743
           1       0.35      0.73      0.47        93

    accuracy                           0.82       836
   macro avg       0.65      0.78      0.68       836
weighted avg       0.89      0.82      0.84       836



### 3. Test different kernels and hyperparameters. Which kernel performed best with what settings?

In [11]:
# Test linear, rbf and poly kernels
for k in ('linear', 'rbf', 'poly'):
    clf = svm.SVC(gamma=gamma, kernel=k, C=cost).fit(x_train, y_train)
    clf.predict(x_test)
    print(k)
    print(classification_report(clf.predict(x_test), y_test))

linear
              precision    recall  f1-score   support

           0       0.98      0.80      0.88       782
           1       0.22      0.80      0.34        54

    accuracy                           0.80       836
   macro avg       0.60      0.80      0.61       836
weighted avg       0.93      0.80      0.85       836

rbf
              precision    recall  f1-score   support

           0       0.97      0.81      0.88       763
           1       0.27      0.73      0.39        73

    accuracy                           0.81       836
   macro avg       0.62      0.77      0.64       836
weighted avg       0.91      0.81      0.84       836

poly
              precision    recall  f1-score   support

           0       0.97      0.82      0.89       751
           1       0.32      0.74      0.45        85

    accuracy                           0.81       836
   macro avg       0.64      0.78      0.67       836
weighted avg       0.90      0.81      0.84       836



In [12]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

# Set parameters
parameters = {'kernel':('linear', 'rbf','poly'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(x, y)
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf','poly')})


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf', 'poly')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

### 4. Show precision, recall, and f-measure for the best model

In [13]:
# Look at results
print(clf.best_estimator_)
y_true, y_pred = y_test, clf.predict(x_test)
print(classification_report(y_true, y_pred))
print('Best performing kernel/settings:')
print(clf.best_params_)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.84      0.96      0.90       640
           1       0.77      0.39      0.52       196

    accuracy                           0.83       836
   macro avg       0.80      0.68      0.71       836
weighted avg       0.82      0.83      0.81       836

Best performing kernel/settings:
{'C': 10, 'kernel': 'rbf'}


In [14]:
# Check with other data
print('Mean Test Score:')
print(clf.cv_results_['mean_test_score'])
print()
print('All results:')
print(clf.cv_results_)

Mean Test Score:
[0.82116122 0.81829012 0.81685271 0.82905911 0.83121508 0.82762399]

All results:
{'mean_fit_time': array([0.09764552, 0.14062443, 0.18503733, 0.1395412 , 0.14873309,
       0.43512182]), 'std_fit_time': array([0.00423633, 0.00424598, 0.01613763, 0.0191285 , 0.00593284,
       0.03508627]), 'mean_score_time': array([0.01333923, 0.02488956, 0.0143827 , 0.01208477, 0.0233005 ,
       0.01384706]), 'std_score_time': array([0.00071233, 0.0007417 , 0.00078414, 0.00053415, 0.00070812,
       0.00071383]), 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_kernel': masked_array(data=['linear', 'rbf', 'poly', 'linear', 'rbf', 'poly'],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1, 'kernel': 'linear'}, {'C': 1, 'kernel': 'rbf'}, {'C': 1, 'kernel': 'poly'}, {'C': 10, 'kernel': 

In [15]:
clf.cv_results_

{'mean_fit_time': array([0.09764552, 0.14062443, 0.18503733, 0.1395412 , 0.14873309,
        0.43512182]),
 'std_fit_time': array([0.00423633, 0.00424598, 0.01613763, 0.0191285 , 0.00593284,
        0.03508627]),
 'mean_score_time': array([0.01333923, 0.02488956, 0.0143827 , 0.01208477, 0.0233005 ,
        0.01384706]),
 'std_score_time': array([0.00071233, 0.0007417 , 0.00078414, 0.00053415, 0.00070812,
        0.00071383]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'poly', 'linear', 'rbf', 'poly'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'poly'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'poly'}],
 'sp

### 5. Using the original data, with rings as a continuous variable, create an SVR model

In [16]:
# Make new df for regression (continous target variable)
adf_r = adf.copy()

# Establish features/target
x1 = adf_r.iloc[:, :-1]
y1 = adf_r.iloc[:,-1]

# Split
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size = .2, random_state = 1)


In [17]:
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

# Initialize model
regressor = SVR(kernel = 'rbf', gamma = 50, C = 3)
regressor.fit(x1_train, y1_train)
pred = regressor.predict(x1_test)


### 6. Report the predicted variance and the mean square error.

In [18]:
# Check accuracy
print('explained variance score/r2 = ', regressor.score(x1_test,y1_test))
print('mean squared error = ', mean_squared_error(y1_test, pred))

explained variance score/r2 =  0.5271127324765592
mean squared error =  4.63024510754114
