### I ended up using GridSearch, and by hand-tuning did even better than it.  I couldn't get lower than 6 total errors, and depending on how I tuned, I could distribute them more toward either Type I or Type II errors.  Ultimately the final tune would depend on what is being maximized or minimized.
### The focus here was on prediction, so visualization is nonexistent.
### The top 3 predictive features are: Bare Nuclei, Uniformity of Cell Shape, and Uniformity of Cell Size, in that order.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest

%matplotlib inline

### Give the columns names (from data source site), set index to ID column

In [2]:
# Column labels to apply to the CSV (names taken from the data source site).
cols = [
    'ID',
    'thick',
    'unif_size',
    'unif_shape',
    'adhes',
    'epith_size',
    'nuclei',
    'chroma',
    'nucleoli',
    'mitoses',
    'class',
]

# Read the file with the supplied names and use the ID column as the index.
df = pd.read_csv('breast_cancer.csv', names=cols, index_col='ID')

### No null values

In [3]:
# Per-column count of missing values; display only the columns that have any.
nulls = df.isna().sum()
nulls.loc[nulls.gt(0)]

Series([], dtype: int64)

In [4]:
# Looks like one column is acting like an object rather than a number....
# Print each column's non-null count and dtype to spot the non-numeric one.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 1000025 to 897471
Data columns (total 10 columns):
thick         699 non-null int64
unif_size     699 non-null int64
unif_shape    699 non-null int64
adhes         699 non-null int64
epith_size    699 non-null int64
nuclei        699 non-null object
chroma        699 non-null int64
nucleoli      699 non-null int64
mitoses       699 non-null int64
class         699 non-null int64
dtypes: int64(9), object(1)
memory usage: 60.1+ KB


### Ahhh, some values in 'nuclei' are showing up as the string '?' rather than as null

In [5]:
# there's the problem; so, drop or impute?
# Tally every distinct value in 'nuclei' to see what is forcing the object dtype.
df['nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: nuclei, dtype: int64

In [6]:
# going to drop for now, losing 16 isn't horrible
# Keep only the rows whose 'nuclei' value is not the '?' placeholder.  The
# explicit .copy() makes the result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df = df.loc[df['nuclei'] != '?', :].copy()

In [7]:
# this is better, no more object type showing up
# Convert the remaining 'nuclei' strings to numbers, then re-check the dtypes.
df['nuclei'] = pd.to_numeric(df['nuclei'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 1000025 to 897471
Data columns (total 10 columns):
thick         683 non-null int64
unif_size     683 non-null int64
unif_shape    683 non-null int64
adhes         683 non-null int64
epith_size    683 non-null int64
nuclei        683 non-null int64
chroma        683 non-null int64
nucleoli      683 non-null int64
mitoses       683 non-null int64
class         683 non-null int64
dtypes: int64(10)
memory usage: 58.7 KB


In [8]:
# Preview the first 7 rows of the cleaned frame.
df.head(7)

Unnamed: 0_level_0,thick,unif_size,unif_shape,adhes,epith_size,nuclei,chroma,nucleoli,mitoses,class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2
1017122,8,10,10,8,7,10,9,7,1,4
1018099,1,1,1,1,2,10,3,1,1,2


In [9]:
# convert the class from (2,4) to (0,1) for benign or malignant
# Both the raw malignant label (4) and the already-recoded label (1) map to 1,
# so re-running this cell leaves the column unchanged instead of zeroing it.
df['class'] = np.where(df['class'].isin([1, 4]), 1, 0)
df.head(7)

Unnamed: 0_level_0,thick,unif_size,unif_shape,adhes,epith_size,nuclei,chroma,nucleoli,mitoses,class
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,0
1002945,5,4,4,5,7,10,3,2,1,0
1015425,3,1,1,1,2,2,3,1,1,0
1016277,6,8,8,1,3,4,3,7,1,0
1017023,4,1,1,3,2,1,3,1,1,0
1017122,8,10,10,8,7,10,9,7,1,1
1018099,1,1,1,1,2,10,3,1,1,0


In [10]:
# Target is the 'class' column; every other column is a feature.
y = df['class']
X = df.drop('class', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [11]:
# Baseline RBF-kernel SVM.  gamma is pinned to 'auto' (1 / n_features) -- the
# value the implicit default resolved to when this notebook was run -- because
# newer scikit-learn releases default to gamma='scale', which would change the
# baseline results reported below.
svc = SVC(gamma='auto')

In [12]:
# Fit the baseline classifier on the training split.
svc.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Predict labels for the held-out test split with the baseline model.
predictions = svc.predict(X_test)

### Looks like we have 7 false positives / Type I error, no Type II

In [14]:
# Score the baseline predictions against the true test labels.
# In the confusion matrix, rows are true classes and columns are predicted classes.
baseline_cm = confusion_matrix(y_test, predictions)
baseline_report = classification_report(y_test, predictions)

print(baseline_cm)
print('\n', baseline_report)

[[120   7]
 [  0  78]]

               precision    recall  f1-score   support

           0       1.00      0.94      0.97       127
           1       0.92      1.00      0.96        78

   micro avg       0.97      0.97      0.97       205
   macro avg       0.96      0.97      0.96       205
weighted avg       0.97      0.97      0.97       205



### See if we can improve using a GridSearch

In [15]:
# Candidate values for the SVC penalty (C) and kernel coefficient (gamma):
# 5 x 5 = 25 combinations.
param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
# cv is pinned to 3 folds -- what the implicit default resolved to when this
# notebook was run -- because newer scikit-learn releases default to 5 folds,
# which could select different best parameters.
grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=3)

In [16]:
# Run the cross-validated search over param_grid using the training split only.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ..................... C=0.1, gamma=1, score=0.6625, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ..................... C=0.1, gamma=1, score=0.6625, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.6645569620253164, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................... C=0.1, gamma=0.1, score=0.9125, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................. C=0.1, gamma=0.1, score=0.91875, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.9683544303797469, total=   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................. C=0.1, gamma=0.01, score=0.98125, total=   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ................. C=0.1, gamma=0.01, score=0.96875, total=   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[CV] ..... C=1000, gamma=0.01, score=0.9367088607594937, total=   0.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] ............... C=1000, gamma=0.001, score=0.95625, total=   0.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] ............... C=1000, gamma=0.001, score=0.94375, total=   0.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] .... C=1000, gamma=0.001, score=0.9620253164556962, total=   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ............... C=1000, gamma=0.0001, score=0.9625, total=   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] .............. C=1000, gamma=0.0001, score=0.96875, total=   0.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ... C=1000, gamma=0.0001, score=0.9620253164556962, total=   0.0s


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.6s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [17]:
# Parameter combination that the grid search selected as best.
grid.best_params_

{'C': 1, 'gamma': 0.01}

In [18]:
# Predict the test split through the fitted grid-search object.
grid_pred = grid.predict(X_test)

In [19]:
# Score the grid-search predictions against the true test labels.
grid_cm = confusion_matrix(y_test, grid_pred)
grid_report = classification_report(y_test, grid_pred)

print(grid_cm)
print('\n', grid_report)

[[124   3]
 [  7  71]]

               precision    recall  f1-score   support

           0       0.95      0.98      0.96       127
           1       0.96      0.91      0.93        78

   micro avg       0.95      0.95      0.95       205
   macro avg       0.95      0.94      0.95       205
weighted avg       0.95      0.95      0.95       205



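### I can push even better than GridSearch too!  And I can move the errors away from Type II toward Type I.  (Caveat: these values were hand-tuned by checking results on the test split, so the scores below are likely optimistic.)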

In [20]:
# Hand-picked hyperparameters: refit on the training split, then score on the
# test split.
svc = SVC(C=2.5, gamma=0.05).fit(X_train, y_train)
predictions = svc.predict(X_test)

tuned_cm = confusion_matrix(y_test, predictions)
tuned_report = classification_report(y_test, predictions)
print(tuned_cm)
print('\n', tuned_report)

[[124   3]
 [  3  75]]

               precision    recall  f1-score   support

           0       0.98      0.98      0.98       127
           1       0.96      0.96      0.96        78

   micro avg       0.97      0.97      0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205



### This can get tweaked further; same total errors as above, but different distribution.

In [21]:
# Alternative hand-picked setting: refit on the training split and score on the
# test split to compare how the errors are distributed.
svc = SVC(C=2, gamma=0.08).fit(X_train, y_train)
predictions = svc.predict(X_test)

alt_cm = confusion_matrix(y_test, predictions)
alt_report = classification_report(y_test, predictions)
print(alt_cm)
print('\n', alt_report)

[[122   5]
 [  1  77]]

               precision    recall  f1-score   support

           0       0.99      0.96      0.98       127
           1       0.94      0.99      0.96        78

   micro avg       0.97      0.97      0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205



In [24]:
# Score every feature against the target and keep the 8 highest-scoring ones.
selector = SelectKBest(k=8)
X_new = selector.fit_transform(X, y)

# Boolean mask of the retained columns, computed once and reused below.
kept_mask = selector.get_support()
names = X.columns.values[kept_mask]
scores = selector.scores_[kept_mask]
names_scores = list(zip(names, scores))

# Tabulate (feature, score) pairs, highest score first; ties break by name.
ns_df = pd.DataFrame(names_scores, columns=['Feat_names', 'F_Scores'])
ns_df_sorted = ns_df.sort_values(
    by=['F_Scores', 'Feat_names'],
    ascending=[False, True],
)
print(ns_df_sorted)

   Feat_names     F_Scores
5      nuclei  1426.240270
2  unif_shape  1417.643841
1   unif_size  1406.132470
6      chroma   921.010015
7    nucleoli   727.470805
0       thick   711.423446
3       adhes   677.878400
4  epith_size   622.157681
