In [29]:
from sklearn import datasets

# How are we going to use evaluate the performance? 
# 1. accuracy
from sklearn import metrics
# 2. f1 score 
from sklearn.metrics import f1_score

# Machine learning models 
# SVM
# url: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn import svm

# KNN 
# url: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier

# Decision Tree
# url: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.tree import DecisionTreeClassifier

# Random Forest 
# url: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Logistic Classifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import learning_curve, RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split, KFold
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np
import matplotlib.pyplot as plt

Here, I use cancer dataset for tutorials.
It can be simply changed to another dataset

## Machine learning algorithms that we are going to use

- SVM

- Random Forest 

- Decision Tree 

- Softmax 

- KNN 

## Preprocess all data
https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

In [30]:
cancer = datasets.load_breast_cancer()
print("Features: \n", cancer.feature_names)
print()
print("Labels: \n",   cancer.target_names)

Features: 
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

Labels: 
 ['malignant' 'benign']


In [31]:
print("X data:", len(cancer.feature_names))
print("Y data:", len(cancer.target_names))

X data: 30
Y data: 2


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=1) # 70% training and 30% test

In [39]:
folds = KFold(n_splits = 10, shuffle = True, random_state = 1)

hyper_params = [ {'gamma': np.logspace(-5, 2, 8),
                      'C': np.logspace(-5, 2, 8),
                 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
                 }
               ]

# specify model
model = svm.SVC()

# set up GridSearchCV()
model_cv = GridSearchCV(estimator = model, 
                        param_grid = hyper_params, 
                        scoring= 'accuracy', 
                        cv = folds, 
                        verbose = 2,
                        return_train_score=True,
                        n_jobs=-1)      

In [40]:
# fit the model
model_cv.fit(X_train, y_train)

Fitting 10 folds for each of 256 candidates, totalling 2560 fits


KeyboardInterrupt: 

In [None]:
print("best hyper parameters", model_cv.best_params_)

In [None]:
y_pred = model_cv.predict(X_test)

In [None]:
# accuracy 
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# f1 score 
print("F1 score:", f1_score(y_test, y_pred, average='micro'))