This part consists of the following steps:
1. Searching multiple combinations of the data preprocessing methods and classifiers to find ones that seem to be the most suitable for this binary classification problem
2. Optimization of the Support Vector Machines classifier
3. Optimization of the Random Forests classifier

In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [None]:
# Read the data set from the CSV file "ultimateData.csv" (path relative to the working directory).
dataset = pd.read_csv("ultimateData.csv")


In [None]:
# Display the data with the 'label' column used as row names.
# NOTE(review): set_index returns a new DataFrame and the result is not assigned,
# so `dataset` itself still carries 'label' as a regular column afterwards.
dataset.set_index('label')

In [None]:
# Show how many rows there are for each distinct entry of the 'value' column.
dataset['value'].value_counts()

In [None]:
# Create NumPy arrays: the 'value' column becomes the labels and all remaining
# columns become the features. (The train/test split happens in a later cell.)
labels = dataset['value'].values
features = dataset.drop('value', axis=1).values

In [None]:
# Drop the first remaining column and cast everything else to float32.
# presumably the dropped column is the 'label' identifier — TODO confirm
features = features[:,1:].astype(np.float32)

In [None]:
# Hold out 33% of the samples as the test set; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so class proportions may differ
# between the two sets — confirm this is acceptable for this data.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, precision_recall_curve, confusion_matrix
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
import xgboost as xgb
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler

In [None]:
# Sum of the test labels; for 0/1 (or boolean) labels this is the number of positive test samples.
y_test.sum()

Try out different preprocessing methods together with different classifiers

In [None]:
# Candidate components for every pipeline stage. A `None` entry stands for
# running the pipeline without that stage.
dimensionality_reductors = [
    PCA(n_components=0.95),
    PCA(n_components=0.90),
    KernelPCA(kernel='linear'),
    KernelPCA(kernel='poly', degree=3),
    KernelPCA(kernel='poly', degree=5),
    None,
]
classifiers = [
    LogisticRegression(random_state=0, solver='lbfgs', max_iter=10000),
    RandomForestClassifier(n_estimators=1000),
    SVC(kernel='poly', degree=3),
    SVC(kernel='poly', degree=4),
    SVC(kernel='poly', degree=5),
    xgb.XGBClassifier(),
]
preprocess = [
    StandardScaler(),
    MinMaxScaler(),
    None,
]
balancers = [
    SMOTE(),
    RandomOverSampler(),
    None,
]

Create an sklearn-style pipeline to try out the different combinations without performing the consecutive steps by hand: fit each model on the training set and predict on the test set.
Calculate appropriate metrics and put the results into a table for comparison.

In [None]:
%%time
# Evaluate every combination of scaler, dimensionality reductor, balancer and
# classifier: fit on the training set, predict on the test set and collect metrics.
results = []
for scaler in preprocess:
    for dimensionality_reductor in dimensionality_reductors:
        for balancer in balancers:
            for classifier in classifiers:
                # Step names must be unique within a Pipeline. Deriving them from
                # type(...).__name__ yields the same name 'NoneType' for every
                # stage that is None, so fixed role names are used instead.
                pipeline = Pipeline([
                    ('scaler', scaler),
                    ('dimensionality_reductor', dimensionality_reductor),
                    ('balancer', balancer),
                    ('classifier', classifier),
                ])
                pipeline_fitted = pipeline.fit(X_train, y_train)
                predictions = pipeline_fitted.predict(X_test)
                F1 = f1_score(y_test, predictions)
                accuracy = accuracy_score(y_test, predictions)
                precision = precision_score(y_test, predictions)
                recall = recall_score(y_test, predictions)
                conf_matrix = confusion_matrix(y_test, predictions)
                # NOTE(review): roc_curve receives hard class predictions here, not
                # continuous scores, so the resulting curve has only one operating
                # point; consider decision_function / predict_proba outputs.
                roc = metrics.roc_curve(y_test, predictions)
                # The class names still identify each component in the results table.
                results.append([type(dimensionality_reductor).__name__,type(scaler).__name__, type(balancer).__name__, type(classifier).__name__, F1, accuracy, precision, recall, conf_matrix, roc])
df_results = pd.DataFrame(results, columns = ['Dimensionality_Reductor', 'Scaler', 'Balancer' ,'Classifier', 'F1', 'Accuracy', 'Precision', 'Recall', 'Confusion_Matrix', 'Roc_Curve'])

In [None]:
# Build the results table, giving each collected field a column name.
df_results = pd.DataFrame(
    results,
    columns=[
        'Dimensionality_Reductor',
        'Scaler',
        'Balancer',
        'Classifier',
        'F1',
        'Accuracy',
        'Precision',
        'Recall',
        'Confusion_Matrix',
        'Roc_Curve',
    ],
)

In [None]:
# Show the results ordered by the chosen metric (ascending, so the best F1 is last).
df_results.sort_values(by='F1')

In [None]:
# Save the results table as a CSV file.
# NOTE(review): the destination is an absolute, machine-specific path — adjust it
# before running this cell on another machine.
df_results.to_csv('/home/alicja/Documents/furtherModels.csv')

Implement TPE hyperparameter optimization using Hyperopt

In [None]:
%%time
from hyperopt import hp, tpe
from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing, random_forest, xgboost_classification, svc, min_max_scaler, standard_scaler, svc_poly


In [None]:
#Set hyperopr estimator to choose the best model. Here: chose the best svm classifier with the best preprocessing
#Can be use also to chose any_classifier and random_forest
%%time
estim = HyperoptEstimator(algo=tpe.suggest, classifier = svc('svm'), preprocessing=any_preprocessing('pre'), max_evals=100, trial_timeout=120)

In [None]:
# Fit the Hyperopr models
%%time
estim.fit(X_train, y_train)

In [None]:
# Print the score of the best Hyperopt model on the test set.
# Hyperopt uses accuracy as the default score (per the original author — confirm against hpsklearn docs).
print(estim.score(X_test, y_test))

In [None]:
# Print the best model found by the search.
print( estim.best_model() )

Get the best model to fit it to the data and to get other evaluation metrics

In [None]:
# Scaler configured with centring and scaling both disabled, so it leaves the
# values unchanged (presumably mirrors the preprocessing of the best model — confirm).
sc = StandardScaler(copy=True, with_mean=False, with_std=False)
# `features` already had its leading column removed when it was cast to float32.
# Slicing with [:, 1:] a second time would silently drop a real feature and
# evaluate the model on different inputs than the Hyperopt search used.
X = sc.fit_transform(features)


In [None]:
# Split again with the same test_size and random_state as before, now on the transformed features X.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.33, random_state=42)

In [None]:
# Polynomial-kernel SVC with hard-coded hyperparameters (presumably copied from the
# Hyperopt result above). `degree` and `max_iter` are written as integers: recent
# scikit-learn releases validate these parameters and reject float values such as
# 2.0 or 707238803.0.
best_classifier = SVC(C=77358.56620434714, cache_size=512, class_weight=None,
  coef0=5694.538815205853, decision_function_shape='ovr', degree=2,
  gamma=758.5493658267706, kernel='poly', max_iter=707238803,
  probability=False, random_state=1, shrinking=True,
  tol=1.621850765788926e-05, verbose=False)

In [None]:
# Fit the best model on the training set.
best_classifier.fit(X_train, y_train)

In [None]:
# Predict the labels for the test features X_test.
predictions = best_classifier.predict(X_test)

In [None]:
# Evaluate the best classifier's test-set predictions: confusion matrix first,
# then the scalar scores derived from the same predictions.
conf_matrix = confusion_matrix(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
F1 = f1_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

In [None]:
# Print F1, accuracy, precision, recall and the confusion matrix of the best classifier.
print(F1, accuracy, precision, recall, conf_matrix)

In [None]:
# Function to get the confusion matrix of the classification results
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence
        Tick labels for the matrix axes, one per class, in the row/column
        order of the confusion matrix.
    normalize : bool, default False
        If True, divide each row by its sum so rows show per-true-class
        fractions instead of counts.
    title : str, optional
        Plot title; a default depending on `normalize` is used when empty.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the matrix image.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # The tick labels come from the `classes` argument; it used to be
    # overwritten with a hard-coded [False, True], which ignored the caller.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells above half of the maximum get white text so they stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Class names passed to plot_confusion_matrix as axis tick labels.
classes = [False, True]

In [None]:
# Plot the confusion matrix (raw counts, since normalize defaults to False) for the test-set predictions.
plot_confusion_matrix(y_test, predictions, classes)

In [None]:
#Same method as for svm can be used to optimize random forest model
%%time
estim_rf = HyperoptEstimator(algo=tpe.suggest, classifier = random_forest('random_forest'), preprocessing=any_preprocessing('pre'), max_evals=100, trial_timeout=120)

In [None]:
# Fit the Hyperopr models
%%time
estim_rf.fit(X_train, y_train)

In [None]:
# Print the score of the best random forest model found by Hyperopt on the test set.
# Hyperopt uses accuracy as the default score (per the original author — confirm against hpsklearn docs).
print(estim_rf.score(X_test, y_test))

In [None]:
# Print the best random forest model found by the search.
print(estim_rf.best_model())

Output: {RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy', max_depth=None, max_features=0.20603235165277012, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=16, n_jobs=1, oob_score=False, random_state=0, verbose=False, warm_start=False) No preprocessing}