# Import dependencies

In [1]:
import os
import pickle

import joblib
import numpy as np
import pandas as pd

# Load classifier and data

In [2]:
def load_classifier_and_data(model, input_vector, base_dir="../../ML-models"):
    """
    Loads a trained classifier together with its training and test data.

    Arguments:
    model -- which classifier to use ("RF", "GB", "LR", "SVM")
    input_vector -- use high-dimensional ("high-dim") or low-dimensional ("low-dim") input vectors
    base_dir -- directory that contains the per-model folders ("random-forest",
                "gradient-boosting", "logistic-regression", "svm"); defaults to the
                location that used to be hard-coded

    Returns:
    clf -- trained classifier
    X_train -- data on which classifier was trained
    X_test -- unseen test data for evaluation
    y_train_binary -- labels of train data
    y_test_binary -- labels of test data

    If `model` or `input_vector` is not one of the supported values, a message is
    printed and 0 is returned instead of the 5-tuple.
    """
    # Folder name of every supported classifier.
    model_dirs = {
        "RF": "random-forest",
        "GB": "gradient-boosting",
        "LR": "logistic-regression",
        "SVM": "svm",
    }
    model_dir = model_dirs.get(model) if isinstance(model, str) else None
    if model_dir is None:
        print("Please select valid model")
        return 0

    if input_vector == "high-dim":
        suffix = "high_dim"
        data_dir = os.path.join(base_dir, model_dir, "complete-trajectories",
                                "high-dimensional-input")
        clf_name = model + "_high_dim.sav"
        load_clf = pickle.load
    elif input_vector == "low-dim":
        suffix = "low_dim"
        data_dir = os.path.join(base_dir, model_dir, "complete-trajectories",
                                "low-dimensional-input")
        # The low-dimensional LR and SVM files are stored in a "rescaled" sub-folder.
        if model in ("LR", "SVM"):
            data_dir = os.path.join(data_dir, "rescaled")
        clf_name = model + "_low_dim.joblib"
        # Looked up only in this branch, so the high-dimensional path does not need joblib.
        load_clf = joblib.load
    else:
        print("Please select valid input vector type")
        return 0

    # NOTE: pickle and joblib can execute arbitrary code while loading;
    # only load classifier files from a trusted source.
    # The context manager closes the file handle even if loading fails.
    with open(os.path.join(data_dir, clf_name), "rb") as clf_file:
        clf = load_clf(clf_file)

    X_train = np.load(os.path.join(data_dir, "X_train_" + suffix + ".npy"))
    X_test = np.load(os.path.join(data_dir, "X_test_" + suffix + ".npy"))
    y_train_binary = np.load(os.path.join(data_dir, "y_train_binary_" + suffix + ".npy"))
    y_test_binary = np.load(os.path.join(data_dir, "y_test_binary_" + suffix + ".npy"))

    return clf, X_train, X_test, y_train_binary, y_test_binary

In [4]:
# Select the classifier / input-vector combination that the cells below evaluate.
clf, X_train, X_test, y_train_binary, y_test_binary = load_classifier_and_data(
    model="SVM",
    input_vector="low-dim",
)

# Evaluate

## Overall prediction accuracies

In [68]:
# Score of the loaded classifier on the data it was trained on (reported as accuracy).
train_accuracy = clf.score(X_train, y_train_binary)
print("Accuracy on train data: ", train_accuracy)

Accuracy on train data:  0.888652858913082


In [69]:
# Score of the loaded classifier on the held-out test data (reported as accuracy).
test_accuracy = clf.score(X_test, y_test_binary)
print("Accuracy on test data: ", test_accuracy)

Accuracy on test data:  0.8599177654898624


## Confusion matrix

In [18]:
# Maps the integer class labels used in the data to readable class names.
# The position of a name in the list is its integer label (0-7).
labelsystem = dict(enumerate([
    'track-wt',
    'track-dSMC',
    'track-dParAB',
    'track-dSMC-dParAB',
    'factory-wt',
    'factory-dSMC',
    'factory-dParAB',
    'factory-dSMCdParAB',
]))

In [20]:
# Let the trained classifier predict a class for every sample of the test data
# (data it has not seen during training).
predictions = clf.predict(X_test)
# Translate the numeric predictions into class names and store them in a
# DataFrame whose index holds the numeric predictions.
preds_idx = [labelsystem[label] for label in predictions]
predictions_df = pd.DataFrame(data=preds_idx, index=predictions)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    0.9s finished


In [21]:
# Readable class names of the true test labels, collected in a NumPy array
reality = np.array([labelsystem[label] for label in y_test_binary])

In [22]:
# Row-normalised confusion matrix: every row (actual label) is divided by its row total.
# Kept under its own name so that it is no longer overwritten by the counts table below.
conf_matr_normalized = pd.crosstab(reality,
                                   predictions_df[0].values,
                                   rownames=['Actual label'],
                                   colnames=['Predicted label'], normalize='index')

# Confusion matrix of raw counts (rows: actual label, columns: predicted label).
# The precision and recall cells divide entries of this table by its column and row
# totals, so it must hold counts rather than row-normalised fractions.
conf_matr = pd.crosstab(reality,
                        predictions_df[0].values,
                        rownames=['Actual label'],
                        colnames=['Predicted label'])

conf_matr

Predicted label,factory-dParAB,factory-dSMC,factory-dSMCdParAB,factory-wt,track-dParAB,track-dSMC,track-dSMC-dParAB,track-wt
Actual label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
factory-dParAB,796,12,65,16,0,0,0,0
factory-dSMC,8,831,23,41,0,0,0,0
factory-dSMCdParAB,54,41,803,10,0,0,0,0
factory-wt,14,36,5,809,0,0,0,0
track-dParAB,0,0,0,0,866,3,44,4
track-dSMC,0,0,0,0,2,835,17,75
track-dSMC-dParAB,0,0,0,0,48,8,819,6
track-wt,0,0,0,0,0,75,9,858


## Precision and recall values

### Precision
= fraction of correct predictions among all predictions of a given class. <br>
- tells us how often a classifier is correct if it predicts a given class
- high precision: returns many more relevant results than irrelevant ones


In [23]:
# Precision of a class = correctly predicted samples of that class (diagonal entry)
# divided by all samples predicted as that class (column total).
predicted_totals = conf_matr.sum(axis=0)

# Track model of replication
precision_track_wt = conf_matr.loc['track-wt', 'track-wt'] / predicted_totals['track-wt']
print("precision_track_wt = ", precision_track_wt)

precision_track_dSMC = conf_matr.loc['track-dSMC', 'track-dSMC'] / predicted_totals['track-dSMC']
print("precision_track_dSMC = ", precision_track_dSMC)

precision_track_dParAB = conf_matr.loc['track-dParAB', 'track-dParAB'] / predicted_totals['track-dParAB']
print("precision_track_dParAB = ", precision_track_dParAB)

precision_track_dSMC_dParAB = (
    conf_matr.loc['track-dSMC-dParAB', 'track-dSMC-dParAB'] / predicted_totals['track-dSMC-dParAB']
)
print("precision_track_dSMC_dParAB = ", precision_track_dSMC_dParAB)


# Factory model of replication
precision_factory_wt = conf_matr.loc['factory-wt', 'factory-wt'] / predicted_totals['factory-wt']
print("precision_factory_wt = ", precision_factory_wt)

precision_factory_dSMC = conf_matr.loc['factory-dSMC', 'factory-dSMC'] / predicted_totals['factory-dSMC']
print("precision_factory_dSMC = ", precision_factory_dSMC)

precision_factory_dParAB = conf_matr.loc['factory-dParAB', 'factory-dParAB'] / predicted_totals['factory-dParAB']
print("precision_factory_dParAB = ", precision_factory_dParAB)

precision_factory_dSMC_dParAB = (
    conf_matr.loc['factory-dSMCdParAB', 'factory-dSMCdParAB'] / predicted_totals['factory-dSMCdParAB']
)
print("precision_factory_dSMC_dParAB = ", precision_factory_dSMC_dParAB)

precision_track_wt =  0.9098621420996819
precision_track_dSMC =  0.9066232356134636
precision_track_dParAB =  0.9454148471615721
precision_track_dSMC_dParAB =  0.9212598425196851
precision_factory_wt =  0.9235159817351598
precision_factory_dSMC =  0.9032608695652173
precision_factory_dParAB =  0.9128440366972477
precision_factory_dSMC_dParAB =  0.8962053571428571


### Recall
= fraction of correct predictions of a given class over the total number of members of this class <br>
- high recall: yields most of the relevant results

In [24]:
# Recall of a class = correctly predicted samples of that class (diagonal entry)
# divided by all samples that actually belong to that class (row total).
actual_totals = conf_matr.sum(axis=1)

# Track model of replication
recall_track_wt = conf_matr.loc['track-wt', 'track-wt'] / actual_totals['track-wt']
print("recall_track_wt = ", recall_track_wt)

recall_track_dSMC = conf_matr.loc['track-dSMC', 'track-dSMC'] / actual_totals['track-dSMC']
print("recall_track_dSMC = ", recall_track_dSMC)

recall_track_dParAB = conf_matr.loc['track-dParAB', 'track-dParAB'] / actual_totals['track-dParAB']
print("recall_track_dParAB = ", recall_track_dParAB)

recall_track_dSMC_dParAB = (
    conf_matr.loc['track-dSMC-dParAB', 'track-dSMC-dParAB'] / actual_totals['track-dSMC-dParAB']
)
print("recall_track_dSMC_dParAB = ", recall_track_dSMC_dParAB)


# Factory model of replication
recall_factory_wt = conf_matr.loc['factory-wt', 'factory-wt'] / actual_totals['factory-wt']
print("recall_factory_wt = ", recall_factory_wt)

recall_factory_dSMC = conf_matr.loc['factory-dSMC', 'factory-dSMC'] / actual_totals['factory-dSMC']
print("recall_factory_dSMC = ", recall_factory_dSMC)

recall_factory_dParAB = conf_matr.loc['factory-dParAB', 'factory-dParAB'] / actual_totals['factory-dParAB']
print("recall_factory_dParAB = ", recall_factory_dParAB)

recall_factory_dSMC_dParAB = (
    conf_matr.loc['factory-dSMCdParAB', 'factory-dSMCdParAB'] / actual_totals['factory-dSMCdParAB']
)
print("recall_factory_dSMC_dParAB = ", recall_factory_dSMC_dParAB)

recall_track_wt =  0.910828025477707
recall_track_dSMC =  0.898815931108719
recall_track_dParAB =  0.9443838604143948
recall_track_dSMC_dParAB =  0.9296254256526674
recall_factory_wt =  0.9363425925925926
recall_factory_dSMC =  0.920265780730897
recall_factory_dParAB =  0.8953880764904387
recall_factory_dSMC_dParAB =  0.8843612334801763


## Feature importance

In [90]:
def calculate_feature_importance(clf, model, input_vector):
    """
    Gives out the feature importance values of a trained classifier.
    We only calculated feature importance for low-dimensional input vectors.

    Arguments:
    clf -- trained classifier - select above using function load_classifier_and_data()
    model -- which classifier was selected above ("RF", "GB", "LR", "SVM")?
    input_vector -- high-dimensional ("high-dim") or low-dimensional ("low-dim") input vectors;
                    the underscore spellings "high_dim" / "low_dim" are accepted as well

    Returns:
    feature_importance -- list of (feature name, importance value) tuples.
                          For "RF" / "GB" the values are clf.feature_importances_;
                          for "LR" / "SVM" they are the absolute coefficients of
                          clf.coef_[0], rescaled so that they sum to 100.
                          If the input vector type or the model is not supported,
                          a message is printed and 0 is returned instead.
    """

    # define list of features
    features = ['MSD', 'Alpha', 'MSDR', 'FD', 'RG', 'E', 'G', 'S']

    # Accept the hyphenated spelling used by load_classifier_and_data() as well as
    # the underscore spelling this function originally required.
    vector_kind = input_vector.replace("-", "_") if isinstance(input_vector, str) else input_vector

    if vector_kind == "high_dim":
        print("Feature importance was only calculated for low-dimensional input vectors.")
        return 0
    if vector_kind != "low_dim":
        # Previously an unknown value fell through to the final return and raised
        # UnboundLocalError; report it the same way as the other invalid inputs.
        print("Please select valid input vector type")
        return 0

    if model == "RF" or model == "GB":
        return list(zip(features, clf.feature_importances_))

    if model == "LR" or model == "SVM":
        # NOTE(review): only the first row of coef_ is used. If the classifier stores
        # one row per class (or per pair of classes), the other rows are ignored -
        # confirm that this is intended.
        coefficients = np.abs(clf.coef_[0])
        importances = coefficients / np.sum(coefficients) * 100
        return list(zip(features, importances))

    print("no valid model selected")
    return 0

In [91]:
# Feature importance values of the loaded classifier; the bare name on the last line displays them.
feature_importance = calculate_feature_importance(
    clf=clf,
    model="SVM",
    input_vector="low_dim",
)
feature_importance

[('MSD', 5.985632088796504),
 ('Alpha', 45.62055159206931),
 ('MSDR', 0.00039053287902617484),
 ('FD', 28.041420909296484),
 ('RG', 7.938495537167234),
 ('E', 4.561991086953707),
 ('G', 4.201706985781596),
 ('S', 3.6498112670561365)]