# **One-Hot Encoding Classifiers**

In [4]:
import pickle
X_test = []
with (open("X_test", "rb")) as openfile:
    while True:
        try:
            X_test = pickle.load(openfile)
        except EOFError:
            break
y_test = []
with (open("y_test", "rb")) as openfile:
    while True:
        try:
            y_test = pickle.load(openfile)
        except EOFError:
            break
X_train = []
with (open("X_train", "rb")) as openfile:
    while True:
        try:
            X_train = pickle.load(openfile)
        except EOFError:
            break
y_train = []
with (open("y_train", "rb")) as openfile:
    while True:
        try:
            y_train = pickle.load(openfile)
        except EOFError:
            break

In [5]:
#This is done for to one-hot incode the guide sequences
for guide in X_train:
    for i in range(len(guide)):
        if guide[i] == 0:
            guide[i] = [1,0,0,0,0]
        elif guide[i] == 1:
            guide[i] = [0,0,0,0,1]
        elif guide[i] == 2:
            guide[i] = [0,0,0,1,0]
        elif guide[i] == 3:
            guide[i] = [0,0,1,0,0]
        elif guide[i] == 4:
            guide[i] = [0,1,0,0,0]
for guide in X_test:
    for i in range(len(guide)):
        if guide[i] == 0:
            guide[i] = [1,0,0,0,0]
        elif guide[i] == 1:
            guide[i] = [0,0,0,0,1]
        elif guide[i] == 2:
            guide[i] = [0,0,0,1,0]
        elif guide[i] == 3:
            guide[i] = [0,0,1,0,0]
        elif guide[i] == 4:
            guide[i] = [0,1,0,0,0]


In [6]:
import numpy as np
X_train = np.array(X_train)
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))

In [7]:
import numpy as np
X_test = np.array(X_test)
nsamples, nx, ny = X_test.shape
X_test = X_test.reshape((nsamples,nx*ny))

# **Build Classification models**

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


**K nearest neighbors**

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3) # Define classifier
knn.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)

# Training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
knn_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
knn_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
knn_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % knn_train_accuracy)
print('- MCC: %s' % knn_train_mcc)
print('- F1 score: %s' % knn_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % knn_test_accuracy)
print('- MCC: %s' % knn_test_mcc)
print('- F1 score: %s' % knn_test_f1)
print('- ROC AUC: %s' % knn_test_ROC)
print('- Recall: %s' % knn_test_recall)
print('- Precision: %s' % knn_test_precision)

Model performance for Training set
- Accuracy: 0.8237410071942446
- MCC: 0.6483637045852658
- F1 score: 0.8236451745811304
----------------------------------
Model performance for Test set
- Accuracy: 0.7096774193548387
- MCC: 0.41875
- F1 score: 0.7096774193548387
- ROC AUC: 0.709375
- Recall: 0.7096774193548387
- Precision: 0.7096774193548387


In [10]:
#save model as pickle file
filename = 'knn_model1hot.sav'
pickle.dump(knn, open(filename, 'wb'))

**Support vector machine (Radial basis function kernel)**

In [11]:
from sklearn.svm import SVC

svm_rbf = SVC(gamma=2, C=1)
svm_rbf.fit(X_train, y_train)

# Make predictions
y_train_pred = svm_rbf.predict(X_train)
y_test_pred = svm_rbf.predict(X_test)

# Training set performance
svm_rbf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
svm_rbf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
svm_rbf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
svm_rbf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
svm_rbf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
svm_rbf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
svm_rbf_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
svm_rbf_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
svm_rbf_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % svm_rbf_train_accuracy)
print('- MCC: %s' % svm_rbf_train_mcc)
print('- F1 score: %s' % svm_rbf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svm_rbf_test_accuracy)
print('- MCC: %s' % svm_rbf_test_mcc)
print('- F1 score: %s' % svm_rbf_test_f1)
print('- ROC AUC: %s' % svm_rbf_test_ROC)
print('- Recall: %s' % svm_rbf_test_recall)
print('- Precision: %s' % svm_rbf_test_precision)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.5806451612903226
- MCC: 0.2712254014483242
- F1 score: 0.480877082015602
- ROC AUC: 0.5666666666666667
- Recall: 0.5806451612903226
- Precision: 0.7686318131256952


In [12]:
#save model as pickle file
filename = 'svm_rbf_model1hot.sav'
pickle.dump(svm_rbf, open(filename, 'wb'))

**SVM Polynomial**

In [14]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline((
("scaler", StandardScaler()),
("svm_clf", SVC(kernel="poly", degree=17, coef0=0, C=1000))
))
poly_kernel_svm_clf.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = poly_kernel_svm_clf.predict(X_train)
y_test_pred = poly_kernel_svm_clf.predict(X_test)

# Training set performance
dt_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
dt_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
dt_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
dt_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
dt_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)
print('- ROC AUC: %s' % dt_test_ROC)
print('- Recall: %s' % dt_test_recall)
print('- Precision: %s' % dt_test_precision)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7258064516129032
- MCC: 0.5319710492864359
- F1 score: 0.7003787984033045
- ROC AUC: 0.7166666666666667
- Recall: 0.7258064516129032
- Precision: 0.8209348255431204


In [15]:
#save model as pickle file
filename = 'poly_kernel_svm_clf_model1hot.sav'
pickle.dump(poly_kernel_svm_clf, open(filename, 'wb'))

**Random forest**

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=1000) # Define classifier
rf.fit(X_train, y_train) # Train model

# Make predictions
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Training set performance
rf_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
rf_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
rf_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
rf_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
rf_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)
print('- ROC AUC: %s' % rf_test_ROC)
print('- Recall: %s' % rf_test_recall)
print('- Precision: %s' % rf_test_precision)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7580645161290323
- MCC: 0.5199835409730609
- F1 score: 0.7558332270814738
- ROC AUC: 0.7552083333333333
- Recall: 0.7580645161290323
- Precision: 0.7637314734088928


In [21]:
#save model as pickle file
filename = 'rf_model1hot.sav'
pickle.dump(rf, open(filename, 'wb'))

**Multi-Layer Perceptron**

In [41]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=0.1, max_iter=10)
mlp.fit(X_train, y_train)

# Make predictions
y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)

# Training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
mlp_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
mlp_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
mlp_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision


print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)
print('- ROC AUC: %s' % mlp_test_ROC)
print('- Recall: %s' % mlp_test_recall)
print('- Precision: %s' % mlp_test_precision)

Model performance for Training set
- Accuracy: 0.6888489208633094
- MCC: 0.37970097077108056
- F1 score: 0.688168144559005
----------------------------------
Model performance for Test set
- Accuracy: 0.7258064516129032
- MCC: 0.45487796963448873
- F1 score: 0.7255924051270429
- ROC AUC: 0.7270833333333333
- Recall: 0.7258064516129032
- Precision: 0.7287895641622005




In [42]:
#save model as pickle file
filename = 'mlp_model1hot.sav'
pickle.dump(mlp, open(filename, 'wb'))

# **Build Stacked model**

In [44]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('knn',knn),
    ('svm_rbf',svm_rbf),
    ('poly_kernel_svm_clf',poly_kernel_svm_clf),
    ('rf',rf),
    ('mlp',mlp), ]

# Build stack model
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack_model.fit(X_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
stack_model_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
stack_model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
stack_model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)
print('- ROC AUC: %s' % stack_model_test_ROC)
print('- Recall: %s' % stack_model_test_recall)
print('- Precision: %s' % stack_model_test_precision)



Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7903225806451613
- MCC: 0.6027585316394857
- F1 score: 0.7847469522497466
- ROC AUC: 0.7854166666666667
- Recall: 0.7903225806451613
- Precision: 0.8154434078903001


In [45]:
#save model as pickle file
filename = 'stack_model1hot.sav'
pickle.dump(stack_model, open(filename, 'wb'))

In [46]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('knn',knn),
    ('poly_kernel_svm_clf',poly_kernel_svm_clf),
    ('rf',rf) ]

# Build stack model
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack_model.fit(X_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
stack_model_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
stack_model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
stack_model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)
print('- ROC AUC: %s' % stack_model_test_ROC)
print('- Recall: %s' % stack_model_test_recall)
print('- Precision: %s' % stack_model_test_precision)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7580645161290323
- MCC: 0.5345632233153104
- F1 score: 0.7516310987497076
- ROC AUC: 0.753125
- Recall: 0.7580645161290323
- Precision: 0.7798134202540182


In [47]:
#save model as pickle file
filename = 'stack_model1hot2.sav'
pickle.dump(stack_model, open(filename, 'wb'))

In [56]:
# Define estimators
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimator_list = [
    ('knn',knn),
    ('svm_rbf',svm_rbf),
    ('poly_kernel_svm_clf',poly_kernel_svm_clf),
    ('rf',rf) ]

# Build stack model
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)

# Train stacked model
stack_model.fit(X_train, y_train)

# Make predictions
y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

# Training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred) # Calculate MCC
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred) # Calculate MCC
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
stack_model_test_ROC = roc_auc_score(y_test, y_test_pred) # Calculate ROC AUC
stack_model_test_recall = recall_score(y_test, y_test_pred, average='weighted') # Calculate Recall
stack_model_test_precision = precision_score(y_test, y_test_pred, average='weighted') # Calculate Precision

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)
print('- ROC AUC: %s' % stack_model_test_ROC)
print('- Recall: %s' % stack_model_test_recall)
print('- Precision: %s' % stack_model_test_precision)

Model performance for Training set
- Accuracy: 1.0
- MCC: 1.0
- F1 score: 1.0
----------------------------------
Model performance for Test set
- Accuracy: 0.7741935483870968
- MCC: 0.5746116701117124
- F1 score: 0.766870095902354
- ROC AUC: 0.76875
- Recall: 0.7741935483870968
- Precision: 0.804147465437788


In [None]:
#save file as pickle file
filename = 'stack_model1hot3.sav'
pickle.dump(stack_model, open(filename, 'wb'))