In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
#Use featureHasher possible .. a good alternative from CountVectorizer, can potentially use less memory  
from sklearn.feature_extraction import FeatureHasher
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
import seaborn as sns

In [2]:
new_data = pd.read_csv('../database-files/output.csv')

In [3]:
new_data

Unnamed: 0.1,Unnamed: 0,Source,Dna,zf,f1,f2,f3,ex
0,0,'',ctcgcgGCGgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
1,1,'',ctcgcgGTTgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
2,2,'',ctcgatTACgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
3,3,'',ctcgatTATgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
4,4,'',ctcgatTCTgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
...,...,...,...,...,...,...,...,...
1165,1165,'',cccgcgGTTgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1166,1166,'',cccgcgGCGgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1167,1167,'',cccgcgGCAgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1168,1168,'',cccgcgGCTgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-


In [4]:
positiveExamples = new_data.loc[(new_data['ex'] == "ex+") | (new_data['ex'] == "+")]
negativeExamples = new_data.loc[(new_data['ex'] == "ex-") | (new_data['ex'] == "-")]

In [5]:
print(positiveExamples)

     Unnamed: 0 Source              Dna  zf           f1           f2  \
8             8     ''  ctcgatAAAgcggcc   3  KSADLKRHIRI  QRANLRAHIRT   
9             9     ''  ctcgatAACgcggcc   3  KSADLKRHIRI  DSGNLRVHIRT   
10           10     ''  ctcgatAAGgcggcc   3  KSADLKRHIRI  RSDTLSNHIRT   
11           11     ''  ctcgatAATgcggcc   3  KSADLKRHIRI  TTGNLTVHIRT   
12           12     ''  ctcgatACAgcggcc   3  KSADLKRHIRI  SPADLTRHIRT   
..          ...    ...              ...  ..          ...          ...   
777         777     ''  cccgcgGGCgcgtcc   3  KSADLKRHIRI  QSSKLVRHIRT   
778         778     ''  cccgcgGGTgcgtcc   3  KSADLKRHIRI  TSGHLVRHIRT   
779         779     ''  cccgcgGGCgcgtcc   3  KSADLKRHIRI  TSGKLVRHIRT   
780         780     ''  cccgcgGGTgcgtcc   3  KSADLKRHIRI  TADHLSRHIRT   
781         781     ''  cccgcgGGGgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   

              f3   ex  
8    TSGNLVRHTKI  ex+  
9    TSGNLVRHTKI  ex+  
10   TSGNLVRHTKI  ex+  
11   TSGNLVRHTKI  ex+  
12 

In [6]:
print(negativeExamples)

      Unnamed: 0 Source              Dna  zf           f1           f2  \
0              0     ''  ctcgcgGCGgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
1              1     ''  ctcgcgGTTgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
2              2     ''  ctcgatTACgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
3              3     ''  ctcgatTATgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
4              4     ''  ctcgatTCTgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
...          ...    ...              ...  ..          ...          ...   
1165        1165     ''  cccgcgGTTgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1166        1166     ''  cccgcgGCGgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1167        1167     ''  cccgcgGCAgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1168        1168     ''  cccgcgGCTgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1169        1169     ''  cccgcgGCCgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   

               f3   ex  
0     RSDERKRHTKI  ex-  
1     RSDERKRHTKI  ex-  
2     TSGNLVRHTKI  ex-  
3     TSGNL

In [7]:
print(len(positiveExamples))
print(len(negativeExamples))

255
915


In [8]:
combined_data = pd.concat([positiveExamples, negativeExamples])

In [9]:
pos_combined_dna_with_zinc = []
pos_dna = []
for index, row in positiveExamples.iterrows():
    pos_seq_dna_with_zinc = (row['f1'] + row['f2'] + row['f3'])
    pos_combined_dna_with_zinc.append(pos_seq_dna_with_zinc)
    pos_dna_seq = row['Dna']
    pos_dna.append(pos_dna_seq)

neg_combined_dna_with_zinc = []
neg_dna = []
for index, row in negativeExamples.iterrows():
    neg_seq_dna_with_zinc = (row['f1'] + row['f2'] + row['f3'])
    neg_combined_dna_with_zinc.append(neg_seq_dna_with_zinc)
    neg_dna_seq = row['Dna']
    neg_dna.append(neg_dna_seq)

In [10]:
def kmers(sequence, kmer_length=4):
    return[sequence[x:x+kmer_length].lower() for x in range(len(sequence) - kmer_length + 1)]

In [11]:
pos_df = pd.DataFrame(data=pos_combined_dna_with_zinc, columns=['Zinc'])
neg_df = pd.DataFrame(data=neg_combined_dna_with_zinc, columns=['Zinc'])

In [12]:
pos_df['kmers_with_zinc'] = pos_df.apply(lambda x: kmers(x['Zinc']), axis = 1)

In [13]:
neg_df['kmers_with_zinc'] = neg_df.apply(lambda x: kmers(x['Zinc']), axis = 1)

In [14]:
pos_df['Dna'] = pos_dna

In [15]:
neg_df['Dna'] = neg_dna

In [16]:
dna_combined = pd.concat([pos_df, neg_df])

In [17]:
pos_and_neg = pd.concat([pos_df, neg_df])
pos_and_neg

Unnamed: 0,Zinc,kmers_with_zinc,Dna
0,KSADLKRHIRIQRANLRAHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAAgcggcc
1,KSADLKRHIRIDSGNLRVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAACgcggcc
2,KSADLKRHIRIRSDTLSNHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAGgcggcc
3,KSADLKRHIRITTGNLTVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAATgcggcc
4,KSADLKRHIRISPADLTRHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatACAgcggcc
...,...,...,...
910,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGTTgcgtcc
911,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCGgcgtcc
912,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCAgcgtcc
913,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCTgcgtcc


In [18]:
pos_df['kmers_with_dna'] = pos_df.apply(lambda x: kmers(x['Dna']), axis = 1)
neg_df['kmers_with_dna'] = neg_df.apply(lambda x: kmers(x['Dna']), axis = 1)
pos_and_neg = pd.concat([pos_df, neg_df])
pos_and_neg

Unnamed: 0,Zinc,kmers_with_zinc,Dna,kmers_with_dna
0,KSADLKRHIRIQRANLRAHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAAgcggcc,"[ctcg, tcga, cgat, gata, ataa, taaa, aaag, aag..."
1,KSADLKRHIRIDSGNLRVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAACgcggcc,"[ctcg, tcga, cgat, gata, ataa, taac, aacg, acg..."
2,KSADLKRHIRIRSDTLSNHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAGgcggcc,"[ctcg, tcga, cgat, gata, ataa, taag, aagg, agg..."
3,KSADLKRHIRITTGNLTVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAATgcggcc,"[ctcg, tcga, cgat, gata, ataa, taat, aatg, atg..."
4,KSADLKRHIRISPADLTRHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatACAgcggcc,"[ctcg, tcga, cgat, gata, atac, taca, acag, cag..."
...,...,...,...,...
910,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGTTgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggt, ggtt, gttg, ttg..."
911,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCGgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggcg, gcgg, cgg..."
912,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCAgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggca, gcag, cag..."
913,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCTgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggct, gctg, ctg..."


In [19]:
pos_df['Class'] = 0
neg_df['Class'] = 1
pos_and_neg = pd.concat([pos_df, neg_df])
pos_and_neg

Unnamed: 0,Zinc,kmers_with_zinc,Dna,kmers_with_dna,Class
0,KSADLKRHIRIQRANLRAHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAAgcggcc,"[ctcg, tcga, cgat, gata, ataa, taaa, aaag, aag...",0
1,KSADLKRHIRIDSGNLRVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAACgcggcc,"[ctcg, tcga, cgat, gata, ataa, taac, aacg, acg...",0
2,KSADLKRHIRIRSDTLSNHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAAGgcggcc,"[ctcg, tcga, cgat, gata, ataa, taag, aagg, agg...",0
3,KSADLKRHIRITTGNLTVHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatAATgcggcc,"[ctcg, tcga, cgat, gata, ataa, taat, aatg, atg...",0
4,KSADLKRHIRISPADLTRHIRTTSGNLVRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",ctcgatACAgcggcc,"[ctcg, tcga, cgat, gata, atac, taca, acag, cag...",0
...,...,...,...,...,...
910,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGTTgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggt, ggtt, gttg, ttg...",1
911,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCGgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggcg, gcgg, cgg...",1
912,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCAgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggca, gcag, cag...",1
913,KSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[ksad, sadl, adlk, dlkr, lkrh, krhi, rhir, hir...",cccgcgGCTgcgtcc,"[cccg, ccgc, cgcg, gcgg, cggc, ggct, gctg, ctg...",1


In [20]:
kmer_with_zinc = list(pos_and_neg['kmers_with_zinc'])
for i in range(len(kmer_with_zinc)):
    kmer_with_zinc[i] = ' '.join(kmer_with_zinc[i])
#class type which would be a positive 0

#convert these using CountVectorizer / LabelEncoder ?
#le = preprocessing.LabelEncoder()
#cc = CountVectorizer()
y_kmer_with_zinc = (pos_and_neg.iloc[:,0].values)

In [21]:
kmer_without_zinc = list(pos_and_neg['kmers_with_dna'])
for i in range(len(kmer_without_zinc)):
    kmer_without_zinc[i] = ' '.join(kmer_without_zinc[i])

#convert these using CountVectorizer
#cc = CountVectorizer()
y_kmer_without_zinc = (pos_and_neg.iloc[:,2].values)
#y_kmer_without_zinc = cc.fit_transform(y_kmer_without_zinc)

In [22]:
countVectorizer = CountVectorizer(ngram_range=(5,5))
kmer_zinc = countVectorizer.fit_transform(kmer_with_zinc)
kmer_without_zinc = countVectorizer.transform(kmer_without_zinc)

In [23]:
countVectorizer.vocabulary_

{'ksad sadl adlk dlkr lkrh': 883,
 'sadl adlk dlkr lkrh krhi': 1537,
 'adlk dlkr lkrh krhi rhir': 3,
 'dlkr lkrh krhi rhir hiri': 91,
 'lkrh krhi rhir hiri iriq': 935,
 'krhi rhir hiri iriq riqr': 857,
 'rhir hiri iriq riqr iqra': 1202,
 'hiri iriq riqr iqra qran': 293,
 'iriq riqr iqra qran ranl': 608,
 'riqr iqra qran ranl anlr': 1331,
 'iqra qran ranl anlr nlra': 510,
 'qran ranl anlr nlra lrah': 1101,
 'ranl anlr nlra lrah rahi': 1133,
 'anlr nlra lrah rahi ahir': 32,
 'nlra lrah rahi ahir hirt': 1037,
 'lrah rahi ahir hirt irtt': 960,
 'rahi ahir hirt irtt rtts': 1130,
 'ahir hirt irtt rtts ttsg': 15,
 'hirt irtt rtts ttsg tsgn': 388,
 'irtt rtts ttsg tsgn sgnl': 754,
 'rtts ttsg tsgn sgnl gnlv': 1533,
 'ttsg tsgn sgnl gnlv nlvr': 1772,
 'tsgn sgnl gnlv nlvr lvrh': 1754,
 'sgnl gnlv nlvr lvrh vrht': 1599,
 'gnlv nlvr lvrh vrht rhtk': 202,
 'nlvr lvrh vrht rhtk htki': 1045,
 'lkrh krhi rhir hiri irid': 929,
 'krhi rhir hiri irid rids': 844,
 'rhir hiri irid rids idsg': 1159,
 'hiri

## After

In [25]:
labels = pos_and_neg['Class'].to_numpy().reshape(-1,1)
labels.shape

(1170, 1)

In [26]:
v1 = CountVectorizer(analyzer='char', stop_words=None, ngram_range=(2, 2))

In [27]:
x = pos_and_neg['Zinc']
zinc_finger = v1.fit_transform(x).toarray()

In [28]:
v2 = CountVectorizer(analyzer='char', stop_words=None, ngram_range=(2, 2))

In [29]:
y = pos_and_neg['Dna']
dna = v2.fit_transform(y).toarray()

In [88]:
features = np.hstack((dna,zinc_finger))
features

array([[2, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Testing Features 

In [102]:
X_train, x_test, y_train, y_test = train_test_split(features, labels)
x = pos_and_neg['Zinc']
y = pos_and_neg['Dna']

In [None]:
%%time
logistic_accuracy = []
randomForest_accuracy = []
test_numbers = list(range(1,15+1))
test_numbers2 = list(range(1,15+1))
for i in test_numbers: 
    for j in test_numbers2:
        print(i,j)
        v1 = CountVectorizer(analyzer='char', stop_words=None, ngram_range=(j, j))
        zinc_finger = v1.fit_transform(x).toarray()

        v2 = CountVectorizer(analyzer='char', stop_words=None, ngram_range=(i, i))
        dna = v2.fit_transform(y).toarray()
        
        reg = SGDClassifier().fit(X_train, y_train.ravel())
        y_pred_log = reg.predict(x_test)
        logistic_acc = accuracy_score(y_pred_log, y_test)
        logistic_accuracy.append(logistic_acc)
    
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train.ravel())
        y_pred_rf = clf.predict(x_test)
        clf_acc = accuracy_score(y_pred_rf, y_test)
        randomForest_accuracy.append(clf_acc)

1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
3 1
3 2
3 3
3 4


In [313]:
logistic_accuracy

[0.7440273037542662,
 0.764505119453925,
 0.7372013651877133,
 0.5836177474402731,
 0.7986348122866894,
 0.7167235494880546,
 0.78839590443686,
 0.726962457337884,
 0.7542662116040956,
 0.7918088737201365,
 0.7952218430034129,
 0.78839590443686,
 0.7679180887372014,
 0.7781569965870307,
 0.7952218430034129,
 0.7986348122866894,
 0.6962457337883959,
 0.757679180887372,
 0.7815699658703071,
 0.7918088737201365,
 0.7337883959044369,
 0.8020477815699659,
 0.7918088737201365,
 0.7542662116040956,
 0.7849829351535836,
 0.7986348122866894,
 0.6825938566552902,
 0.7952218430034129,
 0.726962457337884,
 0.78839590443686,
 0.7815699658703071,
 0.7815699658703071,
 0.7098976109215017,
 0.7952218430034129,
 0.8020477815699659,
 0.8020477815699659,
 0.8020477815699659,
 0.3242320819112628,
 0.7747440273037542,
 0.8054607508532423,
 0.6006825938566553,
 0.4778156996587031,
 0.8054607508532423,
 0.7918088737201365,
 0.7986348122866894,
 0.8020477815699659,
 0.5426621160409556,
 0.7849829351535836,
 0

In [314]:
randomForest_accuracy

[0.7440273037542662,
 0.764505119453925,
 0.7372013651877133,
 0.5836177474402731,
 0.7986348122866894,
 0.7167235494880546,
 0.78839590443686,
 0.726962457337884,
 0.7542662116040956,
 0.7918088737201365,
 0.7952218430034129,
 0.78839590443686,
 0.7679180887372014,
 0.7781569965870307,
 0.7952218430034129,
 0.7986348122866894,
 0.6962457337883959,
 0.757679180887372,
 0.7815699658703071,
 0.7918088737201365,
 0.7337883959044369,
 0.8020477815699659,
 0.7918088737201365,
 0.7542662116040956,
 0.7849829351535836,
 0.7986348122866894,
 0.6825938566552902,
 0.7952218430034129,
 0.726962457337884,
 0.78839590443686,
 0.7815699658703071,
 0.7815699658703071,
 0.7098976109215017,
 0.7952218430034129,
 0.8020477815699659,
 0.8020477815699659,
 0.8020477815699659,
 0.3242320819112628,
 0.7747440273037542,
 0.8054607508532423,
 0.6006825938566553,
 0.4778156996587031,
 0.8054607508532423,
 0.7918088737201365,
 0.7986348122866894,
 0.8020477815699659,
 0.5426621160409556,
 0.7849829351535836,
 0

In [188]:
from sklearn.ensemble import RandomForestClassifier

In [189]:
clf = RandomForestClassifier()

In [190]:
clf.fit(X_train, y_train.ravel())

RandomForestClassifier()

In [191]:
clf.fit(X_train, y_train.ravel())

In [192]:
acc = accuracy_score(y_pred, y_test)
print(acc)

0.8498293515358362


In [193]:
from sklearn.neural_network import MLPClassifier

In [279]:
mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,100), max_iter=200000, activation='logistic', solver='adam', alpha=0.0001).fit(X_train, y_train.ravel())

In [280]:
mlp.predict(x_test)

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [281]:
mlp.predict_proba(x_test)

array([[0.11866884, 0.88133116],
       [0.09864011, 0.90135989],
       [0.07174745, 0.92825255],
       [0.64075003, 0.35924997],
       [0.28424924, 0.71575076],
       [0.17914602, 0.82085398],
       [0.07719938, 0.92280062],
       [0.07178323, 0.92821677],
       [0.0730194 , 0.9269806 ],
       [0.16506615, 0.83493385],
       [0.07169603, 0.92830397],
       [0.07256931, 0.92743069],
       [0.07177396, 0.92822604],
       [0.07208563, 0.92791437],
       [0.07169308, 0.92830692],
       [0.08520491, 0.91479509],
       [0.0809091 , 0.9190909 ],
       [0.07223497, 0.92776503],
       [0.07352851, 0.92647149],
       [0.10378577, 0.89621423],
       [0.07166222, 0.92833778],
       [0.07636833, 0.92363167],
       [0.07256389, 0.92743611],
       [0.07231624, 0.92768376],
       [0.07199609, 0.92800391],
       [0.07185708, 0.92814292],
       [0.07288105, 0.92711895],
       [0.32406972, 0.67593028],
       [0.07425454, 0.92574546],
       [0.80283629, 0.19716371],
       [0.

In [282]:
mlp.score(x_test, y_test)

0.7952218430034129