In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
new_data = pd.read_csv('../database-files/output.csv')

In [3]:
new_data

Unnamed: 0.1,Unnamed: 0,Source,Dna,zf,f1,f2,f3,ex
0,0,'',ctcgcgGCGgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
1,1,'',ctcgcgGTTgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
2,2,'',ctcgatTACgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
3,3,'',ctcgatTATgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
4,4,'',ctcgatTCTgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,TSGNLVRHTKI,ex-
...,...,...,...,...,...,...,...,...
1165,1165,'',cccgcgGTTgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1166,1166,'',cccgcgGCGgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1167,1167,'',cccgcgGCAgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-
1168,1168,'',cccgcgGCTgcgtcc,3,KSADLKRHIRI,TADKLSRHIRT,RSDERKRHTKI,ex-


In [4]:
positiveExamples = new_data.loc[(new_data['ex'] == "ex+") | (new_data['ex'] == "+")]
negativeExamples = new_data.loc[(new_data['ex'] == "ex-") | (new_data['ex'] == "-")]

In [5]:
print(positiveExamples)

     Unnamed: 0 Source              Dna  zf           f1           f2  \
8             8     ''  ctcgatAAAgcggcc   3  KSADLKRHIRI  QRANLRAHIRT   
9             9     ''  ctcgatAACgcggcc   3  KSADLKRHIRI  DSGNLRVHIRT   
10           10     ''  ctcgatAAGgcggcc   3  KSADLKRHIRI  RSDTLSNHIRT   
11           11     ''  ctcgatAATgcggcc   3  KSADLKRHIRI  TTGNLTVHIRT   
12           12     ''  ctcgatACAgcggcc   3  KSADLKRHIRI  SPADLTRHIRT   
..          ...    ...              ...  ..          ...          ...   
777         777     ''  cccgcgGGCgcgtcc   3  KSADLKRHIRI  QSSKLVRHIRT   
778         778     ''  cccgcgGGTgcgtcc   3  KSADLKRHIRI  TSGHLVRHIRT   
779         779     ''  cccgcgGGCgcgtcc   3  KSADLKRHIRI  TSGKLVRHIRT   
780         780     ''  cccgcgGGTgcgtcc   3  KSADLKRHIRI  TADHLSRHIRT   
781         781     ''  cccgcgGGGgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   

              f3   ex  
8    TSGNLVRHTKI  ex+  
9    TSGNLVRHTKI  ex+  
10   TSGNLVRHTKI  ex+  
11   TSGNLVRHTKI  ex+  
12 

In [6]:
print(negativeExamples)

      Unnamed: 0 Source              Dna  zf           f1           f2  \
0              0     ''  ctcgcgGCGgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
1              1     ''  ctcgcgGTTgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
2              2     ''  ctcgatTACgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
3              3     ''  ctcgatTATgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
4              4     ''  ctcgatTCTgcggcc   3  KSADLKRHIRI  RSDHLTTHIRT   
...          ...    ...              ...  ..          ...          ...   
1165        1165     ''  cccgcgGTTgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1166        1166     ''  cccgcgGCGgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1167        1167     ''  cccgcgGCAgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1168        1168     ''  cccgcgGCTgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   
1169        1169     ''  cccgcgGCCgcgtcc   3  KSADLKRHIRI  TADKLSRHIRT   

               f3   ex  
0     RSDERKRHTKI  ex-  
1     RSDERKRHTKI  ex-  
2     TSGNLVRHTKI  ex-  
3     TSGNL

In [7]:
print(len(positiveExamples))
print(len(negativeExamples))

255
915


In [8]:
combined_data = pd.concat([positiveExamples, negativeExamples])

In [9]:
pos_combined_dna_with_zinc = []
pos_dna = []
for index, row in positiveExamples.iterrows():
    pos_seq_dna_with_zinc = (row['Dna'] + row['f1'] + row['f2'] + row['f3'])
    pos_combined_dna_with_zinc.append(pos_seq_dna_with_zinc)
    pos_dna_seq = row['Dna']
    pos_dna.append(pos_dna_seq)

neg_combined_dna_with_zinc = []
neg_dna = []
for index, row in negativeExamples.iterrows():
    neg_seq_dna_with_zinc = (row['Dna'] + row['f1'] + row['f2'] + row['f3'])
    neg_combined_dna_with_zinc.append(neg_seq_dna_with_zinc)
    neg_dna_seq = row['Dna']
    neg_dna.append(neg_dna_seq)

In [10]:
pos_dna

['ctcgatAAAgcggcc',
 'ctcgatAACgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatAATgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACCgcggcc',
 'ctcgatACGgcggcc',
 'ctcgatACTgcggcc',
 'ctcgatAGAgcggcc',
 'ctcgatAGGgcggcc',
 'ctcgatAATgcggcc',
 'ctcgatATAgcggcc',
 'ctcgatATGgcggcc',
 'ctcgatATTgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatAATgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACTgcggcc',
 'ctcgatAGGgcggcc',
 'ctcgatATAgcggcc',
 'ctcgatATTgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatAGGgcggcc',
 'ctcgatAGTgcggcc',
 'ctcgatATGgcggcc',
 'ctcgatTAGgcggcc',
 'ctcgatCAAgcggcc',
 'ctcgatCAAgcggcc',
 'ctcgatCAAgcggcc',
 'ctcgatCAAgcggcc',
 'ctcgatCAAgcggcc',
 'ctcgatCCAgcggcc',
 'ctcgatCCAgcggcc',
 'ctcgatCCAgcggcc',
 'ctcgatCCAgcggcc',
 'ctcgatCCAgcggcc',
 'ctcgatCGAgcggcc',
 'ctcgatCGAgcggcc',
 'ctcgatCTAgcggcc',
 'ctcgatCACgcggcc',
 'ctcgatCACgcggcc',
 'ctcgatCACgcggcc',
 'ctcgatCACgcggcc',
 'ctcgatCACgcggcc',
 'ctcgatCCCgcggcc',
 'ctcgatCCCgcggcc',
 'ctcgatCCCgcggcc',
 'ctcgatCGCgcggcc',


In [11]:
neg_dna

['ctcgcgGCGgcggcc',
 'ctcgcgGTTgcggcc',
 'ctcgatTACgcggcc',
 'ctcgatTATgcggcc',
 'ctcgatTCTgcggcc',
 'ctcgatTTAgcggcc',
 'ctcgatTTCgcggcc',
 'ctcgatTTTgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACCgcggcc',
 'ctcgatACGgcggcc',
 'ctcgatACTgcggcc',
 'ctcgatAGAgcggcc',
 'ctcgatAGCgcggcc',
 'ctcgatAGGgcggcc',
 'ctcgatAGTgcggcc',
 'ctcgatATAgcggcc',
 'ctcgatATCgcggcc',
 'ctcgatATGgcggcc',
 'ctcgatATTgcggcc',
 'ctcgatAAAgcggcc',
 'ctcgatAACgcggcc',
 'ctcgatAATgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACCgcggcc',
 'ctcgatACTgcggcc',
 'ctcgatAGAgcggcc',
 'ctcgatAGCgcggcc',
 'ctcgatAGTgcggcc',
 'ctcgatATAgcggcc',
 'ctcgatATCgcggcc',
 'ctcgatATTgcggcc',
 'ctcgatAAAgcggcc',
 'ctcgatAACgcggcc',
 'ctcgatAAGgcggcc',
 'ctcgatACAgcggcc',
 'ctcgatACCgcggcc',
 'ctcgatACGgcggcc',
 'ctcgatACTgcggcc',
 'ctcgatAGAgcggcc',
 'ctcgatAGCgcggcc',
 'ctcgatAGGgcggcc',
 'ctcgatAGTgcggcc',
 'ctcgatATAgcggcc',
 'ctcgatATCgcggcc',
 'ctcgatATGgcggcc',
 'ctcgatATTgcggcc',
 'ctcgatAAAgcggcc',
 'ctcgatAACgcggcc',


In [12]:
def kmers(sequence, kmer_length=4):
    return[sequence[x:x+kmer_length].lower() for x in range(len(sequence) - kmer_length + 1)]

In [13]:
pos_df = pd.DataFrame(data=pos_combined_dna_with_zinc, columns=['Dna with Zinc'])
neg_df = pd.DataFrame(data=neg_combined_dna_with_zinc, columns=['Dna with Zinc'])

In [14]:
pos_df['kmers'] = pos_df.apply(lambda x: kmers(x['Dna with Zinc']), axis = 1)
pos_df['Class'] = 0

In [15]:
neg_df['kmers'] = neg_df.apply(lambda x: kmers(x['Dna with Zinc']), axis = 1)
neg_df['Class'] = 1

In [16]:
pos_and_neg = pd.concat([pos_df, neg_df])
pos_and_neg

Unnamed: 0,Dna with Zinc,kmers,Class
0,ctcgatAAAgcggccKSADLKRHIRIQRANLRAHIRTTSGNLVRHTKI,"[ctcg, tcga, cgat, gata, ataa, taaa, aaag, aag...",0
1,ctcgatAACgcggccKSADLKRHIRIDSGNLRVHIRTTSGNLVRHTKI,"[ctcg, tcga, cgat, gata, ataa, taac, aacg, acg...",0
2,ctcgatAAGgcggccKSADLKRHIRIRSDTLSNHIRTTSGNLVRHTKI,"[ctcg, tcga, cgat, gata, ataa, taag, aagg, agg...",0
3,ctcgatAATgcggccKSADLKRHIRITTGNLTVHIRTTSGNLVRHTKI,"[ctcg, tcga, cgat, gata, ataa, taat, aatg, atg...",0
4,ctcgatACAgcggccKSADLKRHIRISPADLTRHIRTTSGNLVRHTKI,"[ctcg, tcga, cgat, gata, atac, taca, acag, cag...",0
...,...,...,...
910,cccgcgGTTgcgtccKSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[cccg, ccgc, cgcg, gcgg, cggt, ggtt, gttg, ttg...",1
911,cccgcgGCGgcgtccKSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[cccg, ccgc, cgcg, gcgg, cggc, ggcg, gcgg, cgg...",1
912,cccgcgGCAgcgtccKSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[cccg, ccgc, cgcg, gcgg, cggc, ggca, gcag, cag...",1
913,cccgcgGCTgcgtccKSADLKRHIRITADKLSRHIRTRSDERKRHTKI,"[cccg, ccgc, cgcg, gcgg, cggc, ggct, gctg, ctg...",1


In [18]:
pos_lists = list(pos_df['Dna with Zinc'])
for i in range(len(pos_lists)):
    pos_lists[i] = ' '.join(pos_lists[i])
#class type which would be a positive 0
y_pos_data = pos_df.iloc[:,2].values

In [19]:
neg_lists = list(neg_df['Dna with Zinc'])
for i in range(len(neg_lists)):
    neg_lists[i] = ' '.join(neg_lists[i])
#class type which would be a negative 1 
y_neg_data= neg_df.iloc[:,2].values

In [20]:
#?
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4))
x = cv.fit_transform(pos_lists)
y = cv.transform(neg_lists)
print(x.shape)
print(y.shape)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
def metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f" % (accuracy, precision, recall))
    return accuracy, precision, recall