In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

### "Data Cleaning"

Source - the data origin

dna - DNA Sequence

zf - number of zinc fingers in protein

f1-fn - sequences of corresponding zinc finger regions

In [2]:
with open('database.txt', 'r') as file:
    data = file.read()

In [3]:
data = data.replace("source", "")
data = data.replace("dna", "")
data = data.replace("zf", "")
data = data.replace('f1', "")
data = data.replace("f2", "")
data = data.replace("f3", "")
data = data.replace("=", "")

In [4]:
z = data.split("\n")

In [5]:
textfile = open("database2.txt", "w")
for e in z:
    textfile.write(e + "\n")
textfile.close()

In [6]:
df = pd.read_fwf("database2.txt")

In [7]:
df.to_csv("output.csv", header=['Source', 'Dna', 'zf', 'f1', 'f2', 'f3', 'ex'])

## Hopefully simulating what they had in the paper

Sources:

http://www.cryst.bbk.ac.uk/education/AminoAcid/the_twenty.html

For Contacts:

01 - between amino acids a6 and nucleotide b1


02 - between amino acids a3 and nucleotide b2


03 - between amino acids a-1 and nucleotide b3


04 - between amino acids a2 and nucleotide b4



So this makes a canonical zinc finger binding model, to map each Zinc finger-DNA contact to a feature number. The contact positions are numbered from the start of the alpha-helix. 

This model is used to represent each protein-DNA complex 

In [8]:
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
base = ['a', 'c', 'g', 't']
contacts = ['01', '02', '03', '04']

In [9]:
s = []
for i in contacts:
    for j in amino_acids:
        for k in base:
            pair = i + j + k
            s.append(pair)

In [10]:
len(s)

320

Reading in the csv for the newly created database file

In [43]:
new_data = pd.read_csv('output.csv')
new_data

Unnamed: 0.1,Unnamed: 0,Source,Dna,zf,f1,f2,f3,ex
0,0,DBSFB01,ctcgcgGAAgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
1,1,DBSFB01,ctcgcgGCGgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
2,2,DBSFB01,ctcgcgGTTgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex-
3,3,DBSFB01,ctcgcgGGGgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex> 2ctcgcgGACgcggcc
4,4,DBSFB01,ctcgcgGGGgcggcc,3,KSADLKRHIRI,RSDHLTTHIRT,RSDERKRHTKI,ex> 2ctcgcgGATgcggcc
...,...,...,...,...,...,...,...,...
4077,4077,WYB95 c,tgcgTGGgcgccc 3,R,DELTRHIRI R,GNYTTHIRT R,DERKRHTKI e,Kd Kd20.0
4078,4078,WYB95 c,tGCGtgggcgccc 3,R,DELTRHIRI R,DHLTTHIRT R,DERKRHTKI e,Kd Kd6.5
4079,4079,WYB95 c,tCTGtgggcgccc 3,R,DELTRHIRI R,DHLTTHIRT R,DERKRHTKI e,Kd Kd101.0
4080,4080,WYB95 c,tGCGtgggcgccc 3,R,DELTRHIRI R,DHLTTHIRT S,GQWWRHTKI e,Kd Kd13.1


In [42]:
positiveExamples = new_data.loc[(new_data['ex'] == "ex+") | (new_data['ex'] == "+")]
negativeExamples = new_data.loc[(new_data['ex'] == "ex-") | (new_data['ex'] == "-")]
comparativeExamples = new_data.loc[new_data['ex'] == "ex>"]

In [45]:
print(len(positiveExamples))
print(len(negativeExamples))

98
689


### Experiments

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#### Experimental Setup

In [14]:
#creating the 80 categories
categories = []
for i in amino_acids:
    for j in base:
        pair = i + j
        categories.append(pair)

In [15]:
len(categories)

80

In [53]:
features

Unnamed: 0,Aa,Ac,Ag,At,Ca,Cc,Cg,Ct,Da,Dc,...,Vg,Vt,Wa,Wc,Wg,Wt,Ya,Yc,Yg,Yt
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
76,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [54]:
features['categories'] = categories

In [55]:
features

Unnamed: 0,Aa,Ac,Ag,At,Ca,Cc,Cg,Ct,Da,Dc,...,Vt,Wa,Wc,Wg,Wt,Ya,Yc,Yg,Yt,categories
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Aa
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ac
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ag
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,At
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Ra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Yt
76,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Va
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Vc
78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Vg


In [22]:
df_train, df_test = train_test_split(features)
print(df_train.shape)
print(df_test.shape)

(60, 81)
(20, 81)


### MLP Classifier 

In [48]:
from sklearn.neural_network import MLPClassifier

In [56]:
x = features.drop('categories', axis=1)
y = features['categories']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(x, y)

#### Scaling Data because of sensitivity ... Thanks Dr.Bukowy on the wise words

In [59]:
scaler = StandardScaler()