In [1]:
import numpy
import itertools
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from scipy import stats
from Bio.Alphabet import IUPAC
#from itertools import permutations
# One-letter codes of the 20 standard amino acids, in alphabetical order.
AA = list("ACDEFGHIKLMNPQRSTVWY")

Generate all 400 dipeptides (every ordered pair of the 20 standard amino acids).

In [2]:
# Every ordered pair of residues joined into a two-letter string; the order
# follows itertools.product, i.e. the second residue varies fastest.
dipep = list(map(''.join, itertools.product(AA, repeat=2)))

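The next two cells each read a FASTA file and extract the features for every sequence in it: the 400 dipeptide frequencies followed by the isoelectric point and the instability index.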

In [3]:
# Build the feature matrix for every record in the A45 FASTA file.
# Each row holds one value per entry of `dipep` (its count divided by the
# sequence length) followed by the isoelectric point and instability index.
#
# The builtin `float` replaces the `numpy.float` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24; both denote the same 64-bit dtype.
# Rows are collected in a list and stacked once, instead of re-allocating the
# whole array with numpy.append on every record.
feature_rows = []
record_ids = []
for record in SeqIO.parse('A45_phage_orfs.txt', "fasta"):
    ll = len(record.seq)

    # X is replaced by A and J by L before the ProtParam analysis
    # (presumably because ProtParam has no values for these ambiguity codes).
    X = ProteinAnalysis(str(record.seq).replace('X', 'A').replace('J', 'L'))
    tt = [X.isoelectric_point(), X.instability_index()]

    # NOTE(review): the counts use the original (unsubstituted) sequence, and
    # Seq.count() counts non-overlapping matches like str.count(), so "AA" is
    # found once in "AAA". Left unchanged on purpose: the stored mean/std used
    # for the Z-score presumably came from the same computation -- confirm
    # against the training pipeline before altering it.
    dipep_count = [record.seq.count(i) / ll for i in dipep]

    feature_rows.append(numpy.asarray(dipep_count + tt, dtype=float))
    record_ids.append(record.id)

# Keep the original empty shapes when the file contains no records.
if feature_rows:
    arr = numpy.vstack(feature_rows)
    names = numpy.array(record_ids, dtype=object)
else:
    arr = numpy.empty((0, 402), dtype=float)
    names = numpy.empty((0, 1), dtype=object)
        

In [4]:
# Build the feature matrix for every record in the Pluteo MAC-cluster FASTA
# file. Each row holds one value per entry of `dipep` (its count divided by the
# sequence length) followed by the isoelectric point and instability index.
#
# The builtin `float` replaces the `numpy.float` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24; both denote the same 64-bit dtype.
# Rows are collected in a list and stacked once, instead of re-allocating the
# whole array with numpy.append on every record.
feature_rows2 = []
record_ids2 = []
for record in SeqIO.parse('Pluteo_MACcluster_Shikuma84ORFs.txt', "fasta"):
    ll = len(record.seq)

    # X is replaced by A and J by L before the ProtParam analysis
    # (presumably because ProtParam has no values for these ambiguity codes).
    X = ProteinAnalysis(str(record.seq).replace('X', 'A').replace('J', 'L'))
    tt = [X.isoelectric_point(), X.instability_index()]

    # NOTE(review): the counts use the original (unsubstituted) sequence, and
    # Seq.count() counts non-overlapping matches like str.count(). Left
    # unchanged on purpose: the stored mean/std used for the Z-score presumably
    # came from the same computation -- confirm before altering it.
    dipep_count = [record.seq.count(i) / ll for i in dipep]

    feature_rows2.append(numpy.asarray(dipep_count + tt, dtype=float))
    record_ids2.append(record.id)

# Keep the original empty shapes when the file contains no records.
if feature_rows2:
    arr2 = numpy.vstack(feature_rows2)
    names2 = numpy.array(record_ids2, dtype=object)
else:
    arr2 = numpy.empty((0, 402), dtype=float)
    names2 = numpy.empty((0, 1), dtype=object)

Load the mean and standard deviation of the training features; both are needed to compute the Z-score.

In [5]:
import pickle

# Load the stored mean and standard deviation that the Z-score step indexes
# per feature column.
# The files are opened in `with` blocks so the handles are closed right after
# loading (the bare open() calls left them open until garbage collection).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("dipep_new_mean.p", "rb") as mean_file:
    mean_arr = pickle.load(mean_file)
with open("dipep_new_std.p", "rb") as std_file:
    std_arr = pickle.load(std_file)

Compute the Z-score of every feature: subtract the training mean and divide by the training standard deviation.

In [6]:
# Standardise both feature matrices column by column:
#     z[i, j] = (x[i, j] - mean_arr[j]) / std_arr[j]
# NumPy broadcasting applies the per-column statistics to every row at once,
# replacing the element-by-element Python loops with the same arithmetic.
# The arrays are overwritten in place, exactly as before, so this cell must
# run only once per kernel session: re-running it would standardise the
# already-standardised values again.
feature_mean = numpy.asarray(mean_arr, dtype=float)
feature_std = numpy.asarray(std_arr, dtype=float)

arr[...] = (arr - feature_mean) / feature_std
arr2[...] = (arr2 - feature_mean) / feature_std

load model

In [7]:
# Keras imports. Of these, only load_model is called in the cells shown in
# this notebook; the layer classes and Sequential are imported but not
# referenced by name in the visible code.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import Dropout
from keras.models import load_model

# Load the saved model from the HDF5 file in the working directory.
model = load_model('di_new_model.h5')

Using TensorFlow backend.


run predictions

In [8]:
# Run the model on each feature matrix in turn (A45 set first, then the
# Pluteo set) and keep one prediction array per input.
arr_pred, arr_pred2 = (model.predict(features) for features in (arr, arr2))

generate tables

In [9]:
import pandas as pd

# Column labels for the prediction tables, one per model output column.
col_names = [
    "major capsid",
    "minor capsid",
    "baseplate",
    "major tail",
    "minor tail",
    "portal",
    "tail fiber",
    "tail shaft",
    "colar",
    "HTJ",
]

# One table per input file: rows are indexed by the record ids, columns by
# the labels above.
table1, table2 = (
    pd.DataFrame(data=scores, index=ids, columns=col_names)
    for scores, ids in ((arr_pred, names), (arr_pred2, names2))
)


In [10]:
# Display table1 with every value formatted to two decimals and the largest
# value of each row (axis=1) highlighted.
table1.style.format("{:.2f}").highlight_max(axis=1)

Unnamed: 0,major capsid,minor capsid,baseplate,major tail,minor tail,portal,tail fiber,tail shaft,colar,HTJ
1_A45,0.0,0.0,0.99,0.0,0.0,0.01,0.0,0.0,0.0,0.0
2_A45,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3_A45,0.73,0.0,0.03,0.0,0.0,0.23,0.0,0.0,0.0,0.0
4_A45,0.0,0.0,0.04,0.0,0.0,0.96,0.0,0.0,0.0,0.0
5_A45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6_A45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7_A45,0.0,0.0,0.71,0.0,0.16,0.01,0.0,0.0,0.0,0.12
8_A45,0.01,0.0,0.02,0.01,0.91,0.01,0.0,0.04,0.0,0.01
9_A45,0.16,0.05,0.01,0.0,0.03,0.0,0.56,0.01,0.02,0.17
10_A45,0.03,0.02,0.29,0.0,0.17,0.01,0.01,0.42,0.04,0.01


In [11]:
# Display table2 with every value formatted to two decimals and the largest
# value of each row (axis=1) highlighted.
table2.style.format("{:.2f}").highlight_max(axis=1)

Unnamed: 0,major capsid,minor capsid,baseplate,major tail,minor tail,portal,tail fiber,tail shaft,colar,HTJ
Ps_orf1_Proton,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,0.0
Ps_orf2_2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ps_orf3_Glutathione,0.78,0.0,0.2,0.0,0.0,0.0,0.02,0.0,0.0,0.0
Ps_orf4_D,0.93,0.0,0.04,0.0,0.0,0.01,0.02,0.0,0.0,0.0
Ps_orf5_Sodium,0.77,0.0,0.0,0.0,0.16,0.0,0.01,0.0,0.06,0.0
Ps_orf6_Putative,0.0,0.0,0.0,0.0,0.98,0.02,0.0,0.0,0.0,0.0
Ps_orf7_Aspartokinase,0.1,0.0,0.1,0.0,0.0,0.8,0.0,0.0,0.0,0.0
Ps_orf8_Na_H,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0.01,0.0,0.0
Ps_orf9_hypothetical,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Ps_orf10_hypothetical,0.01,0.07,0.0,0.02,0.03,0.44,0.04,0.0,0.31,0.08
