In [44]:
import pandas
import numpy
import matplotlib
import seaborn
import sklearn
import keras
import sys

print("Python : {}".format(sys.version))
print("Numpy : {}".format(numpy.__version__))
print("Pandas : {}".format(pandas.__version__))
print("Matplotlib : {}".format(matplotlib.__version__))
print("Sklearn : {}".format(sklearn.__version__))
print("Seaborn : {}".format(seaborn.__version__))
print("Keras : {}".format(keras.__version__))

Python : 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) 
[GCC 7.2.0]
Numpy : 1.15.2
Pandas : 0.23.4
Matplotlib : 3.0.0
Sklearn : 0.20.0
Seaborn : 0.9.0
Keras : 2.2.4


In [2]:
import pandas as pd
import numpy as np

# UCI repository dataset molecular biology (gene sequence promotion) dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data"
names = ['Class','Instance_name','Sequential_nucleotide']

# Loading the dataset
data = pd.read_csv(url,names = names)

In [4]:
data.head()

Unnamed: 0,Class,Instance_name,Sequential_nucleotide
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [7]:
print(data.shape)

(106, 3)


In [8]:
#Sample data
print(data.iloc[10])

Class                                                                    +
Instance_name                                                     RRNAB_P2
Sequential_nucleotide    \tgcaaaaataaatgcttgactctgtagcgggaaggcgtattatgc...
Name: 10, dtype: object


In [9]:
# Observations-
# \t which was actually a tab between the instance_name and the sequence in the original data ,
# interpreted here as \t
# Need to be removed, also , id column is not useful in the machine learning sense

# Finally, building a dataset using custom pandas dataframe , each column in a dataframe is called a series

classes = data.loc[:,'Class']
print(classes[:10])

0    +
1    +
2    +
3    +
4    +
5    +
6    +
7    +
8    +
9    +
Name: Class, dtype: object


In [12]:
# Generating list of DNA sequences and looping through them to observe individual nucleotides
sequences = list(data.loc[:,'Sequential_nucleotide'])
dataset = {}

# Looping through the sequences and splitting into individual nucleotides also removing tabs

for i,seq in enumerate(sequences):
    nucleotides = list(seq)
    nucleotides = [x for  x in nucleotides if x!='\t']
    
    
    #appending class to the sequence
    nucleotides.append(classes[i])
    
    #add to  dataset
    dataset[i] = nucleotides
    
    
#Looking at the list created (custom dataframe to  be created later)
print(dataset[0])

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [26]:
#Class has been added at the end
# Finallly turning this list back into the dataframe

df = pd.DataFrame(dataset)
print(df)
# We have a wrong alignment (observing the class column values apperaing as a row, thus we need to  transpose, 



   0   1   2   3   4   5   6   7   8   9   ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c ...   c   t   g   t  

In [27]:
# Transposing the dataframe
df1=df.transpose()
print(df1)
print(df1.shape)

    0  1  2  3  4  5  6  7  8  9  ... 48 49 50 51 52 53 54 55 56 57
0    t  a  c  t  a  g  c  a  a  t ...  g  c  t  t  g  t  c  g  t  +
1    t  g  c  t  a  t  c  c  t  g ...  c  a  t  c  g  c  c  a  a  +
2    g  t  a  c  t  a  g  a  g  a ...  c  a  c  c  c  g  g  c  g  +
3    a  a  t  t  g  t  g  a  t  g ...  a  a  c  a  a  a  c  t  c  +
4    t  c  g  a  t  a  a  t  t  a ...  c  c  g  t  g  g  t  a  g  +
5    a  g  g  g  g  c  a  a  g  g ...  c  g  t  t  t  a  g  g  t  +
6    c  a  g  g  g  g  g  t  g  g ...  a  t  c  a  t  g  a  a  t  +
7    t  t  t  c  t  a  c  a  a  a ...  a  a  c  a  g  a  a  c  a  +
8    c  g  a  c  t  t  a  a  t  a ...  a  a  a  t  g  g  t  t  t  +
9    t  t  t  t  a  a  a  t  t  t ...  c  c  a  c  t  g  a  c  a  +
10   g  c  a  a  a  a  a  t  a  a ...  c  c  c  g  c  g  c  c  g  +
11   c  c  t  g  a  a  a  t  t  c ...  c  c  t  c  g  c  g  a  c  +
12   g  a  t  c  a  a  a  a  a  a ...  c  c  g  t  t  g  a  g  a  +
13   c  t  g  c  a  a  t  t  t  t ...  c  c  a  

In [31]:
df1.rename(columns = { 57 : 'Class'} , inplace = True)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [32]:
# More information about the data
df1.describe()
# Greatly balanced dataset with equal number of + and - shown by 53 frequency of + and 106 total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,t,t,+
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [35]:
# We can't work with alphabets, thus we need to convert them into numbers
# Recording value counts for  each sequence to have a greater understanding of the data

series = []
for name in df1.columns:
    series.append(df1[name].value_counts())
    
info = pd.DataFrame(series)
details = info.transpose()
print(details)

      0     1     2     3     4     5     6     7     8     9  ...      48  \
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...    21.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...    36.0   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...    23.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...    26.0   
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...     NaN   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...     NaN   

     49    50    51    52    53    54    55    56  Class  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  

[6 rows x 58 columns]


In [37]:
# Converting to numerical data using pd.get_dummies() function

num_df = pd.get_dummies(df1)
num_df.iloc[:5]

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [38]:
# Removing redundancy of class column, removing class -(negative)
df=num_df.drop(columns=['Class_-'])
df.rename(columns={'Class_+':'Class'} , inplace=True) # inplace = true so that data doesnt get shuffled


In [41]:
# Looking at the final dataset 
#  first a row only, then the whole
print(df.loc[10])
df.head()
# We can also eliminate all the t variable columns as if it is not a,c,g then it must be t

0_a      0
0_c      0
0_g      1
0_t      0
1_a      0
1_c      1
1_g      0
1_t      0
2_a      1
2_c      0
2_g      0
2_t      0
3_a      1
3_c      0
3_g      0
3_t      0
4_a      1
4_c      0
4_g      0
4_t      0
5_a      1
5_c      0
5_g      0
5_t      0
6_a      1
6_c      0
6_g      0
6_t      0
7_a      0
7_c      0
        ..
49_t     0
50_a     0
50_c     1
50_g     0
50_t     0
51_a     0
51_c     0
51_g     1
51_t     0
52_a     0
52_c     1
52_g     0
52_t     0
53_a     0
53_c     0
53_g     1
53_t     0
54_a     0
54_c     1
54_g     0
54_t     0
55_a     0
55_c     1
55_g     0
55_t     0
56_a     0
56_c     0
56_g     1
56_t     0
Class    1
Name: 10, Length: 229, dtype: uint8


Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,54_t,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class
0,0,0,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,0,0,1,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,1
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1


In [48]:
# Data Preprocessed, now Machine Learning
# Importing Packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Metrics for evaluation
from sklearn.metrics import classification_report , accuracy_score

In [53]:
# Training and test set seperation
from sklearn.model_selection import train_test_split

X=np.array(df.drop(['Class'],axis=1))
Y=np.array(df['Class'])

# Defining a constant seed althroughout the program
seed=1

#Splitting the dataset
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.25,random_state=seed)

In [64]:
from sklearn import model_selection
# Define Scoring method accuracy
scoring='accuracy'

# Defining the models
model_names = ['K Nearest Neighbors','Gaussian Process','Decision Tree','Random Forest','Neural Network','Adaboost'
        ,'Naive Bayes','SVM Linear','SVM RBF','SVM Sigmoid']
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10,max_features = 1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
             ]
models=zip(model_names,classifiers)

#Evaluate each model together and store thhe results
names=[]
results=[]
#names=[]

for name,model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model , X_train , Y_train , cv = kfold , scoring = scoring)
    results.append(cv_results)
    names.append(name)
    msg ="{0}' : {1} ({2})".format(name,cv_results.mean(),cv_results.std())
    print(msg)


K Nearest Neighbors' : 0.8232142857142858 (0.11390841738440759)
Gaussian Process' : 0.8732142857142857 (0.05615780426255853)
Decision Tree' : 0.75 (0.2091650066335189)
Random Forest' : 0.5982142857142857 (0.20535714285714285)




Neural Network' : 0.8625 (0.10383279828647594)
Adaboost' : 0.925 (0.11456439237389601)
Naive Bayes' : 0.8375 (0.1375)
SVM Linear' : 0.85 (0.10897247358851683)
SVM RBF' : 0.7375 (0.11792476415070755)
SVM Sigmoid' : 0.5696428571428571 (0.1592092225048921)




In [66]:
# Maximum Adaboost, now testing the dataset on the validation(test set)
# Defining the models
model_names = ['K Nearest Neighbors','Gaussian Process','Decision Tree','Random Forest','Neural Network','Adaboost'
        ,'Naive Bayes','SVM Linear','SVM RBF','SVM Sigmoid']
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10,max_features = 1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
             ]
models=zip(model_names,classifiers)
for name,model in models:
    model.fit(X_train,Y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(Y_test,predictions))
    print(classification_report(Y_test,predictions))

K Nearest Neighbors
0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.65      0.79        17
           1       0.62      1.00      0.77        10

   micro avg       0.78      0.78      0.78        27
   macro avg       0.81      0.82      0.78        27
weighted avg       0.86      0.78      0.78        27

Gaussian Process
0.8888888888888888
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        17
           1       0.77      1.00      0.87        10

   micro avg       0.89      0.89      0.89        27
   macro avg       0.88      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27

Decision Tree
0.7777777777777778
              precision    recall  f1-score   support

           0       1.00      0.65      0.79        17
           1       0.62      1.00      0.77        10

   micro avg       0.78      0.78      0.78        27
   macro avg       0



In [None]:
# Linear SVM best for this dataset
# f1 score best metric to evaluate the model