Load the dataset

In [94]:
import pandas as pd
import numpy as np

# Read the raw splice-junction file. It has no header row, so pandas assigns
# the integer column labels 0, 1 and 2.
df = pd.read_csv("splice.data", sep=",", header=None)

df.head()

Unnamed: 0,0,1,2
0,EI,ATRINS-DONOR-521,CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...
1,EI,ATRINS-DONOR-905,AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...
2,EI,BABAPOE-DONOR-30,GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...
3,EI,BABAPOE-DONOR-867,GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...
4,EI,BABAPOE-DONOR-2817,GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...


In [95]:
# Per-column summary; for these string columns pandas reports count, unique, top and freq.
df.describe()

Unnamed: 0,0,1,2
count,3190,3190,3190
unique,3,3178,3092
top,N,HUMMYLCA-ACCEPTOR-2481,CAAATTGTGGACGTGATTCCCTTTCTCAGGGTGAG...
freq,1655,2,3


In [96]:
# Keep the raw sequence strings (column 2) before removing that column.
dna_seq = df[2].tolist()

# DataFrame.drop returns a new frame, so the result has to be assigned;
# otherwise columns 1 (identifiers such as "ATRINS-DONOR-521") and 2 (the
# sequence strings) are never actually removed from `df`.
df = df.drop(columns=[1, 2])
df

Unnamed: 0,0
0,EI
1,EI
2,EI
3,EI
4,EI
...,...
3185,N
3186,N
3187,N
3188,N


In [97]:
# Expand every sequence string into one column per nucleotide position
# (columns 1..SEQ_LEN). The last SEQ_LEN characters of each string are used,
# which matches indexing from the end of the string with negative offsets.
SEQ_LEN = 60

# Fail loudly instead of silently padding with missing values.
if any(len(seq) < SEQ_LEN for seq in dna_seq):
    raise ValueError(f"every sequence must have at least {SEQ_LEN} characters")

# Build all positional columns at once; the number of rows comes from the data
# rather than from a hard-coded count.
nucleotides = pd.DataFrame(
    [list(seq[-SEQ_LEN:]) for seq in dna_seq],
    index=df.index,
    columns=range(1, SEQ_LEN + 1),
)

# Keep the label column (0) and attach the positional columns next to it.
df = pd.concat([df[[0]], nucleotides], axis=1)

In [98]:
# Check the expanded frame: label in column 0, one nucleotide per column 1..60.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,EI,C,C,A,G,C,T,G,C,A,...,A,G,C,C,A,G,T,C,T,G
1,EI,A,G,A,C,C,C,G,C,C,...,G,T,G,C,C,C,C,C,G,C
2,EI,G,A,G,G,T,G,A,A,G,...,C,A,C,G,G,G,G,A,T,G
3,EI,G,G,G,C,T,G,C,G,T,...,G,G,T,T,T,T,C,C,C,C
4,EI,G,C,T,C,A,G,C,C,C,...,C,C,T,T,G,A,C,C,C,T


In [99]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column with its own, freshly fitted encoder. Column 0 is
# the class label and columns 1..60 are the nucleotide positions. The encoders
# are not kept, so the reports further down show integer classes rather than
# the original label strings.
for col in range(61):
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Separate the target from the features, then hold out 25% of the rows.
Y = df[0]
df = df.drop(columns=0)
x_train, x_test, y_train, y_test = train_test_split(
    df,
    Y,
    test_size=0.25,
    random_state=100,
)

In [101]:
# Print the shape of each split in the order: train features, train labels,
# test features, test labels.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(2392, 60)
(2392,)
(798, 60)
(798,)


Simple Decision Tree

In [102]:
from sklearn.tree import DecisionTreeClassifier

# Gini-impurity tree, limited to depth 5 with at least 10 samples per leaf.
tree_obj = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=10,
    random_state=100,
)

tree_obj.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=100)

In [103]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

y_pred = tree_obj.predict(x_test)

# Compute the three metrics first, then print them with their labels.
cm = confusion_matrix(y_test, y_pred)
acc_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred)

print("confusion matrix ", cm)
print("accuracy", acc_percent)
print("report ", report)

confusion matrix  [[190   1   9]
 [ 26 163   6]
 [ 19  25 359]]
accuracy 89.22305764411027
report                precision    recall  f1-score   support

           0       0.81      0.95      0.87       200
           1       0.86      0.84      0.85       195
           2       0.96      0.89      0.92       403

    accuracy                           0.89       798
   macro avg       0.88      0.89      0.88       798
weighted avg       0.90      0.89      0.89       798



In [105]:
from sklearn.tree import DecisionTreeClassifier

# Same tree limits as the gini model (depth 5, at least 10 samples per leaf),
# but splits are chosen by entropy.
tree_obj = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=5,
    min_samples_leaf=10,
    random_state=100,
)

tree_obj.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=10,
                       random_state=100)

In [106]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

y_pred = tree_obj.predict(x_test)

# Compute the three metrics first, then print them with their labels.
cm = confusion_matrix(y_test, y_pred)
acc_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred)

print("confusion matrix ", cm)
print("accuracy", acc_percent)
print("report ", report)

confusion matrix  [[180   1  19]
 [ 26 163   6]
 [  7  25 371]]
accuracy 89.47368421052632
report                precision    recall  f1-score   support

           0       0.85      0.90      0.87       200
           1       0.86      0.84      0.85       195
           2       0.94      0.92      0.93       403

    accuracy                           0.89       798
   macro avg       0.88      0.89      0.88       798
weighted avg       0.90      0.89      0.89       798



In [123]:
# Candidate values for the decision-tree grid search.
# Depths 4..10 and leaf sizes 4..14, kept as lists of NumPy integers.
max_depth = list(np.arange(4, 11))
min_samples_leaf = list(np.arange(4, 15))
criterion = ["gini", "entropy"]
random_state = [100]

# Parameter name -> list of values to try.
para_dict = dict(
    criterion=criterion,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    random_state=random_state,
)

In [124]:
from sklearn.model_selection import GridSearchCV

# Search para_dict exhaustively with 6-fold cross-validation. The fitted search
# object is stored under the same name and is used directly for prediction.
tree_obj = GridSearchCV(DecisionTreeClassifier(), para_dict, cv=6)
tree_obj.fit(x_train, y_train)

y_pred = tree_obj.predict(x_test)

# Compute the three metrics first, then print them with their labels.
cm = confusion_matrix(y_test, y_pred)
acc_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred)

print("confusion matrix ", cm)
print("accuracy", acc_percent)
print("report ", report)

confusion matrix  [[184   5  11]
 [  8 175  12]
 [ 10  17 376]]
accuracy 92.10526315789474
report                precision    recall  f1-score   support

           0       0.91      0.92      0.92       200
           1       0.89      0.90      0.89       195
           2       0.94      0.93      0.94       403

    accuracy                           0.92       798
   macro avg       0.91      0.92      0.92       798
weighted avg       0.92      0.92      0.92       798



In [125]:
# Parameter combination that achieved the best cross-validated score in the grid search.
tree_obj.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 6,
 'random_state': 100}

In [127]:
# Search space for the random-forest grid search. The depth and leaf-size
# values appear to be centred on the best single-tree settings reported
# earlier (max_depth=7, min_samples_leaf=6).
rf_params_dict = {
    "n_estimators": [1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [6, 7, 8],  # the trailing comma is required; without it the cell is a SyntaxError
    "min_samples_leaf": [6],
    "random_state": [100],
}

In [141]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search the random forest over rf_params_dict. No cv argument is passed,
# so GridSearchCV uses its default number of folds.
rf_obj = GridSearchCV(RandomForestClassifier(), rf_params_dict)
rf_obj.fit(x_train, y_train)

y_pred = rf_obj.predict(x_test)

# Compute the three metrics first, then print them with their labels.
cm = confusion_matrix(y_test, y_pred)
acc_percent = accuracy_score(y_test, y_pred) * 100
report = classification_report(y_test, y_pred)

print("confusion matrix ", cm)
print("accuracy", acc_percent)
print("report ", report)

confusion matrix  [[191   2   7]
 [  6 182   7]
 [  5   9 389]]
accuracy 95.48872180451127
report                precision    recall  f1-score   support

           0       0.95      0.95      0.95       200
           1       0.94      0.93      0.94       195
           2       0.97      0.97      0.97       403

    accuracy                           0.95       798
   macro avg       0.95      0.95      0.95       798
weighted avg       0.95      0.95      0.95       798



In [143]:
# Mean cross-validated score of the best parameter combination (not the held-out test accuracy).
rf_obj.best_score_

0.9569422000157232