In [20]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from sklearn.preprocessing import LabelEncoder
from collections import Counter

from keras import layers, models, optimizers
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw genome table from the Excel workbook beside the notebook.
df = pd.read_excel(io="./RawData.xlsx")
# Show the first rows as a quick check of the parsed columns.
df.head(n=5)

Unnamed: 0,LocusName,SourceOrganism,Source,LocusSequenceLength,Sequence
0,NC_018874,Duplodnaviria,Abalone herpesvirus Victoria/AUS/2009,211518,actcgtatgaactttgactggtttttggggcgcgagagtttggttt...
1,NC_011646,Duplodnaviria,Abalone shriveling syndrome-associated virus,34952,ctatttaactaatttagtattgtttgttgttttcggttgagtcaat...
2,NC_001341,Duplodnaviria,Acholeplasma virus MV-L51,4491,ggccttaaagcttttagagaactctttttgcagtaaaaagcacaag...
3,NC_028834,Duplodnaviria,Achromobacter phage 83-24,48216,gcacttcatgcagcatgaacatggacaatgtcccaaactgggactt...
4,NC_023556,Duplodnaviria,Achromobacter phage JWAlpha,72329,acacacccccccggtgtcttgctcctgtgcacctctacccaccccc...


In [5]:
# Number of rows in the raw table.
df.shape[0]

11540

In [3]:
# Tally how many rows carry each SourceOrganism value.
organism_labels = df['SourceOrganism']
organism_labels.value_counts()

SourceOrganism
Riboviria        5069
Duplodnaviria    3643
Monodnaviria     1084
Varidnaviria      988
unclassified      756
Name: count, dtype: int64

In [7]:
# Keep only rows whose SourceOrganism is something other than 'unclassified'.
is_classified = ~df['SourceOrganism'].eq('unclassified')
df_without_unclassified = df.loc[is_classified]

# Print a preview, the per-class counts and the remaining row count, in that order.
for summary in (
    df_without_unclassified.head(),
    df_without_unclassified['SourceOrganism'].value_counts(),
    len(df_without_unclassified),
):
    print(summary)

# From here on `df` refers to the filtered table.
df = df_without_unclassified

   LocusName SourceOrganism                                        Source   
0  NC_018874  Duplodnaviria         Abalone herpesvirus Victoria/AUS/2009  \
1  NC_011646  Duplodnaviria  Abalone shriveling syndrome-associated virus   
2  NC_001341  Duplodnaviria                     Acholeplasma virus MV-L51   
3  NC_028834  Duplodnaviria                     Achromobacter phage 83-24   
4  NC_023556  Duplodnaviria                   Achromobacter phage JWAlpha   

   LocusSequenceLength                                           Sequence  
0               211518  actcgtatgaactttgactggtttttggggcgcgagagtttggttt...  
1                34952  ctatttaactaatttagtattgtttgttgttttcggttgagtcaat...  
2                 4491  ggccttaaagcttttagagaactctttttgcagtaaaaagcacaag...  
3                48216  gcacttcatgcagcatgaacatggacaatgtcccaaactgggactt...  
4                72329  acacacccccccggtgtcttgctcctgtgcacctctacccaccccc...  
SourceOrganism
Riboviria        5069
Duplodnaviria    3643
Monodnaviria     1084


In [13]:
# Build the image tensor X and the integer label vector y from ./images.
#
# Each filename is expected to end in "_<ClassName>.<ext>"; the class name is
# taken from the text after the last underscore, up to the first dot.
classes = np.array(list(df['SourceOrganism']))
folder_path = "./images"

# Per-class cap so every class contributes at most the same number of images.
# 988 matches the smallest remaining class (Varidnaviria) in the counts above.
MAX_IMAGES_PER_CLASS = 988

mapping = {
    'Riboviria': 0,
    'Duplodnaviria': 1,
    'Monodnaviria': 2,
    'Varidnaviria': 3
}

X = []
y_temp = []

# Counts accepted images per class name (without the file extension, so
# files with different extensions share one quota).
counter = Counter()

# Files that were ignored, kept so data problems are visible, not silent.
skipped_files = []

# os.listdir returns entries in arbitrary order; sorting makes the subset
# selected under the cap identical on every run.
image_filenames = sorted(os.listdir(folder_path))

for filename in image_filenames:
    image_class = filename.split('_')[-1]
    label_name = image_class.split('.')[0]

    # Check the label BEFORE touching X so X and y can never get out of step,
    # and so stray files (hidden files, other classes) do not raise KeyError.
    if label_name not in mapping:
        skipped_files.append(filename)
        continue

    if counter[label_name] >= MAX_IMAGES_PER_CLASS:
        continue

    image = cv2.imread(os.path.join(folder_path, filename))
    if image is None:
        # cv2.imread reports an unreadable file by returning None, which
        # would otherwise make cv2.resize raise.
        skipped_files.append(filename)
        continue

    X.append(cv2.resize(image, (64, 64)))
    y_temp.append(mapping[label_name])
    counter[label_name] += 1

if skipped_files:
    print(f'skipped {len(skipped_files)} file(s) with unknown label or unreadable image')

X = np.array(X)
y = np.array(y_temp)

In [24]:
# Shuffle images and labels with one shared permutation so every image keeps
# its label. A seeded generator replaces the unseeded np.random.permutation
# call: without it the sample order (and therefore the train/test split that
# follows) differed on every run.
# Note: re-running only this cell permutes the already-shuffled arrays again.
shuffle_rng = np.random.default_rng(42)
permutation_index = shuffle_rng.permutation(len(X))

X = X[permutation_index]
y = y[permutation_index]

In [25]:
# Hold out 20% of the samples; the fixed random_state makes the split
# repeatable for a given input order.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [26]:
def create_cnn_model(input_shape=(64, 64, 3), num_classes=4):
    """Build the (uncompiled) CNN image classifier.

    The network multiplies its input by 1/255, applies three stride-2
    convolutions, then dropout and max-pooling stages, and ends in a dense
    hidden layer followed by a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to the InputLayer.
            Defaults to (64, 64, 3), the size the images are resized to.
        num_classes: Number of units in the softmax output layer.
            Defaults to 4.

    Returns:
        A keras Sequential model. The caller is responsible for compiling it.
    """
    model = models.Sequential()

    model.add(layers.InputLayer(input_shape=input_shape))
    # Scale inputs by 1/255 inside the model so callers can feed raw images.
    model.add(layers.Rescaling(1./255))

    # Convolutional layers; each uses stride 2 with 'same' padding, so the
    # striding (not pooling) does the first three rounds of downsampling.
    model.add(Conv2D(32, (7, 7), activation='relu', padding='same', strides=2))
    model.add(Conv2D(64, (5, 5), activation='relu', padding='same', strides=2))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same', strides=2))

    model.add(Dropout(0.2))

    # Two pooling stages with a second dropout in between (order kept as-is).
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.4))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())

    model.add(Dense(128, activation='relu'))

    # One output unit per class; softmax yields class probabilities.
    model.add(Dense(num_classes, activation='softmax'))

    return model

In [27]:
# Build a fresh, not-yet-compiled network.
model = create_cnn_model()
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_1 (Rescaling)     (None, 64, 64, 3)         0         
                                                                 
 conv2d_3 (Conv2D)           (None, 32, 32, 32)        4736      
                                                                 
 conv2d_4 (Conv2D)           (None, 16, 16, 64)        51264     
                                                                 
 conv2d_5 (Conv2D)           (None, 8, 8, 64)          36928     
                                                                 
 dropout_2 (Dropout)         (None, 8, 8, 64)          0         
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 4, 4, 64)         0         
 2D)                                                             
                                                      

In [28]:
# Training configuration. The sparse loss is used with the integer class ids
# stored in y (no one-hot encoding is done in this notebook).
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [29]:
# Train for 50 epochs, reporting loss/accuracy on the held-out split after
# each epoch. NOTE(review): this is the same split the evaluation cell scores.
held_out = (X_test, y_test)
history = model.fit(X_train, y_train, epochs=50, validation_data=held_out)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
# Score the trained model on the held-out split. With the compile settings
# used in this notebook, index 0 is the loss and index 1 the accuracy.
scores = model.evaluate(x=X_test, y=y_test, verbose=1)
print(f'test accuracy is {scores[1]}')

test accuracy is 0.8950695395469666


In [32]:
# Create the output directory from Python instead of the `!mkdir -p` shell
# escape, which only works inside IPython and depends on the host shell
# supporting `mkdir -p`. exist_ok=True keeps the cell safe to re-run.
os.makedirs('saved_model', exist_ok=True)
model.save('./saved_model/4_class_classification_model.keras')