In [None]:
from gensim import models
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import keras
from keras import layers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as tts
from keras.utils import np_utils
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
def get_kmers(sequences, kmer=4):
    """Split every sequence of a Series into its overlapping k-mers.

    Parameters
    ----------
    sequences : pd.Series
        Series of sliceable sequences (e.g. strings).
    kmer : int, default 4
        Window length; must be at least 1.

    Returns
    -------
    pd.Series
        Object-dtype Series with the same index and name as ``sequences``.
        Each element is the list of windows ``seq[i:i + kmer]`` taken left to
        right with stride 1, i.e. ``len(seq) - kmer + 1`` items (an empty list
        when the sequence is shorter than ``kmer``).

    Raises
    ------
    ValueError
        If ``kmer`` is smaller than 1.
    """
    if kmer < 1:
        raise ValueError("kmer size must be at least 1")

    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    kmer_lists = [
        [seq[start:start + kmer] for start in range(len(seq) - kmer + 1)]
        for _, seq in sequences.items()
    ]
    # Build a fresh object Series rather than assigning lists into a copy of
    # the input: that assignment depends on the input dtype and on unique labels.
    return pd.Series(
        kmer_lists, index=sequences.index, name=sequences.name, dtype=object
    )

def get_2d_kmer(seqs, mnm, mxm):
    """Tokenise each sequence once per k-mer size in ``mnm..mxm`` (inclusive).

    Parameters
    ----------
    seqs : pd.Series
        Series of sequences accepted by ``get_kmers``.
    mnm, mxm : int
        Smallest and largest k-mer size.

    Returns
    -------
    pd.Series
        Series with a fresh default index (the index of ``seqs`` is not kept).
        Element ``n`` is a list with one entry per size ``k``; each entry is
        the k-mer list of the n-th sequence for that ``k``. The per-size lists
        have different lengths, because larger ``k`` gives fewer windows.
    """
    sizes = range(mnm, mxm + 1)
    per_sequence = []
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for _, val in seqs.items():
        per_sequence.append(
            [list(get_kmers(pd.Series([val]), kmer=k))[0] for k in sizes]
        )
    return pd.Series(per_sequence, dtype=object)

def dup_vecs(dfs):
    """Stretch each frame horizontally by repeating every value in place.

    The frame at position ``p`` of ``dfs`` has each of its values repeated
    ``4 ** (2 - p)`` times consecutively within its row, so positions 0, 1
    and 2 are repeated 16, 4 and 1 times. The repeat count of each frame is
    printed as it is processed.

    Parameters
    ----------
    dfs : iterable of pd.DataFrame

    Returns
    -------
    list of pd.DataFrame
        One new frame per input frame, in the same order.
    """
    stretched = []
    for position, frame in enumerate(dfs):
        repeats = 4 ** (2 - position)
        print(repeats)
        rows = [
            [value for value in row for _ in range(repeats)]
            for row in frame.values.tolist()
        ]
        stretched.append(pd.DataFrame(rows))
    return stretched

def vectorize_1d(X, kmer, model):
    """Embed every sequence as a matrix of k-mer vectors.

    Parameters
    ----------
    X : pd.Series
        Series of sequences accepted by ``get_kmers``.
    kmer : int
        k-mer size used to tokenise each sequence.
    model : object
        Anything exposing ``model.wv[token]`` that returns the vector of a
        k-mer (in this notebook, a gensim Word2Vec model).

    Returns
    -------
    np.ndarray
        float32 array with a trailing axis of length 1 appended to the stacked
        embeddings. Assumes all sequences have the same length so that the
        per-sequence matrices stack into one regular array — TODO confirm for
        new datasets.
    """
    # get_kmers takes the sequences first and the window size second.
    kmer_series = get_kmers(X, kmer)
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    embedded = [
        [model.wv[token] for token in tokens]
        for _, tokens in kmer_series.items()
    ]
    stacked = np.array(embedded)
    return stacked.reshape(*stacked.shape, 1).astype('float32')


def vectorize_2d(X, mnm, mxm, model):
    """Embed every sequence as a stack of k-mer matrices, one per k in ``mnm..mxm``.

    Parameters
    ----------
    X : pd.Series
        Series of sequences accepted by ``get_2d_kmer``.
    mnm, mxm : int
        Smallest and largest k-mer size (inclusive).
    model : object
        Anything exposing ``model.wv[token]`` and ``model.wv.vector_size``
        (in this notebook, a gensim Word2Vec model).

    Returns
    -------
    np.ndarray
        float32 array of shape
        ``(len(X), mxm - mnm + 1, n_windows, vector_size, 1)``, where
        ``n_windows`` is the trimmed row length of the first sequence.
        Assumes every sequence has the same length, at least ``mxm``;
        otherwise the per-sequence assignment fails on a shape mismatch.
    """
    kmer_rows = get_2d_kmer(X, mnm, mxm)

    # Smaller k gives more windows. Drop the trailing (mxm - k) windows of every
    # row except the last (k == mxm) so that all rows of a sequence end up with
    # the same length. k is read from the first k-mer of the row.
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for _, rows in kmer_rows.items():
        for row in rows[:-1]:
            del row[-(mxm - len(row[0])):]

    # Size the output from the data and the model instead of fixed constants.
    n_seqs = len(kmer_rows)
    n_sizes = mxm - mnm + 1
    n_windows = len(kmer_rows.iloc[0][0]) if n_seqs else 0
    vector_size = model.wv.vector_size

    # Allocate float32 directly: avoids a float64 buffer plus a converting copy.
    out = np.zeros((n_seqs, n_sizes, n_windows, vector_size), dtype='float32')
    # Fill by position so the result does not depend on index labels.
    for position, (_, rows) in enumerate(kmer_rows.items()):
        out[position] = [[model.wv[token] for token in row] for row in rows]

    # Append the single channel axis.
    return out.reshape(*out.shape, 1)

# Single Adam optimizer instance (learning rate 5e-4) that both cnn2d and cnn3d
# pass to model.compile().
opt = keras.optimizers.Adam(learning_rate=0.0005)

def cnn2d(input_shape, num_classes):
    """Build and compile the 2-D convolutional classifier.

    Layer order: Dropout(0.1) -> Conv2D(32 filters, kernel size 77) -> softsign
    -> Dropout(0.1) -> BatchNormalization -> MaxPooling2D(2) -> Flatten
    -> Dense(16) -> softsign -> Dense(num_classes) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample (without the batch axis).
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    keras.Sequential
        Model compiled with categorical cross-entropy, the module-level
        ``opt`` optimizer and the accuracy metric.
    """
    stack = [
        layers.Dropout(0.1, input_shape=input_shape),
        layers.Conv2D(32, 77, input_shape=input_shape),
        layers.Activation(activation='softsign'),
        layers.Dropout(0.1),
        layers.BatchNormalization(),
        layers.MaxPooling2D(2),
        layers.Flatten(),
        layers.Dense(16),
        layers.Activation(activation='softsign'),
        layers.Dense(num_classes),
        layers.Activation(activation='softmax'),
    ]
    network = keras.Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy'],
    )
    return network

def cnn3d(input_shape, num_classes):
    """Build and compile the 3-D convolutional classifier.

    Layer order: Dropout(0.1) -> Conv3D(32 filters) -> softsign -> Dropout(0.1)
    -> BatchNormalization -> MaxPooling3D(2) -> Flatten -> Dense(16) -> softsign
    -> Dense(num_classes) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample (without the batch axis); the first two entries
        size the convolution kernel.
    num_classes : int
        Width of the final softmax layer.

    Returns
    -------
    keras.Sequential
        Model compiled with categorical cross-entropy, the module-level
        ``opt`` optimizer and the accuracy metric.
    """
    # Kernel is one smaller than the first two input extents.
    # NOTE(review): the third extent reuses input_shape[1], not input_shape[2]
    # — confirm this is intended.
    kernel = (input_shape[0] - 1, input_shape[1] - 1, input_shape[1] - 1)
    stack = [
        layers.Dropout(0.1, input_shape=input_shape),
        layers.Conv3D(32, kernel, input_shape=input_shape),
        layers.Activation(activation='softsign'),
        layers.Dropout(0.1),
        layers.BatchNormalization(),
        layers.MaxPooling3D(2),
        layers.Flatten(),
        layers.Dense(16),
        layers.Activation(activation='softsign'),
        layers.Dense(num_classes),
        layers.Activation(activation='softmax'),
    ]
    network = keras.Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=opt,
        metrics=['accuracy'],
    )
    return network

In [None]:
# Load the labelled sequences and shuffle the rows. The fixed seed makes the
# row order, and everything derived from it, reproducible across kernel restarts.
data = (
    pd.read_csv('../data/pro_nonpro.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Features are the raw sequences; labels are the Level column mapped to
# consecutive integers by the (kept) LabelEncoder.
le = LabelEncoder()
X = data['Seq']
y = pd.Series(le.fit_transform(data['Level']))

In [None]:
# Tokenise every sequence into k-mer lists for k = 1..10; these lists are the
# "sentences" the Word2Vec model is trained on below.
X_2d = get_2d_kmer(X, mnm=1, mxm=10)

In [None]:
# for _, seq in X_2d.iteritems():
#     for mer in seq[:-1]:
#         del mer[-(10-len(mer[0])):]

In [None]:
# Flatten to one sentence per (sequence, k) pair: each sentence is the k-mer
# list of one sequence for one k. Iterating the Series directly yields its
# values and avoids Series.iteritems(), which was removed in pandas 2.0.
sents = [kmers for per_k in X_2d for kmers in per_k]

In [None]:
# Train k-mer embeddings on all sentences. min_count=1 keeps every k-mer in
# the vocabulary, which the vectorize_* helpers rely on because they look up
# each k-mer with model.wv[...].
word2vec_model = models.Word2Vec(
    sentences=sents,
    min_count=1,
    window=10,
    workers=4
    )

# Persist the trained model next to the notebook; the next cell reloads it.
word2vec_model.save('word2vecmodel')

In [None]:
# Reload the embedding model saved above. Note that `model` is the Word2Vec
# model; the classifier built later is bound to `cnn`.
model = models.word2vec.Word2Vec.load('./word2vecmodel')

In [None]:
# Embed every sequence for k-mer sizes 1..10 using the reloaded Word2Vec model.
inpt_data = vectorize_2d(X, 1, 10, model)

In [None]:
# Inspect the tensor shape; everything after the first axis becomes the
# network input shape below.
inpt_data.shape

In [None]:
# Training configuration.
num_features = 100  # presumably the embedding width; not referenced in the visible cells — TODO confirm
num_classes = 2  # width of the one-hot labels and of the softmax layer
num_epochs = 150  # passed to cnn.fit
input_shape = inpt_data.shape[1:]  # per-sample shape (batch axis dropped)

In [None]:
# Display the integer-encoded labels.
y

In [None]:
# Show each k-mer-size slice of the first sample as a grayscale image
# (rows = window position, columns = embedding component). The number of
# slices is read from the data instead of being fixed at 10, and the trailing
# channel axis is dropped so imshow always receives a 2-D array.
first_sample = inpt_data[0]
for i in range(first_sample.shape[0]):
    plt.imshow(first_sample[i, :, :, 0], cmap='gray')
    plt.show()


In [None]:
# 80/20 split. The seed makes the split reproducible and stratifying on y keeps
# the class ratio the same in both parts.
X_train, X_test, y_train, y_test = tts(
    inpt_data, y, train_size=0.8, random_state=42, stratify=y
)
# One-hot targets for categorical cross-entropy. y_test itself stays as integer
# labels for the confusion matrix.
y_train = np_utils.to_categorical(y_train, num_classes)
y_test_keras = np_utils.to_categorical(y_test, num_classes)

In [None]:
# for train, test in kfold.split(X, y):

# Build the 3-D CNN for the embedded tensor shape.
cnn = cnn3d(input_shape, num_classes)

# Train; note that the held-out test split doubles as the validation set, so
# val_accuracy is measured on the same data used for the final evaluation.
history = cnn.fit(
    X_train, y_train,
    validation_data=(X_test, y_test_keras),
    epochs=num_epochs,
    batch_size=128,
    verbose=True,
)


In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Best validation accuracy reached in any epoch.
max(history.history['val_accuracy'])

In [None]:
# Predicted class = index of the largest softmax output for each test sample.
predictions = np.argmax(cnn.predict(X_test), axis=1)
true_labels = np.asarray(y_test)
print('CV: ')
# Pin `labels` so the matrix is always num_classes x num_classes, even if a
# class is missing from both the split and the predictions.
class_ids = list(range(num_classes))
cm = pd.DataFrame(
    confusion_matrix(true_labels, predictions, labels=class_ids),
    class_ids,
    class_ids,
)
# fmt='d' prints integer counts; the default '.2g' shows e.g. 676 as 6.8e+02.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='predicted', ylabel='true')
plt.show()


In [None]:
# Shape of the held-out split that was passed to cnn.predict above.
X_test.shape

In [None]:
# Re-score the whole file. NOTE: this is the same CSV the training data was
# read from, so the result includes rows the network was trained on.
test_data = pd.read_csv('../data/pro_nonpro.csv')
test = test_data.Seq
# Encode with the already-fitted encoder so the labels are class indices,
# directly comparable with the predicted indices below.
test_val = pd.Series(le.transform(test_data.Level))
# Use the same k-mer range (1..10) as the training tensor; any other range
# produces a tensor whose shape does not match the network's input shape.
inpt = vectorize_2d(test, 1, 10, model)
pred = cnn.predict(inpt)
# Index of the largest class score per row (first one wins on ties).
preds = np.argmax(pred, axis=1).tolist()

In [None]:
# One row of class scores per scored sequence.
pred.shape

In [None]:
# Fraction of matching predictions. The mean of the boolean comparison divides
# by the actual number of rows instead of a hard-coded 6764.
(preds == test_val).mean()

In [None]:
# Element-wise comparison of the predicted class indices with test_val.
preds==test_val