In [1]:
import sys
# Extend the module search path with an extra site-packages directory.
# NOTE(review): this is a hard-coded, machine-specific absolute path (a
# Python 2.7 Anaconda install under one user's home directory). It is unlikely
# to exist on another machine; prefer installing the required packages into
# the kernel's own environment instead of patching sys.path.
sys.path.append('/Users/dmitrys/anaconda2/lib/python2.7/site-packages')

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf

import os
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from gensim.models import word2vec # using tensorflow backend'
import logging
#import xgboost as xgb
from sklearn.metrics import accuracy_score

In [None]:
# Read the tab-separated sequence table and preview its first rows.
seq_df = pd.read_csv('data/family_classification_sequences.tab', sep='\t')
seq_df.head()

In [None]:
def getTrigrams(sequence, n=3):
    """Split a sequence into non-overlapping n-grams for every reading frame.

    The sequence is cut into consecutive chunks of length ``n`` starting at
    offset 0, then again starting at offset 1, and so on up to offset
    ``n - 1``. Chunks shorter than ``n`` (the tail of a frame) are dropped.

    Parameters
    ----------
    sequence : str (or any sliceable object supporting ``len``)
        The sequence to tokenise.
    n : int, default 3
        Chunk length; the default reproduces the original trigram behaviour.

    Returns
    -------
    list
        All full-length chunks: frame 0 first, then frame 1, ..., each frame
        in left-to-right order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    # Any chunk starting at or after this position would be shorter than n.
    last_start = len(sequence) - n + 1
    return [
        sequence[start:start + n]
        for offset in range(n)
        for start in range(offset, last_start, n)
    ]

In [None]:
# Tokenise the first 10000 sequences; each sequence becomes one list of trigrams.
all_trigrams = list(map(getTrigrams, seq_df.Sequences[0:10000]))

In [None]:
# Emit gensim's training progress through the standard logging module.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

skip_gram = 1  # skip-gram = 1, cbow = 0
num_features = 30  # Word vector dimensionality
context = 25  # Context window size
num_workers = 1
# `context` must be passed explicitly as `window`; otherwise the value above is
# ignored and gensim silently trains with its default window size.
model = word2vec.Word2Vec(
    all_trigrams,
    sg=skip_gram,
    negative=5,
    workers=num_workers,
    size=num_features,
    window=context,
)

In [None]:
# Vocabulary of the trained model; iterating it yields the trigram keys.
vocab = model.wv.vocab
# One embedding row per vocabulary key, in vocabulary iteration order.
# A plain ndarray is built instead of np.matrix: np.matrix is no longer
# recommended by NumPy and is rejected by recent scikit-learn estimators.
final_embed_matrix = np.array([model.wv[key] for key in vocab])

In [None]:
# Project the trigram embeddings to 2-D for plotting. t-SNE is stochastic, so
# the seed is pinned to make the layout reproducible across kernel restarts.
tsne = TSNE(n_components=2, random_state=7)
X = tsne.fit_transform(final_embed_matrix)

In [None]:
# Wrap the 2-D t-SNE coordinates in a frame. The coordinates live in `X`
# (the previous `XX` was never defined and raised NameError).
tsne_df = pd.DataFrame(X, columns=['x0', 'x1'])
# Materialise the keys as a list: a dict view is not a reliable column value
# across pandas/Python versions. The order matches the rows of the embedding
# matrix because both iterate the same vocabulary mapping.
tsne_df['codone'] = list(model.wv.vocab.keys())
tsne_df.head()

In [None]:
# Scatter of the 2-D t-SNE coordinates without any colouring.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_title('unlabeled encoding', fontsize=20)
ax.scatter(tsne_df['x0'], tsne_df['x1'], s=10)
plt.show()

In [None]:
# Load the trigram property table, using the first CSV column as the row index,
# and preview it.
properties = pd.read_csv(filepath_or_buffer='trigrams_properties.csv', index_col=0)
properties.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale the four numeric property columns and write them back in place.
scaled_columns = ['hydrophobicity', 'mass', 'number_of_atoms', 'volume']
scaler = MinMaxScaler()
properties[scaled_columns] = scaler.fit_transform(properties[scaled_columns])

In [None]:
import matplotlib.cm as cm

# 2x2 grid: the same t-SNE scatter, coloured by one scaled property per panel.
fig, axes = plt.subplots(2, 2, figsize=(25, 20))
for ax, p in zip(axes.ravel(), ['hydrophobicity', 'mass', 'number_of_atoms', 'volume']):
    ax.set_title(p, fontsize=25)
    # NOTE(review): colours are paired with points purely by row position;
    # this presumes `properties` rows are in the same order (and number) as
    # `tsne_df` rows - confirm against the CSV.
    ax.scatter(tsne_df.x0, tsne_df.x1, s=10, c=cm.jet(properties[p]))

plt.show()

In [None]:
# Precomputed 2-D embedding read from disk; its columns "0" and "1" hold the coordinates.
nice_embed_tsne = pd.read_csv("data/nice_embed_tsne.csv")

import matplotlib.cm as cm

# Same 2x2 property-coloured layout as for the freshly trained embedding.
property_names = ['hydrophobicity', 'mass', 'number_of_atoms', 'volume']
fig = plt.figure(figsize=(25, 20))
for panel, p in enumerate(property_names, start=1):
    ax = fig.add_subplot(2, 2, panel)
    ax.set_title(p, fontsize=25)
    ax.scatter(nice_embed_tsne["0"], nice_embed_tsne["1"], s=10, c=cm.jet(properties[p]))

plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Read the tab-separated family metadata table and preview its first rows.
families = pd.read_csv("data/family_classification_metadata.tab", delimiter="\t")
families.head()

In [None]:
# Binary label: 1 for the target family, 0 for every other family. Assigned in
# a single step; the previous chained `families["y"][mask] = 1` triggers
# SettingWithCopyWarning and may silently write to a temporary copy.
families["y"] = families.FamilyDescription.isin(['50S ribosome-binding GTPase']).astype('int64')

# Down-sample the negatives; the seed is pinned so re-runs draw the same rows.
subset_neg = families[families.y == 0].sample(n=4000, random_state=7)
subset_pos = families[families.y == 1]

target = pd.concat([subset_neg.y, subset_pos.y])
# Select the sequences in exactly the order of `target`. Filtering with
# `seq_df.index.isin(...)` kept seq_df's own row order, so the features later
# derived from `trigrams` were paired with the wrong labels.
# Like the original, this relies on seq_df and families sharing row labels;
# .loc raises KeyError if a sampled label is missing from seq_df.
sequence_sample = seq_df.loc[target.index]
trigrams = [getTrigrams(sequence) for sequence in sequence_sample.Sequences]

In [None]:
# Represent every sequence by the mean embedding of its in-vocabulary trigrams.
# `trigrams` is left untouched (no in-place replacement/removal while
# iterating), so this cell can be re-run safely.
embedding_dim = model.vector_size
ready = [0]*len(trigrams)

for i, seq in enumerate(trigrams):
    # Skip trigrams the model never saw instead of catching lookup errors.
    vectors = [model.wv[trig] for trig in seq if trig in model.wv]
    if vectors:
        # axis=0 averages across trigrams, giving one embedding_dim-long vector
        # (axis=1 would instead give one scalar per trigram).
        ready[i] = np.mean(vectors, axis=0)
    else:
        # No known trigram at all: fall back to the zero vector.
        ready[i] = np.zeros(embedding_dim)

In [None]:
from sklearn.model_selection import train_test_split

# One feature row per sampled sequence, labelled with the index of `target`.
X = pd.DataFrame(ready, index=target.index)
# Keep exactly the embedding dimensions. Positional slicing is used because
# `.loc[:, 0:30]` is label-based and end-inclusive, i.e. it selects 31 columns.
X = X.iloc[:, :model.vector_size]
# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.3, random_state=7)

In [None]:
# xgboost is imported here because the notebook's top-level import of it is
# commented out, which left `xgb` undefined in this cell (NameError).
import xgboost as xgb

# Fit a default gradient-boosted classifier and report hold-out accuracy.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, log_loss,auc
def plot_roc_curve(predictions, yval):
    """Plot the ROC curve for binary-classifier scores and show it.

    Parameters
    ----------
    predictions : array-like
        Either 1-D scores, or a 2-D array with at least two columns, in which
        case column 1 is used as the score.
    yval : array-like
        True labels, passed to ``roc_curve`` as its first argument.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Decide by shape rather than by catching whatever exception indexing
    # happens to raise (the previous bare `except` also hid unrelated errors).
    scores = np.asarray(predictions)
    if scores.ndim == 2 and scores.shape[1] > 1:
        preds = scores[:, 1]
    else:
        preds = scores
    fpr, tpr, _ = roc_curve(yval, preds)
    roc_auc = auc(fpr, tpr)
    # White tick/label colours are applied only while this figure is drawn.
    with plt.rc_context({'xtick.color':'white', 'ytick.color':'white',
                     'axes.labelcolor':'white'}):
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        # Diagonal reference line from (0, 0) to (1, 1).
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.grid(True)
        plt.show()