In [1]:
# Core data / numeric stack used throughout the notebook.
import pandas as pd
import numpy as np
# Seed NumPy's global RNG once, up front, so NumPy-driven randomness is repeatable.
np.random.seed(0)
import matplotlib.pyplot as plt
# scikit-learn helpers: data splitting, scoring and grid search.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from pandas / Keras — consider narrowing the filter.
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the protein sequence table; the CSV's saved 'Unnamed: 0' column is used as the index.
sequences = pd.read_csv(
    '../storage/seq_data.csv',
    index_col='Unnamed: 0',
)

In [3]:
# Preview the first rows of the sequence table (columns: target_id, Sequence).
sequences.head()

Unnamed: 0,target_id,Sequence
0,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
1,P78368,MDFDKKGGKGETEEGRRMSKAGGGRSSHGIRSSGTSSGVLMVGPNF...
2,Q9H2K8,MRKGVLKDPEIADLFYKDDPEELFIGLHEIGHGSFGAVYFATNAHT...
3,P49336,MDYDFKVKLSSERERVEDLFEYEGCKVGRGTYGHVYKAKRKDGKDD...
4,Q6DT37,MERRLRALEQLARGEAGGCPGLDGLLDLLLALHHELSSGPLRRERS...


# SGT embeddings

In [None]:
# Break each sequence string into a list of single characters (one entry per residue letter).
seqlist = list(map(list, sequences['Sequence'].values))

In [None]:
# Sanity check: show the character list built for the first sequence.
print(seqlist[0])

In [None]:
from sgt import Sgt

# Build the SGT transformer with kappa=10 and lengthsensitive turned off, then
# fit it on the character lists and produce one embedding per sequence.
sgt_params = {'kappa': 10, 'lengthsensitive': False}
sgt = Sgt(**sgt_params)
embedding = sgt.fit_transform(corpus=seqlist)

In [None]:
# Number of entries returned by the SGT transform.
len(embedding)

In [None]:
# Print the length of the first ten entries of `embedding` to check they are uniform.
# NOTE(review): assumes `embedding` has at least 10 entries and is indexable by 0..9 — confirm.
for i in range(10):
    print(len(embedding[i]))
    

In [None]:
# Wrap the SGT embedding in a DataFrame so it can be joined to the sequence table.
encode = pd.DataFrame(embedding)

In [None]:
# Preview the embedding DataFrame.
encode.head()

In [None]:
# Place the embedding columns next to the sequence table; rows are aligned on the index.
protdata = pd.concat((sequences, encode), axis='columns')
protdata.head()

In [None]:
# Remove the raw sequence text, keeping target_id plus the embedding columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: the in-place version raised KeyError on a
# second execution because 'Sequence' had already been dropped.
protdata = protdata.drop(columns='Sequence', errors='ignore')

In [None]:
# Confirm the 'Sequence' column is gone.
protdata.head()

In [None]:
# Persist the SGT embedding table to 'sgt_embeds.csv' (relative to the working directory).
protdata.to_csv('sgt_embeds.csv')

## TAPE embedding

In [None]:
# Load the pretrained TAPE BERT model ('bert-base') and the matching 'iupac' tokenizer.
# NOTE(review): the same four lines are repeated in a later cell; one copy is enough.
import torch
from tape import ProteinBertModel, TAPETokenizer
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')

In [None]:
# Reload the sequence table and pull the 'Sequence' column out as a plain list of strings.
# NOTE(review): this reads 'seq_data.csv' from the working directory, whereas the
# first load cell reads '../storage/seq_data.csv' — confirm which path is intended.
sequences = pd.read_csv('seq_data.csv', index_col='Unnamed: 0')
seqlist = sequences['Sequence'].tolist()
(len(seqlist), sequences.shape)

In [None]:
# Take the sequence stored under index label 0 as a sample peptide.
peptide = sequences.Sequence[0]

In [None]:
# Length (number of characters) of the sample peptide.
len(peptide)

In [None]:

# Build a predictor with one variable-length sequence input named "peptide"
# (embedding encoding, two conv layers with filter size 9) and a single
# sigmoid output "y".
# NOTE(review): Predictor, SequenceInput and Output are not imported anywhere in
# the visible notebook, so this cell raises NameError on a fresh kernel.
# They are presumably from the `pepnet` package — confirm and add the import.
predictor = Predictor(
    inputs=[SequenceInput(
        length=len(peptide), name="peptide", encoding="embedding", variable_length=True,
        conv_filter_sizes=[9],
        repeat_conv_layers=2)
    ],
    outputs=[Output(name="y", dim=1, activation="sigmoid")])

## Tokenizer embedding

In [None]:
# Longest sequence in the dataset (used to choose the padding length).
max(map(len, seqlist))

In [None]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Width every tokenised sequence is padded to.
max_length = 2549

# Character-level tokenizer: each distinct character in the sequences gets an integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(seqlist)

# Convert sequences to id lists and pad them to a fixed-width integer matrix.
X = sequence.pad_sequences(
    tokenizer.texts_to_sequences(seqlist),
    maxlen=max_length,
)

In [None]:
# Shape of the padded token-id matrix.
X.shape

In [None]:
# Wrap the padded token-id matrix in a DataFrame (one column per position).
token = pd.DataFrame(X)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from keras.layers import LSTM
# Import Embedding from the public `keras.layers` namespace; the private
# `keras.layers.embeddings` module path is not available in newer Keras releases.
from keras.layers import Embedding

embedding_dim = 400
top_classes = 1  # single regression output

# Vocabulary size is the number of tokenizer ids plus one, because id 0 is
# reserved for padding.
vocab_size = len(tokenizer.word_index) + 1

# create the model: embedding -> two conv/pool stages -> dense head.
# input_length must equal the padded sequence width (`max_length`); the
# previous hard-coded 3573 did not match the 2549-wide padded input.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(top_classes, kernel_initializer='normal'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Attach the token-id columns to the sequence table; rows are aligned on the index.
tokendata = pd.concat((sequences, token), axis='columns')
tokendata.head()

In [None]:
# Remove the raw sequence text, keeping target_id plus the token-id columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: the in-place version raised KeyError on a
# second execution because 'Sequence' had already been dropped.
tokendata = tokendata.drop(columns='Sequence', errors='ignore')

In [None]:
# Join the training rows to their target's token features via 'target_id'.
# NOTE(review): `train` is not defined in the visible part of the notebook —
# confirm it is loaded before this cell on a fresh kernel.
train_token = pd.merge(train, tokendata, on='target_id')
train_token.shape

In [None]:
# Join the test rows to their target's token features via 'target_id'.
# NOTE(review): `test` is not defined in the visible part of the notebook —
# confirm it is loaded before this cell on a fresh kernel.
test_token = pd.merge(test, tokendata, on='target_id')
test_token.shape

In [None]:
# Preview the merged training table.
train_token.head()

In [None]:
# Target: the pchembl_value column. Features: everything except the two id
# columns and the target itself.
y = train_token['pchembl_value']
X = train_token.drop(columns=['compound_id', 'target_id', 'pchembl_value'])

In [None]:
# Shape of the feature matrix fed to the model.
X.shape

In [None]:
# Hold out 20% of the rows for validation, then train for 4 epochs in batches of 100.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=4,
    batch_size=100,
)

# Using the PyTorch approach



In [4]:
# Load the pretrained TAPE BERT model and its tokenizer.
# NOTE(review): this rebinds `model` and `tokenizer`, which earlier cells use for
# the Keras network and the Keras character tokenizer — re-running those cells
# afterwards will pick up the wrong objects.
import torch
from tape import ProteinBertModel, TAPETokenizer
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')  # iupac is the vocab for TAPE models, use unirep for the UniRep model

In [5]:
# Embedding helper for a single protein sequence.
def tape(sequence, bert_model=None, bert_tokenizer=None):
    """Return the pooled model output for one protein sequence.

    Parameters
    ----------
    sequence : str
        Sequence text handed to the tokenizer's ``encode`` method.
    bert_model : callable, optional
        Model to run on the token ids. Defaults to the notebook-level ``model``.
    bert_tokenizer : object, optional
        Tokenizer exposing ``encode``. Defaults to the notebook-level
        ``tokenizer``.

    Returns
    -------
    torch.Tensor
        The second element of the model output (the pooled output), computed
        without gradient tracking.
    """
    encoder = model if bert_model is None else bert_model
    vocab = tokenizer if bert_tokenizer is None else bert_tokenizer

    # The model takes a batch, so wrap the single encoded sequence in a list.
    token_ids = torch.tensor([vocab.encode(sequence)])

    # Inference only: without no_grad() every returned tensor keeps its whole
    # autograd graph alive, which inflates memory when many embeddings are
    # collected in a list.
    # NOTE(review): assumes the model is already in eval mode — confirm.
    with torch.no_grad():
        output = encoder(token_ids)

    pooled_output = output[1]
    return pooled_output

In [6]:
# Plain list of sequence strings, plus a quick size check against the table.
seqlist = list(sequences['Sequence'])
(len(seqlist), sequences.shape)

(204, (204, 2))

In [7]:
import time

In [None]:
# Embed the sequences one by one with `tape`, checkpointing to disk as we go.

# NOTE(review): sequences before index 11 are skipped and `pooled` starts empty,
# so their embeddings are absent from this run's checkpoint file — presumably
# they were handled in an earlier run; confirm.
START_INDEX = 11
CHECKPOINT_EVERY = 5        # write a checkpoint whenever (index + 1) is a multiple of this
# NOTE(review): the reason for the pause is not evident from the notebook;
# presumably it throttles the machine — confirm it is still needed.
COOLDOWN_SECONDS = 120
CHECKPOINT_PATH = 'pool.txt'


def _write_pool_checkpoint(items, path):
    """Overwrite `path` with one line per item, using each item's string form."""
    with open(path, 'w') as f:
        for item in items:
            f.write("%s\n" % item)


pooled = []
for ind in range(START_INDEX, len(seqlist)):
    pooled.append(tape(seqlist[ind]))
    print(f'done with {ind} indexed sequence')
    if (ind + 1) % CHECKPOINT_EVERY == 0:
        _write_pool_checkpoint(pooled, CHECKPOINT_PATH)
        time.sleep(COOLDOWN_SECONDS)

# Final flush: without it, any embeddings computed after the last periodic
# checkpoint (e.g. indices 200-203 of 204 sequences) were never written.
_write_pool_checkpoint(pooled, CHECKPOINT_PATH)

done with 11 indexed sequence
done with 12 indexed sequence
done with 13 indexed sequence
done with 14 indexed sequence
done with 15 indexed sequence
done with 16 indexed sequence
done with 17 indexed sequence
