In [None]:
import pickle

import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional, LSTM, BatchNormalization
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten
from tensorflow.keras.layers import Conv1D

In [None]:
import tensorflow as tf

In [None]:
# Verbosity flag; no visible cell reads it (predict() passes verbose=0 directly).
VERBOSE = 1
# 1e-3 is the same float as the literal 10e-4.
OPTIMIZER = Adam(learning_rate=1e-3)
# Number of distinct token ids (rows of the embedding matrix); the encoded
# sample rows use ids 0..15. Presumably 4 x 4 base pairings -- TODO confirm.
VOCAB_SIZE = 16
# Width of each embedding vector.
EMBED_SIZE = 100
# Tokens per encoded sequence: [(L - kmer) / step] + 1
maxlen = 23

In [None]:
def loadGlove(inputpath, outputpath=""):
    """Load comma-separated embedding vectors into a dict keyed by token id.

    Every non-blank line of ``inputpath`` has the form ``<id>,<v1>,<v2>,...``.
    The id may be written as a float (e.g. ``3.0``); it is normalised to the
    integer string ``"3"`` before being used as the dictionary key.

    Args:
        inputpath: path of the comma-separated file to read.
        outputpath: optional path. When non-empty, every row is written there
            with fields separated by single spaces, one row per line, using
            the normalised id as the first field.

    Returns:
        dict mapping the id string to a 1-D ``float32`` numpy array.

    Raises:
        ValueError: if a field on a non-blank line is not numeric.
    """
    data_list = []
    wordEmb = {}
    with open(inputpath) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing on float('').
                continue
            ll = line.split(',')
            key = str(int(float(ll[0])))
            ll[0] = key
            data_list.append(ll)
            wordEmb[key] = np.array([float(v) for v in ll[1:]], dtype="float32")

    if outputpath != "":
        # Must be opened for writing: the default mode is read-only, which made
        # every call with an output path fail on the first write.
        with open(outputpath, "w") as f:
            # One row per line; without the newline all rows run together.
            f.writelines(' '.join(data) + '\n' for data in data_list)
    return wordEmb

In [None]:
def CnnCrispr(model_ini):
    """Append the CnnCrispr layer stack to ``model_ini``.

    Layers are added in place, in this order: a bidirectional LSTM (40 units,
    returning sequences) followed by ReLU; five Conv1D stages with kernel
    size 5 and 10/20/40/80/100 filters, each followed by ReLU and batch
    normalisation; Flatten; Dropout(0.3); Dense(20) with ReLU; Dense(2) with
    softmax.

    Args:
        model_ini: object exposing ``add(layer)``; it is mutated and returned.

    Returns:
        ``(model, model_message)`` where ``model`` is ``model_ini`` itself and
        ``model_message`` is a fixed text summary of the architecture.
    """
    print("model1 loaded with 1 biLSTM, 5 conv and 2 dense")
    model_message = "Dropout 0.3,biLSTM.40, Conv1D.[10,20,40,80,100],  Dense[20,2], BatchNormalization,Activition='relu'"
    model = model_ini
    add = model.add

    # Recurrent front end.
    add(Bidirectional(LSTM(40, return_sequences=True)))
    add(Activation('relu'))

    # Convolutional tower: the stages differ only in filter count.
    for n_filters in (10, 20, 40, 80, 100):
        add(Conv1D(n_filters, 5))
        add(Activation('relu'))
        add(BatchNormalization())

    # Classifier head.
    add(Flatten())
    add(Dropout(0.3))
    add(Dense(20))
    add(Activation('relu'))

    add(Dense(2))
    add(Activation('softmax'))
    return model, model_message

In [None]:
!wget https://raw.githubusercontent.com/LQYoLH/CnnCrispr/master/CnnCrispr_final/Encoded_data/Class/keras_GloVeVec_5_100_10000.csv

--2022-09-23 22:31:56--  https://raw.githubusercontent.com/LQYoLH/CnnCrispr/master/CnnCrispr_final/Encoded_data/Class/keras_GloVeVec_5_100_10000.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31381 (31K) [text/plain]
Saving to: ‘keras_GloVeVec_5_100_10000.csv.2’


2022-09-23 22:31:56 (19.4 MB/s) - ‘keras_GloVeVec_5_100_10000.csv.2’ saved [31381/31381]



In [None]:
# Pretrained vectors, keyed by token id as a string.
glove_inputpath = "keras_GloVeVec_5_100_10000.csv"
model_glove = loadGlove(glove_inputpath)

# Row i of the initial embedding matrix is the vector stored under key str(i).
embedding_weights = np.zeros((VOCAB_SIZE, EMBED_SIZE))
for i in range(VOCAB_SIZE):
    embedding_weights[i] = model_glove[str(i)]

print("Building models")
model = Sequential()
embedding_layer = Embedding(
    VOCAB_SIZE,
    EMBED_SIZE,
    input_length=maxlen,
    weights=[embedding_weights],
    trainable=True,
)
model.add(embedding_layer)
# CnnCrispr() appends the remaining layers to the same model object.
model, model_message = CnnCrispr(model)

print("Loading weights for the models")
model.load_weights("CnnCrispr_weights.h5")

Building models
model1 loaded with 1 biLSTM, 5 conv and 2 dense
Loading weights for the models


In [None]:
def test_data_read(lines):
    data_n = len(lines)
    data_list = []
    seq_list = []

    for l in range(data_n):
        data = lines[l].split(",")
        seq_item = data[:2]
        print(seq_item)
        data_item = [int(i) for i in data[3:]]
        print(data_item)
        data_list.append(data_item)
        seq_list.append(seq_item)

    return data_list,seq_list

In [None]:
# Sample rows. Layout per row: two sequence strings (presumably on-target and
# off-target -- confirm), one field that test_data_read() skips, then the
# integer-encoded tokens.
data = [
    "GCCTCTTTCCCACCCACCTTGGG,GTCTCTTTCCCAGCGACCTGGGG,0.0,10,7,5,15,5,15,15,15,5,5,5,0,6,5,6,0,5,5,15,14,10,10,10",
    "GACTTGTTTTCATTGTTCTCAGG,GAGTCATTTTCATTGTCTTCATG,0.0,10,0,6,15,13,8,15,15,15,15,5,0,15,15,10,15,13,7,15,5,0,11,10",
    "GGTGAGTGAGTGTGTGCGTGTGG,TGTGAGTGTGTGTGTGTGTGTGT,0.0,11,10,15,10,0,10,15,10,3,10,15,10,15,10,15,10,7,10,15,10,15,10,11",
    "GGTGAGTGAGTGTGTGCGTGTGG,TGTGTGTTCGTGTGTGCGTGTGT,0.0,11,10,15,10,3,10,15,11,1,10,15,10,15,10,15,10,5,10,15,10,15,10,11",
    "GCCTCCCCAAAGCCTGGCCAGGG,GCTTCTCCAAAGCCTTCAGAGGG,0.0,10,5,7,15,5,7,5,5,0,0,0,10,5,5,15,11,9,4,6,0,10,10,10",
]

encoded_rows, sequence_pairs = test_data_read(data)
# Model input as an integer array; sequence pairs as a string tensor.
X_test = np.array(encoded_rows)
seq_list = tf.convert_to_tensor(sequence_pairs)

['GCCTCTTTCCCACCCACCTTGGG', 'GTCTCTTTCCCAGCGACCTGGGG']
[10, 7, 5, 15, 5, 15, 15, 15, 5, 5, 5, 0, 6, 5, 6, 0, 5, 5, 15, 14, 10, 10, 10]
['GACTTGTTTTCATTGTTCTCAGG', 'GAGTCATTTTCATTGTCTTCATG']
[10, 0, 6, 15, 13, 8, 15, 15, 15, 15, 5, 0, 15, 15, 10, 15, 13, 7, 15, 5, 0, 11, 10]
['GGTGAGTGAGTGTGTGCGTGTGG', 'TGTGAGTGTGTGTGTGTGTGTGT']
[11, 10, 15, 10, 0, 10, 15, 10, 3, 10, 15, 10, 15, 10, 15, 10, 7, 10, 15, 10, 15, 10, 11]
['GGTGAGTGAGTGTGTGCGTGTGG', 'TGTGTGTTCGTGTGTGCGTGTGT']
[11, 10, 15, 10, 3, 10, 15, 11, 1, 10, 15, 10, 15, 10, 15, 10, 5, 10, 15, 10, 15, 10, 11]
['GCCTCCCCAAAGCCTGGCCAGGG', 'GCTTCTCCAAAGCCTTCAGAGGG']
[10, 5, 7, 15, 5, 7, 5, 5, 0, 0, 0, 10, 5, 5, 15, 11, 9, 4, 6, 0, 10, 10, 10]


In [None]:
# Inference only. Each output row is the softmax over the model's two output
# units; which column is the positive class is not stated here -- TODO confirm.
CnnCrispr_SCORE = model.predict(X_test, batch_size=50, verbose=0)

In [None]:
# One row per input sample, two softmax scores per row.
print(CnnCrispr_SCORE)

[[9.8582643e-01 1.4173619e-02]
 [1.0000000e+00 3.8002670e-23]
 [1.0000000e+00 8.5072326e-19]
 [1.0000000e+00 8.3848136e-24]
 [1.0000000e+00 1.7815443e-22]]


In [None]:
# Echo the stored sequence pair for every input row.
for row_index, _row in enumerate(data):
    print(seq_list[row_index])

tf.Tensor([b'GCCTCTTTCCCACCCACCTTGGG' b'GTCTCTTTCCCAGCGACCTGGGG'], shape=(2,), dtype=string)
tf.Tensor([b'GACTTGTTTTCATTGTTCTCAGG' b'GAGTCATTTTCATTGTCTTCATG'], shape=(2,), dtype=string)
tf.Tensor([b'GGTGAGTGAGTGTGTGCGTGTGG' b'TGTGAGTGTGTGTGTGTGTGTGT'], shape=(2,), dtype=string)
tf.Tensor([b'GGTGAGTGAGTGTGTGCGTGTGG' b'TGTGTGTTCGTGTGTGCGTGTGT'], shape=(2,), dtype=string)
tf.Tensor([b'GCCTCCCCAAAGCCTGGCCAGGG' b'GCTTCTCCAAAGCCTTCAGAGGG'], shape=(2,), dtype=string)


In [None]:
# Executes the external script cnn_crisper.py through IPython's %run magic.
# The script is not part of this notebook, so what it does cannot be checked from here.
%run cnn_crisper.py

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
146359      |   TGACATCAAGTTTAAGCCCTCGG                 |          0.0%
146360      |   AACACCAGTGAGTAGAGCGGAGG                 |          100.0%
146361      |   GACACAATTGAGTAGAGCATATG                 |          0.0%
146362      |   GGGTGGGGGGAGTTTGCTCCTGG                 |          100.0%
146363      |   GGGTGGGGGGAATGGCCTCCGAG                 |          0.0%
146364      |   GGTCCTGCCGCTGCTTGTCATGG                 |          100.0%
146365      |   GGTCCTGCCTCAGCCTCTCACTG                 |          0.0%
146366      |   AAATGAGAAGAAGAGGCACAGGG                 |          100.0%
146367      |   GACTGTGAAGAAGAGGCACAGTC                 |          0.0%
146368      |   AAATGAGAAGAAGAGGCACAGGG                 |          100.0%
146369      |   AGTTGAGGAACTGAGGCACAGGG                 |          0.0%
146370      |   TGACATCAATTATTATACATCGG                 |          100.0%
146371      |   TGAGGCCAATTATTAGACTTCAG                 |  

In [None]:
vim 'pam_scores.pkl' -c "set ff=unix" -c ":wq"

SyntaxError: ignored

In [None]:
# Executes the external script cfd-score-calculator.py through IPython's %run magic.
# The recorded run ended with an exception; its cause is not visible from this notebook.
%run cfd-score-calculator.py

Exception: ignored

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files from a trusted source.
# The with-block closes the file handle even when unpickling raises.
with open(r'/content/pam_scores.pkl', 'rb') as pam_file:
    pam_scores = pickle.load(pam_file)

UnpicklingError: ignored

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files from a trusted source.
# The with-block closes the file handle even when unpickling raises.
with open(r'mismatch_score.pkl', 'rb') as mm_file:
    mm_scores = pickle.load(mm_file)

UnpicklingError: ignored

In [None]:
# Byte sequences for the two line-ending conventions.
WINDOWS_LINE_ENDING = b'\r\n'
UNIX_LINE_ENDING = b'\n'

# File rewritten in place (relative or absolute path).
file_path = r"/content/pam_scores.pkl"

# Read everything as bytes and convert Windows -> Unix line endings.
# (Swap the two replace() arguments for the Unix -> Windows direction.)
with open(file_path, 'rb') as open_file:
    content = open_file.read().replace(WINDOWS_LINE_ENDING, UNIX_LINE_ENDING)

# Overwrite the original file with the converted bytes.
with open(file_path, 'wb') as open_file:
    open_file.write(content)

In [None]:
# Convert DOS line endings (CRLF) to Unix (LF) in the mismatch-score pickle, in place.
import sys  # not used below; retained in case other cells rely on the name


def dos2unix(input_path, output_path):
    """Copy ``input_path`` to ``output_path`` with every CRLF replaced by LF.

    Only the two-byte sequence ``\\r\\n`` is rewritten; all other bytes,
    including a lone ``\\r`` or a missing final newline, are left untouched
    so that non-text payload is not altered.

    Args:
        input_path: file to read (as bytes).
        output_path: file to write; may be the same path, because the input
            is read completely before the output is opened.

    Returns:
        Number of bytes removed, i.e. the number of CRLF pairs found.
    """
    with open(input_path, 'rb') as infile:
        content = infile.read()
    converted = content.replace(b'\r\n', b'\n')
    with open(output_path, 'wb') as output:
        output.write(converted)
    return len(content) - len(converted)


# Inside a notebook kernel sys.argv holds the kernel's own launch arguments
# (the recorded run passed the two-argument check with no user input), so
# writing to sys.argv[2] targets a file unrelated to the pickle. The paths are
# therefore given explicitly.
mismatch_path = '/content/mismatch_score.pkl'
saved = dos2unix(mismatch_path, mismatch_path)

print("Done. Saved %s bytes." % saved)

Done. Saved 720 bytes.
