In [307]:
#Import libraries required
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [308]:
#Load up guide RNA sequences and target DNA sequences as pandas dataframe
df_guide_RNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\full_guide_RNA_sequences.csv")
df_target_DNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\target_DNA_sequences.csv")
df_target_DNA_reverse_complements = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\target_DNA_sequences_reverse_complements.csv")


In [309]:
# Load k_value_sigmoid from experiment on 24_05_2022 
from numpy import genfromtxt
k_values = genfromtxt('k_values_sigmoid.csv', delimiter=',')

In [310]:
def one_hot_encode_RNA(gRNA_sequence):
    mapping = dict(zip("aucg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in gRNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [311]:
 # One hot encode guide RNA sequences
guide_RNA_sequences = df_guide_RNA['Sequence']
guide_RNA_one_hot_encoded_sequences = []
for i in guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence.ravel())

#generate feature names
feature_names_gRNA = []
for i in range(41):
    feature_names_gRNA.append('guide RNA A' + str(i+1))
    feature_names_gRNA.append('guide RNA U' + str(i+1))
    feature_names_gRNA.append('guide RNA C' + str(i+1))
    feature_names_gRNA.append('guide RNA G' + str(i+1))

In [312]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
from scipy import stats

def train_dt(input_features, outputs, input_feature_names):
    '''
    This function takes in input features and outputs as kvalue for an experiment. 
    It then trains a decision tree on these features.
    
    '''
    input_train, input_test, output_train, output_test = train_test_split(
        input_features, outputs, test_size=0.2, random_state=42)
    model = tree.DecisionTreeRegressor(max_depth = 10, random_state = 42)
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    pearson_rank_results = stats.pearsonr(output_test, output_predictions)
    return(model_importance,mse, pearson_rank_results)
    
    
    


In [321]:
dt_gRNA_model_importance = []
dt_gRNA_mse =[]
dt_gRNA_model_importance, dt_gRNA_mse, dt_gRNA_pearson_results = train_dt(guide_RNA_one_hot_encoded_sequences,k_values,feature_names_gRNA)
print(dt_gRNA_model_importance.sort_values(by = 'Importance', ascending = False))
print(dt_gRNA_mse)
print(dt_gRNA_pearson_results)

     Feature_names  Importance
140  guide RNA A36    0.426137
87   guide RNA G22    0.347384
102  guide RNA C26    0.044643
148  guide RNA A38    0.017398
139  guide RNA G35    0.014933
..             ...         ...
55   guide RNA G14    0.000000
56   guide RNA A15    0.000000
57   guide RNA U15    0.000000
58   guide RNA C15    0.000000
163  guide RNA G41    0.000000

[164 rows x 2 columns]
0.03167370134062151
(0.42750606663950685, 0.02937268997360803)


In [320]:
#Try 8 bit vectors where the first 4 bits represent the guide RNA and the second 4 bits represent the target DNA. Use only the 20bp matching region to begin with.
#Take the last twenty bases of the guide RNA sequences
complementary_guide_RNA_sequences = []
for i in guide_RNA_sequences:
    complementary_guide_RNA_sequence = i[21:41]
    complementary_guide_RNA_sequences.append(complementary_guide_RNA_sequence)

#One hot encode them
complementary_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    complementary_guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)


#Take the middle 20 of the target complements and reorder back to fron so they become complementary to guide RNA sequence
target_DNA_complements = []
complementary_target_DNA_sequences = []
for i in target_DNA_sequences:
    complementary_target_DNA_sequence = i[39:19:-1] #This generates the complement to the guide
    complementary_target_DNA_sequences.append(complementary_target_DNA_sequence)
# print(complementary_target_DNA_sequences[0])    

complementary_target_DNA_one_hot_encoded_sequences = []
for i in complementary_target_DNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_DNA(i)
    complementary_target_DNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)
# print(complementary_target_DNA_one_hot_encoded_sequences[0])  

#Concatenate the one hot encodes sequences
concat_complementary_guide_RNAs_target_DNAs = []
# test with 1st element 
for i in range(len(complementary_guide_RNA_one_hot_encoded_sequences)):
    # print(i)
    concat_complementary_guide_RNA_target_DNA = np.hstack((complementary_guide_RNA_one_hot_encoded_sequences[i], complementary_target_DNA_one_hot_encoded_sequences[i])).ravel()
    concat_complementary_guide_RNAs_target_DNAs.append(concat_complementary_guide_RNA_target_DNA)

#Set up feature names 
feature_names_concat_guide_RNA_target_DNA = []
for i in range(20):
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA A' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA U' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA C' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA G' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA A' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA T' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA C' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA G' + str(i+1))
    
dt_concat_importance=[]
dt_concat_mse = []
dt_concat_pearson_results = []
dt_concat_importance, dt_concat_mse, dt_concat_pearson_results = train_dt(concat_complementary_guide_RNAs_target_DNAs, k_values, feature_names_concat_guide_RNA_target_DNA)
print(dt_concat_importance.sort_values(by = 'Importance', ascending = False))
print(dt_concat_mse)
print(dt_concat_pearson_results)


      Feature_names  Importance
108  target DNA A14    0.421864
3      guide RNA G1    0.347387
39    target DNA G5    0.044643
128   guide RNA A17    0.017398
107   guide RNA G14    0.014933
..              ...         ...
74    guide RNA C10    0.000000
73    guide RNA U10    0.000000
72    guide RNA A10    0.000000
71    target DNA G9    0.000000
0      guide RNA A1    0.000000

[160 rows x 2 columns]
0.03171246280915801
(0.3152025980651273, 0.11677226539389007)


In [None]:
# Add context dependency of 20 bp either side of the target DNA

In [36]:
# Train with random forest model instead
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def train_rf(input_features, outputs, input_feature_names):
    input_train, input_test, output_train, output_test = train_test_split(
        input_features, outputs, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators = 1000, random_state = 42)
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    return(model_importance,mse)
    

In [37]:
# For target RNA sequences
rf_guide_RNA_importance = []
rf_guide_RNA_mse = []
rf_guide_RNA_importance, rf_guide_RNA_mse = train_rf(guide_RNA_one_hot_encoded_sequences, k_values, feature_names_gRNA)

rf_guide_RNA_importance.sort_values(by = 'Importance', ascending = False)
rf_guide_RNA_mse

0.042309000198761035

In [41]:
#For target DNA sequences
rf_target_DNA_importance = []
rf_target_DNA_mse = []
rf_target_DNA_importance, rf_target_DNA_mse = train_rf(target_DNA_one_hot_encoded_sequences, k_values, feature_names_DNA)

rf_target_DNA_importance.sort_values(by = 'Importance', ascending = False)
rf_target_DNA_mse

0.03858754168867855

In [42]:
# For combined model
rf_combined_model_importance = []
rf_combined_mse =[]
rf_combined_model_importance, rf_combined_mse = train_rf(combined_one_hot_encoded_sequences, k_values, feature_names_combined)
rf_combined_model_importance.sort_values(by = 'Importance', ascending = False)
rf_combined_mse

0.03488783107536453

In [46]:
# Add in lightgbm and a function to train lgb
from lightgbm import LGBMRegressor
def train_lightgbm(input_features, outputs, input_feature_names):
    input_train, input_test, output_train, output_test = train_test_split(
        input_features, outputs, test_size=0.2, random_state=42)
    model = LGBMRegressor()
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    return(model_importance,mse)

In [47]:
#For guide RNA sequences
lgb_gRNA_model_importance = []
lgb_mse = []
lgb_gRNA_model_importance, lgb_gRNA_mse = train_lightgbm(guide_RNA_one_hot_encoded_sequences,k_values,feature_names_gRNA)
lgb_gRNA_model_importance.sort_values(by = 'Importance', ascending = False)
lgb_gRNA_mse

0.030547662720544846

In [52]:
#For target DNA sequences
lgb_target_DNA_importance = []
lgb_target_DNA_mse = []
lgb_target_DNA_importance,lgb_target_DNA_mse = train_lightgbm(target_DNA_one_hot_encoded_sequences, k_values, feature_names_DNA)
lgb_target_DNA_importance.sort_values(by = 'Importance', ascending = False)
lgb_target_DNA_mse

0.0374439796267538

In [51]:
# For combined sequences
lgb_combined_model_importance = []
lgb_combined_mse =[]
lgb_combined_model_importance, lgb_combined_mse = train_lightgbm(combined_one_hot_encoded_sequences, k_values, feature_names_combined)
lgb_combined_model_importance.sort_values(by = 'Importance', ascending = False)
lgb_combined_mse

0.03403550284166197

In [64]:
# Add in xgboost
from xgboost import XGBRegressor
def train_xgboost(input_features, outputs, input_feature_names):
    input_train, input_test, output_train, output_test = train_test_split(
        input_features, outputs, test_size=0.2, random_state=42)
    model = XGBRegressor()
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    return(model_importance,mse)

In [65]:
#For guide RNA sequences
xgb_gRNA_model_importance = []
xgb_mse = []
xgb_gRNA_model_importance, xgb_gRNA_mse = train_xgboost(guide_RNA_one_hot_encoded_sequences,k_values,feature_names_gRNA)
xgb_gRNA_model_importance.sort_values(by = 'Importance', ascending = False)
xgb_gRNA_mse

0.015122088954935458

In [66]:
#For target DNA sequences
xgb_target_DNA_importance = []
xgb_target_DNA_mse = []
xgb_target_DNA_importance,xgb_target_DNA_mse = train_xgboost(target_DNA_one_hot_encoded_sequences, k_values, feature_names_DNA)
xgb_target_DNA_importance.sort_values(by = 'Importance', ascending = False)
xgb_target_DNA_mse

0.23950571481640417

In [67]:
# For combined sequences
xgb_combined_model_importance = []
xgb_combined_mse =[]
xgb_combined_model_importance, xgb_combined_mse = train_xgboost(combined_one_hot_encoded_sequences, k_values, feature_names_combined)
xgb_combined_model_importance.sort_values(by = 'Importance', ascending = False)
xgb_combined_mse

0.33516521686319095