This notebook takes in sequence features and runs them through a different models. All model hyperparameters are 
selected using optuna. Models are run across 10 folds and benchmarking results are plotted
for mean squared error and pearson's correlation coefficient



In [95]:
#import libraries
import pandas as pd
import numpy as np 
import sklearn
import os
ROOT_PATH = os.path.abspath('')
ROOT_PATH

'C:\\Users\\dasak\\OneDrive\\Documents\\GitHub\\cas12-collateral-cleavage-prediction'

In [96]:
#Load up guide RNA And target DNA as dataframes
df_guide_RNA = pd.read_csv(ROOT_PATH + "/full_guide_RNA_sequences.csv")
df_target_DNA = pd.read_csv(ROOT_PATH + "/target_DNA_sequences.csv")

# # Load k_value_sigmoid from experiment on 24_05_2022 
from numpy import genfromtxt
k_values = genfromtxt(ROOT_PATH + '/k_values_sigmoid.csv', delimiter=',')
k_values

array([ 0.11984317,  0.13894219,  0.15437609,  0.06970868,  0.13297348,
        0.12425794,  0.18793106,  0.27095051,  0.16818124,  0.1347531 ,
        0.29226458,  0.12293203,  0.38708549,  0.19778096,  0.3622934 ,
        0.34890034,  0.20684573,  0.11417769, -0.07936754,  0.31408586,
        0.18672599,  0.13433442,  0.1847865 ,  0.24778192,  0.42079963,
        0.35078867,  0.09680571,  0.5202575 ,  0.22697745,  0.3248318 ,
        0.1702902 ,  0.24634137,  0.14220321, -2.51561011,  0.17162254,
        0.04222993,  0.24852536,  0.296885  ,  0.27256568,  0.32140478,
        0.21027936,  0.1127019 ,  0.20870236,  0.28398043,  0.32744511,
        0.2282549 ,  0.14870497,  0.32030304,  0.2445419 ,  0.08811278,
        0.44583873,  0.15249895,  0.23265821,  0.22829356,  0.2489641 ,
        0.16632398,  0.27098826,  0.18329734,  0.11389634,  0.29132781,
        0.35664269, -0.2822598 ,  0.14580803,  0.06875822,  0.09644608,
        0.10429324,  0.16959348,  0.32049426,  0.3353713 ,  0.18

In [97]:
def one_hot_encode_DNA(DNA_sequence):
    mapping = dict(zip("atcg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in DNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [98]:
def one_hot_encode_RNA(gRNA_sequence):
    mapping = dict(zip("aucg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in gRNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [99]:
guide_RNA_sequences = df_guide_RNA['Sequence']
guide_RNA_one_hot_encoded_sequences = []
for i in guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence.ravel())

#generate feature names
feature_names_gRNA = []
for i in range(41):
    feature_names_gRNA.append('guide RNA A' + str(i+1))
    feature_names_gRNA.append('guide RNA U' + str(i+1))
    feature_names_gRNA.append('guide RNA C' + str(i+1))
    feature_names_gRNA.append('guide RNA G' + str(i+1))

In [100]:
#Add all objective functions to one cell 
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats.mstats import spearmanr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from sklearn.linear_model import LinearRegression,Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor
from sklearn.svm import SVR

#Objective function for ridge regression 
def objective_ridge(trial,x,y):
    
    alpha = trial.suggest_float('alpha', 0, 1)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    
    
    model = Ridge(
        alpha = alpha,
        solver = solver,
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for lasso regression 
def objective_lasso(trial,x,y):
    alpha = trial.suggest_float('alpha', 0, 1)
    
    model = Lasso(
        alpha = alpha
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for ElasticNet regression 
def objective_ElasticNet(trial,x,y):
    alpha = trial.suggest_float('alpha', 0, 1)
    l1_ratio = trial.suggest_float('l1_ratio',0,1)
    
    model = ElasticNet(
        alpha = alpha,
        l1_ratio = l1_ratio
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for decision tree regression 
def objective_dt(trial,x,y):
    
    ccp_alpha = trial.suggest_float('ccp_alpha', 0, 1)
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'friedman_mse', 'absolute_error'])
    
    model =  DecisionTreeRegressor(
        ccp_alpha = ccp_alpha,
        criterion = criterion,
        random_state = 42)
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for random forest regression
def objective_rf(trial,x,y):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 30)
    max_samples = trial.suggest_float("max_samples", 0.01, 1.0)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    
    model = RandomForestRegressor(
        n_estimators = n_estimators,
        min_samples_leaf = min_samples_leaf,
        max_samples = max_samples,
        max_features = max_features, 
        random_state = 42)
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function of xgboost regression 
def objective_xgb(trial,x,y):
    eta = trial.suggest_float("eta", 0, 0.2)
    max_depth = trial.suggest_int("max_depth", 7, 11)
    min_child_weight = trial.suggest_int("min_child_weight", 2, 6)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    reg_lambda = trial.suggest_float("lambda", 0.8, 1.0)
    
    model = XGBRegressor(
        eta = eta,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        subsample = subsample,
        reg_lambda = reg_lambda,
        nthread = 1)
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for lightgbm regression 
def objective_lgbm(trial,x,y):
    boosting_type = trial.suggest_categorical("boosting_type", ['gbdt', 'dart', 'goss'])
    num_leaves = trial.suggest_int("num_leaves", 2, 50)
    max_depth = trial.suggest_int("max_depth", -1, 50)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    min_child_weight = trial.suggest_float("min_child_weight", 0.001, 0.005)
        
    model = LGBMRegressor(
        boosting_type = boosting_type,
        num_leaves = num_leaves,
        max_depth = max_depth,
        learning_rate = learning_rate,
        n_estimators = n_estimators,
        min_child_weight = min_child_weight,
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

#Objective function for support vector machine regressor 
def objective_svr(trial,x,y):
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    degree = trial.suggest_int("degree", 1, 10)
    C = trial.suggest_float("C", 0, 5)
    epsilon = trial.suggest_float("epsilon", 0, 5)
    
        
    model = SVR(
        kernel = kernel,
        degree = degree,
        C = C,
        epsilon = epsilon
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = spearman_rank_scorer)
    score_mean = np.mean(score['test_score'])
    return(score_mean)

In [101]:
from sklearn.metrics import make_scorer

#Function for spearman rank scorer
def spearman_rank_scorer(model,X,Y):
    Y_predict = model.predict(X)
    spearman_rank = spearmanr(Y,Y_predict)
    rho = spearman_rank[0]
    return(rho)



In [102]:
#Linear regression with guide RNA one hot encoding
model = LinearRegression()
score = cross_validate(model, guide_RNA_one_hot_encoded_sequences, k_values, cv = 10, scoring = spearman_rank_scorer)
linear_regression_gRNA = np.mean(score['test_score'])


In [103]:
#Optimisation study for ridge regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ridge,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_gRNA = study.best_value



[32m[I 2022-10-25 14:47:06,832][0m A new study created in memory with name: no-name-569cc8dd-28f6-4064-a626-eed6859ac828[0m
[32m[I 2022-10-25 14:47:06,931][0m Trial 0 finished with value: 0.05599400599400599 and parameters: {'alpha': 0.2862170396417645, 'solver': 'saga'}. Best is trial 0 with value: 0.05599400599400599.[0m
[32m[I 2022-10-25 14:47:06,965][0m Trial 1 finished with value: 0.05934065934065935 and parameters: {'alpha': 0.7359411398750361, 'solver': 'lsqr'}. Best is trial 1 with value: 0.05934065934065935.[0m
[32m[I 2022-10-25 14:47:06,991][0m Trial 2 finished with value: 0.051398601398601404 and parameters: {'alpha': 0.29206451898424823, 'solver': 'cholesky'}. Best is trial 1 with value: 0.05934065934065935.[0m
[32m[I 2022-10-25 14:47:07,024][0m Trial 3 finished with value: 0.06423576423576424 and parameters: {'alpha': 0.7794922919596493, 'solver': 'lsqr'}. Best is trial 3 with value: 0.06423576423576424.[0m
[32m[I 2022-10-25 14:47:07,063][0m Trial 4 finish

[32m[I 2022-10-25 14:47:10,331][0m Trial 40 finished with value: 0.054345654345654325 and parameters: {'alpha': 0.22249348956666604, 'solver': 'saga'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:10,418][0m Trial 41 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.929865165120993, 'solver': 'saga'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:10,541][0m Trial 42 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9326910763432505, 'solver': 'saga'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:10,656][0m Trial 43 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.84428092561472, 'solver': 'saga'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:10,905][0m Trial 44 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9920780205652956, 'solver': 'sag'}. Best is trial 26 with valu

[32m[I 2022-10-25 14:47:13,298][0m Trial 80 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9068026941716928, 'solver': 'cholesky'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:13,325][0m Trial 81 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9128498100422927, 'solver': 'cholesky'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:13,355][0m Trial 82 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9108785894075649, 'solver': 'cholesky'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:13,455][0m Trial 83 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9683506138632187, 'solver': 'saga'}. Best is trial 26 with value: 0.07432567432567433.[0m
[32m[I 2022-10-25 14:47:13,491][0m Trial 84 finished with value: 0.06543456543456544 and parameters: {'alpha': 0.8696241628617281, 'solver': 'lsqr'}. Best is tria

In [11]:
#Linear regression with guide RNA one hot encoding
model = LinearRegression()
score = cross_validate(model, guide_RNA_one_hot_encoded_sequences, k_values, cv = 10, scoring = spearman_rank_scorer)
score_mean = np.mean(score['test_score'])
score_mean

0.045708443218873775

In [12]:
#Optimisation study for lasso regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lasso,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_gRNA = study.best_value


[32m[I 2022-10-25 11:30:12,299][0m A new study created in memory with name: no-name-14828e4e-d50b-48bd-aeb5-c290063cc165[0m
[32m[I 2022-10-25 11:30:12,321][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.5102738560430958}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:12,343][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.18798540761643023}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:12,366][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.11742147555584781}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:12,388][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.2885139479169929}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:12,408][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.41108398746661623}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:12,431][0m Trial 5 finished with value: 0.0 and parameters: {'alpha': 0.36

[32m[I 2022-10-25 11:30:13,544][0m Trial 53 finished with value: 0.0 and parameters: {'alpha': 0.3530379104626948}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,569][0m Trial 54 finished with value: 0.0 and parameters: {'alpha': 0.788862383608014}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,592][0m Trial 55 finished with value: 0.0 and parameters: {'alpha': 0.4335775903426382}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,614][0m Trial 56 finished with value: 0.0 and parameters: {'alpha': 0.49032843768068946}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,639][0m Trial 57 finished with value: 0.0 and parameters: {'alpha': 0.3536849823405865}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,663][0m Trial 58 finished with value: 0.0 and parameters: {'alpha': 0.5107005801266756}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:13,688][0m Trial 59 finished with value: 0.0

In [13]:
#Optimisation study for Elastic Net with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ElasticNet,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_gRNA = study.best_value

[32m[I 2022-10-25 11:30:14,660][0m A new study created in memory with name: no-name-19a4f02f-f257-4f2c-b38d-9fdd12a4cde9[0m
[32m[I 2022-10-25 11:30:14,680][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.9141751619377887, 'l1_ratio': 0.3356599042865598}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:30:14,734][0m Trial 1 finished with value: 0.11863136863136865 and parameters: {'alpha': 0.002256958043597601, 'l1_ratio': 0.9156205767413349}. Best is trial 1 with value: 0.11863136863136865.[0m
[32m[I 2022-10-25 11:30:14,754][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.36400952109085316, 'l1_ratio': 0.41503881821075783}. Best is trial 1 with value: 0.11863136863136865.[0m
[32m[I 2022-10-25 11:30:14,778][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.45900988890409244, 'l1_ratio': 0.38307287524900124}. Best is trial 1 with value: 0.11863136863136865.[0m
[32m[I 2022-10-25 11:30:14,801][0m Trial 4 finished with 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[32m[I 2022-10-25 11:30:15,764][0m Trial 36 finished with value: 0.04645354645354647 and parameters: {'alpha': 2.7011048013645105e-05, 'l1_ratio': 0.15585149022012795}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:15,791][0m Trial 37 finished with value: 0.0 and parameters: {'alpha': 0.37252153520533265, 'l1_ratio': 0.36144408103007786}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:15,815][0m Trial 38 finished with value: 0.0 and parameters: {'alpha': 0.10552821509427851, 'l1_ratio': 0.7095542566848536}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:15,840][0m Trial 39 finished with value: 0.0 and parameters: {'alpha': 0.2046368

[32m[I 2022-10-25 11:30:16,580][0m Trial 65 finished with value: 0.0 and parameters: {'alpha': 0.12996729327754125, 'l1_ratio': 0.35231959785924666}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:16,606][0m Trial 66 finished with value: 0.0 and parameters: {'alpha': 0.06975217646742393, 'l1_ratio': 0.5550037056443055}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:16,637][0m Trial 67 finished with value: 0.0 and parameters: {'alpha': 0.20240151078235347, 'l1_ratio': 0.2663433614459079}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:16,678][0m Trial 68 finished with value: 0.0 and parameters: {'alpha': 0.961308259333068, 'l1_ratio': 0.7214102436654042}. Best is trial 21 with value: 0.14575424575424573.[0m
[32m[I 2022-10-25 11:30:16,707][0m Trial 69 finished with value: 0.09615384615384612 and parameters: {'alpha': 0.024714846781929214, 'l1_ratio': 3.311491711022185e-05}. Best is tr

In [107]:
#Optimisation study for decision tree regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_dt,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_gRNA = study.best_value

[32m[I 2022-10-25 14:48:04,746][0m A new study created in memory with name: no-name-6b26e8e9-86ba-419a-aec0-861542faf500[0m
[32m[I 2022-10-25 14:48:04,784][0m Trial 0 finished with value: 0.0 and parameters: {'ccp_alpha': 0.6888920974881101, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 14:48:04,818][0m Trial 1 finished with value: 0.0 and parameters: {'ccp_alpha': 0.03959221315741346, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 14:48:04,855][0m Trial 2 finished with value: 0.0 and parameters: {'ccp_alpha': 0.38076906137207456, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 14:48:04,888][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.7220894880717403, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 14:48:04,922][0m Trial 4 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8080763178503232, 'cr

[32m[I 2022-10-25 14:48:07,264][0m Trial 40 finished with value: 0.0 and parameters: {'ccp_alpha': 0.6801815496911097, 'criterion': 'squared_error'}. Best is trial 25 with value: 0.06921382479621459.[0m
[32m[I 2022-10-25 14:48:07,307][0m Trial 41 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8125633282069901, 'criterion': 'friedman_mse'}. Best is trial 25 with value: 0.06921382479621459.[0m
[32m[I 2022-10-25 14:48:07,342][0m Trial 42 finished with value: 0.0 and parameters: {'ccp_alpha': 0.10257993678126179, 'criterion': 'friedman_mse'}. Best is trial 25 with value: 0.06921382479621459.[0m
[32m[I 2022-10-25 14:48:07,488][0m Trial 43 finished with value: 0.0 and parameters: {'ccp_alpha': 0.16847647645683797, 'criterion': 'absolute_error'}. Best is trial 25 with value: 0.06921382479621459.[0m
[32m[I 2022-10-25 14:48:07,530][0m Trial 44 finished with value: 0.0 and parameters: {'ccp_alpha': 0.08776233457808211, 'criterion': 'squared_error'}. Best is trial 25 with 

[32m[I 2022-10-25 14:48:08,989][0m Trial 79 finished with value: 0.0 and parameters: {'ccp_alpha': 0.11819215051293776, 'criterion': 'friedman_mse'}. Best is trial 64 with value: 0.14492582697052686.[0m
[32m[I 2022-10-25 14:48:09,025][0m Trial 80 finished with value: 0.16943262720118837 and parameters: {'ccp_alpha': 0.0004969194031820019, 'criterion': 'friedman_mse'}. Best is trial 80 with value: 0.16943262720118837.[0m
[32m[I 2022-10-25 14:48:09,072][0m Trial 81 finished with value: 0.016601905468488942 and parameters: {'ccp_alpha': 0.0024220864337305553, 'criterion': 'friedman_mse'}. Best is trial 80 with value: 0.16943262720118837.[0m
[32m[I 2022-10-25 14:48:09,108][0m Trial 82 finished with value: 0.0 and parameters: {'ccp_alpha': 0.05531787974426226, 'criterion': 'friedman_mse'}. Best is trial 80 with value: 0.16943262720118837.[0m
[32m[I 2022-10-25 14:48:09,154][0m Trial 83 finished with value: 0.0 and parameters: {'ccp_alpha': 0.033150032811464114, 'criterion': 'fr

In [108]:
#Optimisation study for random forest regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_rf,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_gRNA = study.best_value

[32m[I 2022-10-25 14:48:15,369][0m A new study created in memory with name: no-name-3fc71a1f-3d36-4083-b6bd-eaba8001d36a[0m
[32m[I 2022-10-25 14:48:16,424][0m Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 135, 'min_samples_leaf': 30, 'max_samples': 0.27688095537827084, 'max_features': 0.19495964086297946}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 14:48:24,453][0m Trial 1 finished with value: 0.05204795204795206 and parameters: {'n_estimators': 940, 'min_samples_leaf': 16, 'max_samples': 0.538419383633643, 'max_features': 0.9276901779212413}. Best is trial 1 with value: 0.05204795204795206.[0m
[32m[I 2022-10-25 14:48:26,854][0m Trial 2 finished with value: 0.2674825174825175 and parameters: {'n_estimators': 283, 'min_samples_leaf': 8, 'max_samples': 0.902024467374156, 'max_features': 0.21234818471660064}. Best is trial 2 with value: 0.2674825174825175.[0m
[32m[I 2022-10-25 14:48:30,204][0m Trial 3 finished with value: 0.06708291708291708 an

[32m[I 2022-10-25 14:50:33,294][0m Trial 30 finished with value: 0.24130869130869131 and parameters: {'n_estimators': 280, 'min_samples_leaf': 8, 'max_samples': 0.8439864242329964, 'max_features': 0.9895365107596013}. Best is trial 15 with value: 0.2774225774225774.[0m
[32m[I 2022-10-25 14:50:39,770][0m Trial 31 finished with value: 0.23411588411588413 and parameters: {'n_estimators': 730, 'min_samples_leaf': 3, 'max_samples': 0.7384499982601387, 'max_features': 0.112505559535717}. Best is trial 15 with value: 0.2774225774225774.[0m
[32m[I 2022-10-25 14:50:46,979][0m Trial 32 finished with value: 0.25439560439560444 and parameters: {'n_estimators': 805, 'min_samples_leaf': 6, 'max_samples': 0.9504132526268796, 'max_features': 0.1777548813733782}. Best is trial 15 with value: 0.2774225774225774.[0m
[32m[I 2022-10-25 14:50:53,049][0m Trial 33 finished with value: 0.24915084915084917 and parameters: {'n_estimators': 661, 'min_samples_leaf': 3, 'max_samples': 0.7551780927935576,

[32m[I 2022-10-25 14:52:49,898][0m Trial 61 finished with value: 0.26128871128871134 and parameters: {'n_estimators': 385, 'min_samples_leaf': 7, 'max_samples': 0.9140919998757457, 'max_features': 0.33566754594106896}. Best is trial 50 with value: 0.2926073926073926.[0m
[32m[I 2022-10-25 14:52:52,799][0m Trial 62 finished with value: 0.24055944055944053 and parameters: {'n_estimators': 330, 'min_samples_leaf': 10, 'max_samples': 0.8256363144767497, 'max_features': 0.29333225529294815}. Best is trial 50 with value: 0.2926073926073926.[0m
[32m[I 2022-10-25 14:52:57,270][0m Trial 63 finished with value: 0.21188811188811188 and parameters: {'n_estimators': 419, 'min_samples_leaf': 4, 'max_samples': 0.8619254868088875, 'max_features': 0.3579212512399619}. Best is trial 50 with value: 0.2926073926073926.[0m
[32m[I 2022-10-25 14:53:01,491][0m Trial 64 finished with value: 0.2394605394605395 and parameters: {'n_estimators': 446, 'min_samples_leaf': 7, 'max_samples': 0.91649234852876

[32m[I 2022-10-25 14:54:06,817][0m Trial 91 finished with value: 0.21868131868131865 and parameters: {'n_estimators': 253, 'min_samples_leaf': 5, 'max_samples': 0.5931277822814783, 'max_features': 0.24645801401094966}. Best is trial 83 with value: 0.3025474525474525.[0m
[32m[I 2022-10-25 14:54:08,740][0m Trial 92 finished with value: 0.29010989010989013 and parameters: {'n_estimators': 209, 'min_samples_leaf': 3, 'max_samples': 0.5179082361914638, 'max_features': 0.2870492291750324}. Best is trial 83 with value: 0.3025474525474525.[0m
[32m[I 2022-10-25 14:54:10,525][0m Trial 93 finished with value: 0.1918081918081918 and parameters: {'n_estimators': 202, 'min_samples_leaf': 3, 'max_samples': 0.16949515571995433, 'max_features': 0.28810849925280013}. Best is trial 83 with value: 0.3025474525474525.[0m
[32m[I 2022-10-25 14:54:12,199][0m Trial 94 finished with value: 0.26413586413586415 and parameters: {'n_estimators': 171, 'min_samples_leaf': 3, 'max_samples': 0.52582690084299

In [None]:
#Optimisation study for xgboost regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_xgb,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_gRNA = study.best_value

In [None]:
#Optimisation study for lightgbm regression with guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lgbm,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_gRNA = study.best_value

In [None]:
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_svr,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_gRNA = study.best_value

In [115]:
#One hot encode only the 20 gRNA bases that interact with the target DNA
complementary_guide_RNA_sequences = []
for i in guide_RNA_sequences:
    complementary_guide_RNA_sequence = i[21:41]
    complementary_guide_RNA_sequences.append(complementary_guide_RNA_sequence)

#One hot encode them
complementary_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i).ravel()
    complementary_guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)

#generate feature names
feature_names_complementary_gRNA = []
for i in range(20):
    feature_names_complementary_gRNA.append('guide RNA A' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA U' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA C' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA G' + str(i+1))

In [116]:
#Optimisation function for linear regression with complementary guide RNA one hot encoding
model = LinearRegression()
score = cross_validate(model, complementary_guide_RNA_one_hot_encoded_sequences, k_values, cv = 10, scoring = spearman_rank_scorer)
score_mean = np.mean(score['test_score'])
linear_regression_complementary_gRNA = score_mean
linear_regression_complementary_gRNA

0.10281942520601048

In [117]:
#Optimisation function for ridge regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ridge,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:00:06,264][0m A new study created in memory with name: no-name-63e386d8-e1ef-419c-b361-b3d1089a1322[0m
[32m[I 2022-10-25 15:00:06,303][0m Trial 0 finished with value: 0.04590409590409591 and parameters: {'alpha': 0.16352694818830216, 'solver': 'svd'}. Best is trial 0 with value: 0.04590409590409591.[0m
[32m[I 2022-10-25 15:00:06,331][0m Trial 1 finished with value: 0.05159840159840159 and parameters: {'alpha': 0.6248304907547247, 'solver': 'cholesky'}. Best is trial 1 with value: 0.05159840159840159.[0m
[32m[I 2022-10-25 15:00:06,389][0m Trial 2 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.839152925945745, 'solver': 'saga'}. Best is trial 2 with value: 0.06818181818181818.[0m
[32m[I 2022-10-25 15:00:06,454][0m Trial 3 finished with value: 0.054345654345654346 and parameters: {'alpha': 0.6029244132507404, 'solver': 'saga'}. Best is trial 2 with value: 0.06818181818181818.[0m
[32m[I 2022-10-25 15:00:06,486][0m Trial 4 finished

[32m[I 2022-10-25 15:00:09,150][0m Trial 40 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.8779366479000118, 'solver': 'sparse_cg'}. Best is trial 33 with value: 0.06983016983016985.[0m
[32m[I 2022-10-25 15:00:09,183][0m Trial 41 finished with value: 0.05704295704295705 and parameters: {'alpha': 0.7854174018812593, 'solver': 'sparse_cg'}. Best is trial 33 with value: 0.06983016983016985.[0m
[32m[I 2022-10-25 15:00:09,242][0m Trial 42 finished with value: 0.05924075924075924 and parameters: {'alpha': 0.6851526347186297, 'solver': 'saga'}. Best is trial 33 with value: 0.06983016983016985.[0m
[32m[I 2022-10-25 15:00:09,390][0m Trial 43 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.921074378524968, 'solver': 'sag'}. Best is trial 33 with value: 0.06983016983016985.[0m
[32m[I 2022-10-25 15:00:09,428][0m Trial 44 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.8729856879861233, 'solver': 'sparse_cg'}. Best is tri

[32m[I 2022-10-25 15:00:11,743][0m Trial 80 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.890648130036501, 'solver': 'svd'}. Best is trial 46 with value: 0.07377622377622378.[0m
[32m[I 2022-10-25 15:00:11,770][0m Trial 81 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9960656591978547, 'solver': 'auto'}. Best is trial 46 with value: 0.07377622377622378.[0m
[32m[I 2022-10-25 15:00:11,796][0m Trial 82 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9277436751287303, 'solver': 'auto'}. Best is trial 46 with value: 0.07377622377622378.[0m
[32m[I 2022-10-25 15:00:11,853][0m Trial 83 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9020507088065084, 'solver': 'saga'}. Best is trial 46 with value: 0.07377622377622378.[0m
[32m[I 2022-10-25 15:00:11,920][0m Trial 84 finished with value: 0.06818181818181818 and parameters: {'alpha': 0.9411544588020567, 'solver': 'saga'}. Best is trial 46 with valu

In [118]:
#Optimisation function for ridge regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lasso,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:00:22,629][0m A new study created in memory with name: no-name-353f7d24-833f-4d4a-9c51-333f8080493f[0m
[32m[I 2022-10-25 15:00:22,655][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.3214804576837038}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:22,679][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.7063891034229088}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:22,703][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.7569283599900187}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:22,727][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.8863581153042777}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:22,750][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.21469701479500092}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:22,770][0m Trial 5 finished with value: 0.0 and parameters: {'alpha': 0.2865

[32m[I 2022-10-25 15:00:24,032][0m Trial 47 finished with value: 0.13351648351648354 and parameters: {'alpha': 0.0028811834381449438}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:24,064][0m Trial 48 finished with value: 0.0 and parameters: {'alpha': 0.22648466560297043}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:24,094][0m Trial 49 finished with value: 0.0 and parameters: {'alpha': 0.11922677294101852}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:24,120][0m Trial 50 finished with value: 0.13351648351648354 and parameters: {'alpha': 0.002905413206977181}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:24,145][0m Trial 51 finished with value: 0.0 and parameters: {'alpha': 0.04412682456420938}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:24,172][0m Trial 52 finished with value: 0.15059940059940063 and parameter

[32m[I 2022-10-25 15:00:25,235][0m Trial 93 finished with value: 0.0 and parameters: {'alpha': 0.08969279833659452}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:25,260][0m Trial 94 finished with value: 0.0 and parameters: {'alpha': 0.04586367546444751}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:25,284][0m Trial 95 finished with value: 0.0 and parameters: {'alpha': 0.3661127349174927}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:25,311][0m Trial 96 finished with value: 0.040912279234877655 and parameters: {'alpha': 0.023469670117134987}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:25,336][0m Trial 97 finished with value: 0.0 and parameters: {'alpha': 0.16427296380009554}. Best is trial 25 with value: 0.15654345654345656.[0m
[32m[I 2022-10-25 15:00:25,361][0m Trial 98 finished with value: 0.0 and parameters: {'alpha': 0.0669390594566304}. 

In [119]:
#Optimisation function for Elastic Net regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ElasticNet,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:00:34,968][0m A new study created in memory with name: no-name-a63a86c0-2c8c-4044-8f53-51072b1c83bd[0m
[32m[I 2022-10-25 15:00:34,992][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.10960660843433734, 'l1_ratio': 0.8904715127988555}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:35,015][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.44488131332314906, 'l1_ratio': 0.41416579672334}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:35,043][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.17341263453718891, 'l1_ratio': 0.478848293057072}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:35,067][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.34314459551666576, 'l1_ratio': 0.14224129114187978}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:00:35,090][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.36501323934567653, 'l1_ratio'

[32m[I 2022-10-25 15:00:36,105][0m Trial 40 finished with value: 0.0 and parameters: {'alpha': 0.5187118088859682, 'l1_ratio': 0.9178539637919747}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:36,132][0m Trial 41 finished with value: 0.05231095309806928 and parameters: {'alpha': 0.014564480114493454, 'l1_ratio': 0.9153619961846085}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:36,171][0m Trial 42 finished with value: 0.13806193806193806 and parameters: {'alpha': 0.004686058619379742, 'l1_ratio': 0.7970284771176513}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:36,197][0m Trial 43 finished with value: 0.0 and parameters: {'alpha': 0.09450277475322873, 'l1_ratio': 0.5955594399062686}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:36,225][0m Trial 44 finished with value: 0.0 and parameters: {'alpha': 0.04085599409355492, 'l1_ratio': 0.795341108236060

[32m[I 2022-10-25 15:00:37,303][0m Trial 79 finished with value: 0.08956574900461295 and parameters: {'alpha': 0.10810310702487486, 'l1_ratio': 0.11636883098411388}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:37,332][0m Trial 80 finished with value: 0.07732136272376385 and parameters: {'alpha': 0.07213677247697457, 'l1_ratio': 0.1804692809223133}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:37,362][0m Trial 81 finished with value: 0.13376623376623378 and parameters: {'alpha': 0.10305743207793634, 'l1_ratio': 0.057799182863648074}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:37,393][0m Trial 82 finished with value: 0.14065934065934066 and parameters: {'alpha': 0.18700454403409833, 'l1_ratio': 0.0012460209831443442}. Best is trial 23 with value: 0.16608391608391607.[0m
[32m[I 2022-10-25 15:00:37,421][0m Trial 83 finished with value: 0.10304695304695304 and parameters: {'alpha

In [122]:
#Optimisation function for decision tree regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_dt,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:01:30,720][0m A new study created in memory with name: no-name-b1e8fd1d-8980-4d75-b711-6f198121522e[0m
[32m[I 2022-10-25 15:01:30,756][0m Trial 0 finished with value: 0.0 and parameters: {'ccp_alpha': 0.897903449321542, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:01:30,793][0m Trial 1 finished with value: 0.0 and parameters: {'ccp_alpha': 0.779013008451025, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:01:30,938][0m Trial 2 finished with value: 0.0 and parameters: {'ccp_alpha': 0.7311466453924875, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:01:30,972][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.855156428983429, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 15:01:31,104][0m Trial 4 finished with value: 0.0 and parameters: {'ccp_alpha': 0.17548479952611995, 'c

[32m[I 2022-10-25 15:01:32,997][0m Trial 41 finished with value: 0.04379053888611482 and parameters: {'ccp_alpha': 0.013328372306759297, 'criterion': 'friedman_mse'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:33,033][0m Trial 42 finished with value: 0.0 and parameters: {'ccp_alpha': 0.16146659546216577, 'criterion': 'friedman_mse'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:33,069][0m Trial 43 finished with value: 0.0 and parameters: {'ccp_alpha': 0.07329309300025141, 'criterion': 'friedman_mse'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:33,102][0m Trial 44 finished with value: 0.0 and parameters: {'ccp_alpha': 0.2620261534276257, 'criterion': 'friedman_mse'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:33,139][0m Trial 45 finished with value: 0.04379053888611482 and parameters: {'ccp_alpha': 0.00605200198914322, 'criterion': 'friedman_mse

[32m[I 2022-10-25 15:01:34,780][0m Trial 81 finished with value: 0.06526258255335064 and parameters: {'ccp_alpha': 0.006081374264366737, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:34,833][0m Trial 82 finished with value: 0.0 and parameters: {'ccp_alpha': 0.02611964126745019, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:34,898][0m Trial 83 finished with value: 0.21516685042946082 and parameters: {'ccp_alpha': 0.00017769950271697755, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:34,955][0m Trial 84 finished with value: 0.0 and parameters: {'ccp_alpha': 0.11718368133607818, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.2848370811799066.[0m
[32m[I 2022-10-25 15:01:35,000][0m Trial 85 finished with value: 0.0 and parameters: {'ccp_alpha': 0.057518427469463486, 'criterion': 'squ

In [25]:
#Optimisation function for random forest regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_rf,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_complementary_gRNA = study.best_value

[32m[I 2022-10-25 11:37:48,134][0m A new study created in memory with name: no-name-fbd58498-d4ac-46c6-8af1-3cc433b279ec[0m
[32m[I 2022-10-25 11:37:49,502][0m Trial 0 finished with value: 0.0703424670534887 and parameters: {'n_estimators': 228, 'min_samples_leaf': 21, 'max_samples': 0.4305606371973182, 'max_features': 0.16558039434805258}. Best is trial 0 with value: 0.0703424670534887.[0m
[32m[I 2022-10-25 11:37:52,031][0m Trial 1 finished with value: 0.06441583196926343 and parameters: {'n_estimators': 399, 'min_samples_leaf': 26, 'max_samples': 0.7054702544986825, 'max_features': 0.6790785075772271}. Best is trial 0 with value: 0.0703424670534887.[0m
[32m[I 2022-10-25 11:37:58,645][0m Trial 2 finished with value: 0.005594405594405586 and parameters: {'n_estimators': 983, 'min_samples_leaf': 26, 'max_samples': 0.7713173955863841, 'max_features': 0.7731641128664605}. Best is trial 0 with value: 0.0703424670534887.[0m
[32m[I 2022-10-25 11:38:05,119][0m Trial 3 finished wi

[32m[I 2022-10-25 11:39:53,522][0m Trial 30 finished with value: 0.0 and parameters: {'n_estimators': 380, 'min_samples_leaf': 20, 'max_samples': 0.35798668011650076, 'max_features': 0.7535183076015328}. Best is trial 22 with value: 0.2775224775224775.[0m
[32m[I 2022-10-25 11:39:57,448][0m Trial 31 finished with value: 0.2514985014985015 and parameters: {'n_estimators': 583, 'min_samples_leaf': 3, 'max_samples': 0.23234441045393034, 'max_features': 0.6147641373452339}. Best is trial 22 with value: 0.2775224775224775.[0m
[32m[I 2022-10-25 11:40:01,412][0m Trial 32 finished with value: 0.2446553446553447 and parameters: {'n_estimators': 629, 'min_samples_leaf': 3, 'max_samples': 0.15703503535292906, 'max_features': 0.6490089465304376}. Best is trial 22 with value: 0.2775224775224775.[0m
[32m[I 2022-10-25 11:40:08,017][0m Trial 33 finished with value: 0.20479520479520477 and parameters: {'n_estimators': 755, 'min_samples_leaf': 1, 'max_samples': 0.48363426817895083, 'max_featur

[32m[I 2022-10-25 11:41:13,302][0m Trial 61 finished with value: 0.2780719280719281 and parameters: {'n_estimators': 171, 'min_samples_leaf': 2, 'max_samples': 0.6931604484010322, 'max_features': 0.4557504893628423}. Best is trial 40 with value: 0.29220779220779225.[0m
[32m[I 2022-10-25 11:41:15,365][0m Trial 62 finished with value: 0.23986013986013988 and parameters: {'n_estimators': 141, 'min_samples_leaf': 2, 'max_samples': 0.7158898270617894, 'max_features': 0.38238037957109977}. Best is trial 40 with value: 0.29220779220779225.[0m
[32m[I 2022-10-25 11:41:20,160][0m Trial 63 finished with value: 0.26203796203796204 and parameters: {'n_estimators': 386, 'min_samples_leaf': 4, 'max_samples': 0.9373023941352713, 'max_features': 0.5819457902344796}. Best is trial 40 with value: 0.29220779220779225.[0m
[32m[I 2022-10-25 11:41:24,961][0m Trial 64 finished with value: 0.22682317682317682 and parameters: {'n_estimators': 306, 'min_samples_leaf': 1, 'max_samples': 0.7631476456193

[32m[I 2022-10-25 11:42:33,297][0m Trial 91 finished with value: 0.27282717282717284 and parameters: {'n_estimators': 365, 'min_samples_leaf': 6, 'max_samples': 0.6671716356030786, 'max_features': 0.5897956813968235}. Best is trial 76 with value: 0.3185314685314685.[0m
[32m[I 2022-10-25 11:42:35,549][0m Trial 92 finished with value: 0.13546453546453546 and parameters: {'n_estimators': 348, 'min_samples_leaf': 7, 'max_samples': 0.28876286055490963, 'max_features': 0.5245806678768115}. Best is trial 76 with value: 0.3185314685314685.[0m
[32m[I 2022-10-25 11:42:37,837][0m Trial 93 finished with value: 0.21223776223776225 and parameters: {'n_estimators': 319, 'min_samples_leaf': 3, 'max_samples': 0.7876737266041776, 'max_features': 0.6242445667960818}. Best is trial 76 with value: 0.3185314685314685.[0m
[32m[I 2022-10-25 11:42:40,765][0m Trial 94 finished with value: 0.22527472527472528 and parameters: {'n_estimators': 438, 'min_samples_leaf': 10, 'max_samples': 0.69113913242266

In [26]:
#Optimisation function for xgboost regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_xgb,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_complementary_gRNA = study.best_value

[32m[I 2022-10-25 11:42:53,090][0m A new study created in memory with name: no-name-a4cc0e90-8467-4f86-bf81-381a0b4c24b6[0m
[32m[I 2022-10-25 11:42:54,024][0m Trial 0 finished with value: 0.15114885114885115 and parameters: {'eta': 0.16763459487456453, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.8905436151879591, 'lambda': 0.937751308152736}. Best is trial 0 with value: 0.15114885114885115.[0m
[32m[I 2022-10-25 11:42:54,995][0m Trial 1 finished with value: 0.13221778221778224 and parameters: {'eta': 0.07591035750350145, 'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.9287909996791588, 'lambda': 0.9057862235708402}. Best is trial 0 with value: 0.15114885114885115.[0m
[32m[I 2022-10-25 11:42:55,887][0m Trial 2 finished with value: 0.1518981018981019 and parameters: {'eta': 0.0903894089094308, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.7713193283844086, 'lambda': 0.8184669698202273}. Best is trial 2 with value: 0.1518981018981019.[0m
[32m[I 2022-1

[32m[I 2022-10-25 11:43:21,940][0m Trial 29 finished with value: 0.14975024975024975 and parameters: {'eta': 0.16092972287446652, 'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.6724751747418449, 'lambda': 0.9354751710353191}. Best is trial 4 with value: 0.21888111888111889.[0m
[32m[I 2022-10-25 11:43:22,984][0m Trial 30 finished with value: 0.06433566433566432 and parameters: {'eta': 0.12354326322974689, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.9185049484526463, 'lambda': 0.9596663211594201}. Best is trial 4 with value: 0.21888111888111889.[0m
[32m[I 2022-10-25 11:43:23,909][0m Trial 31 finished with value: 0.26443556443556443 and parameters: {'eta': 0.051109874266959524, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.7557553186880067, 'lambda': 0.9218680255959273}. Best is trial 31 with value: 0.26443556443556443.[0m
[32m[I 2022-10-25 11:43:24,934][0m Trial 32 finished with value: 0.2170829170829171 and parameters: {'eta': 0.028720071886063056, 

[32m[I 2022-10-25 11:43:51,363][0m Trial 58 finished with value: 0.22022977022977028 and parameters: {'eta': 0.02519865734759431, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.83521849635622, 'lambda': 0.8793974829440738}. Best is trial 35 with value: 0.27602397602397605.[0m
[32m[I 2022-10-25 11:43:52,250][0m Trial 59 finished with value: 0.18586413586413586 and parameters: {'eta': 0.05967990086373127, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.6974458510618042, 'lambda': 0.8693890518849606}. Best is trial 35 with value: 0.27602397602397605.[0m
[32m[I 2022-10-25 11:43:53,175][0m Trial 60 finished with value: 0.19375624375624376 and parameters: {'eta': 0.11482700792248783, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8096756567844168, 'lambda': 0.9123599222217398}. Best is trial 35 with value: 0.27602397602397605.[0m
[32m[I 2022-10-25 11:43:54,063][0m Trial 61 finished with value: 0.23741258741258742 and parameters: {'eta': 0.04043275379173057, 'm

[32m[I 2022-10-25 11:44:18,868][0m Trial 87 finished with value: 0.25314685314685315 and parameters: {'eta': 0.024555616058226707, 'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.7391523957000029, 'lambda': 0.8775783176690057}. Best is trial 85 with value: 0.2861138861138861.[0m
[32m[I 2022-10-25 11:44:20,307][0m Trial 88 finished with value: 0.1778721278721279 and parameters: {'eta': 0.023539199216457835, 'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.7181111170543559, 'lambda': 0.8583419702502724}. Best is trial 85 with value: 0.2861138861138861.[0m
[32m[I 2022-10-25 11:44:21,415][0m Trial 89 finished with value: 0.25549450549450553 and parameters: {'eta': 0.018184268588591304, 'max_depth': 11, 'min_child_weight': 3, 'subsample': 0.6821873086529109, 'lambda': 0.8672448121648945}. Best is trial 85 with value: 0.2861138861138861.[0m
[32m[I 2022-10-25 11:44:22,403][0m Trial 90 finished with value: 0.2419080919080919 and parameters: {'eta': 0.017439475295191416,

In [123]:
#Optimisation function for lightgbm regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lgbm,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:01:39,384][0m A new study created in memory with name: no-name-3b6f4a90-6091-4c7f-b820-fa74f5d91efa[0m
[32m[I 2022-10-25 15:01:39,515][0m Trial 0 finished with value: 0.17532467532467536 and parameters: {'boosting_type': 'goss', 'num_leaves': 33, 'max_depth': 36, 'learning_rate': 0.019239311407266587, 'n_estimators': 54, 'min_child_weight': 0.004941915253353401}. Best is trial 0 with value: 0.17532467532467536.[0m
[32m[I 2022-10-25 15:01:39,678][0m Trial 1 finished with value: 0.18516483516483517 and parameters: {'boosting_type': 'dart', 'num_leaves': 36, 'max_depth': 22, 'learning_rate': 0.027641151926883576, 'n_estimators': 83, 'min_child_weight': 0.0016308633622378692}. Best is trial 1 with value: 0.18516483516483517.[0m
[32m[I 2022-10-25 15:01:39,883][0m Trial 2 finished with value: 0.1315684315684316 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 24, 'max_depth': 49, 'learning_rate': 0.045201504358118476, 'n_estimators': 155, 'min_child_wei

[32m[I 2022-10-25 15:01:44,220][0m Trial 26 finished with value: 0.1844655344655345 and parameters: {'boosting_type': 'goss', 'num_leaves': 45, 'max_depth': 22, 'learning_rate': 0.052505850258593424, 'n_estimators': 70, 'min_child_weight': 0.001014971538579824}. Best is trial 19 with value: 0.20857639295903146.[0m
[32m[I 2022-10-25 15:01:44,355][0m Trial 27 finished with value: 0.1834015677842063 and parameters: {'boosting_type': 'goss', 'num_leaves': 37, 'max_depth': 25, 'learning_rate': 0.06728878192301592, 'n_estimators': 89, 'min_child_weight': 0.0017936090490094825}. Best is trial 19 with value: 0.20857639295903146.[0m
[32m[I 2022-10-25 15:01:44,482][0m Trial 28 finished with value: 0.2312390064079024 and parameters: {'boosting_type': 'goss', 'num_leaves': 46, 'max_depth': 15, 'learning_rate': 0.08211616765247864, 'n_estimators': 61, 'min_child_weight': 0.002254435394616252}. Best is trial 28 with value: 0.2312390064079024.[0m
[32m[I 2022-10-25 15:01:44,599][0m Trial 29

[32m[I 2022-10-25 15:01:47,807][0m Trial 52 finished with value: 0.20768464667415648 and parameters: {'boosting_type': 'goss', 'num_leaves': 33, 'max_depth': 41, 'learning_rate': 0.07350949905500803, 'n_estimators': 60, 'min_child_weight': 0.0025869341214837566}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:47,921][0m Trial 53 finished with value: 0.1984279524359078 and parameters: {'boosting_type': 'goss', 'num_leaves': 31, 'max_depth': 23, 'learning_rate': 0.09015926739450862, 'n_estimators': 67, 'min_child_weight': 0.0032700985007062706}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:48,031][0m Trial 54 finished with value: 0.2543428964036537 and parameters: {'boosting_type': 'goss', 'num_leaves': 37, 'max_depth': 36, 'learning_rate': 0.07956852532153794, 'n_estimators': 57, 'min_child_weight': 0.003860821598974424}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:48,153][0m Trial 55 

[32m[I 2022-10-25 15:01:53,365][0m Trial 78 finished with value: 0.1876123876123876 and parameters: {'boosting_type': 'goss', 'num_leaves': 46, 'max_depth': 29, 'learning_rate': 0.02407786360985618, 'n_estimators': 59, 'min_child_weight': 0.0018609266676566682}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:53,592][0m Trial 79 finished with value: 0.23893131410021007 and parameters: {'boosting_type': 'goss', 'num_leaves': 20, 'max_depth': 44, 'learning_rate': 0.08006597900614232, 'n_estimators': 95, 'min_child_weight': 0.003672204802484461}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:53,854][0m Trial 80 finished with value: 0.1831668331668332 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 20, 'max_depth': 46, 'learning_rate': 0.012998410745534851, 'n_estimators': 95, 'min_child_weight': 0.0038203941601323275}. Best is trial 33 with value: 0.2543428964036537.[0m
[32m[I 2022-10-25 15:01:54,076][0m Trial 81

In [121]:
#Optimisation function for support vector machine regression with complementary guide RNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_svr,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_complementary_gRNA = study.best_value

[32m[I 2022-10-25 15:01:03,961][0m A new study created in memory with name: no-name-bda258f9-c7d5-4910-8fcf-91954b14a6e1[0m
[32m[I 2022-10-25 15:01:03,978][0m Trial 0 finished with value: -0.012700854700854702 and parameters: {'kernel': 'sigmoid', 'degree': 10, 'C': 4.977227275298782, 'epsilon': 0.954456339054589}. Best is trial 0 with value: -0.012700854700854702.[0m
[32m[I 2022-10-25 15:01:03,989][0m Trial 1 finished with value: 0.10184615384615385 and parameters: {'kernel': 'poly', 'degree': 5, 'C': 3.298941712049321, 'epsilon': 1.2370755540238365}. Best is trial 1 with value: 0.10184615384615385.[0m
[32m[I 2022-10-25 15:01:04,001][0m Trial 2 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 1, 'C': 3.5426763655917055, 'epsilon': 3.852823253490061}. Best is trial 1 with value: 0.10184615384615385.[0m
[32m[I 2022-10-25 15:01:04,026][0m Trial 3 finished with value: 0.05165761846622383 and parameters: {'kernel': 'linear', 'degree': 3, 'C': 0.0646521460

[32m[I 2022-10-25 15:01:04,938][0m Trial 34 finished with value: 0.10511078266151903 and parameters: {'kernel': 'sigmoid', 'degree': 7, 'C': 0.7256970085259875, 'epsilon': 0.3649024823189224}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:04,958][0m Trial 35 finished with value: 0.11876923076923078 and parameters: {'kernel': 'poly', 'degree': 8, 'C': 0.5576928579103541, 'epsilon': 1.2865433891723874}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:04,981][0m Trial 36 finished with value: 0.0367863247863248 and parameters: {'kernel': 'rbf', 'degree': 6, 'C': 2.7112333972690323, 'epsilon': 0.27483843473544955}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:05,004][0m Trial 37 finished with value: 0.1743931623931624 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 3.2446798899428475, 'epsilon': 0.8984184199042207}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10

[32m[I 2022-10-25 15:01:06,063][0m Trial 68 finished with value: 0.05488888888888889 and parameters: {'kernel': 'poly', 'degree': 2, 'C': 0.2861721926177194, 'epsilon': 1.0217227505164541}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:06,084][0m Trial 69 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 2.448641926142095, 'epsilon': 3.313846219488139}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:06,105][0m Trial 70 finished with value: 0.12495726495726496 and parameters: {'kernel': 'rbf', 'degree': 8, 'C': 1.2419023270197385, 'epsilon': 0.17683514293996694}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:06,128][0m Trial 71 finished with value: 0.2167863247863248 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 0.6946327398593981, 'epsilon': 0.012880571760879311}. Best is trial 27 with value: 0.39447863247863246.[0m
[32m[I 2022-10-25 15:01:06,150]

In [29]:
#Try 8 bit vectors where the first 4 bits represent the guide RNA and the second 4 bits represent the target DNA. Use only the 20bp matching region to begin with.
#Take the last twenty bases of the guide RNA sequences
complementary_guide_RNA_sequences = []
for i in guide_RNA_sequences:
    complementary_guide_RNA_sequence = i[21:41]
    complementary_guide_RNA_sequences.append(complementary_guide_RNA_sequence)

#One hot encode them
complementary_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    complementary_guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)


#Take the middle 20 of the target complements and reorder back to fron so they become complementary to guide RNA sequence
target_DNA_sequences = df_target_DNA['Sequence']
target_DNA_complements = []
complementary_target_DNA_sequences = []
for i in target_DNA_sequences:
    complementary_target_DNA_sequence = i[39:19:-1] #This generates the complement to the guide
    complementary_target_DNA_sequences.append(complementary_target_DNA_sequence)   

complementary_target_DNA_one_hot_encoded_sequences = []
for i in complementary_target_DNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_DNA(i)
    complementary_target_DNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)  

#Concatenate the one hot encodes sequences
concat_complementary_guide_RNAs_target_DNAs = []
# test with 1st element 
for i in range(len(complementary_guide_RNA_one_hot_encoded_sequences)):
    # print(i)
    concat_complementary_guide_RNA_target_DNA = np.hstack((complementary_guide_RNA_one_hot_encoded_sequences[i], complementary_target_DNA_one_hot_encoded_sequences[i])).ravel()
    concat_complementary_guide_RNAs_target_DNAs.append(concat_complementary_guide_RNA_target_DNA)

#Set up feature names 
feature_names_concat_guide_RNA_target_DNA = []
for i in range(20):
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA A' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA U' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA C' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('guide RNA G' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA A' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA T' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA C' + str(i+1))
    feature_names_concat_guide_RNA_target_DNA.append('target DNA G' + str(i+1))

In [30]:
#Optimisation function for linear regression with concat guide RNA target DNA one hot encoding
model = LinearRegression()
score = cross_validate(model, concat_complementary_guide_RNAs_target_DNAs, k_values, cv = 10, scoring = spearman_rank_scorer)
score_mean = np.mean(score['test_score'])
linear_regression_concat_gRNA_target_DNA = score_mean
linear_regression_concat_gRNA_target_DNA

0.030133443845440998

In [31]:
#Optimisation function for ridge regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ridge,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:44:54,183][0m A new study created in memory with name: no-name-8820c1cc-ee83-4fc6-80db-54da11686fd0[0m
[32m[I 2022-10-25 11:44:54,467][0m Trial 0 finished with value: 0.10479520479520481 and parameters: {'alpha': 0.4666100781818675, 'solver': 'sag'}. Best is trial 0 with value: 0.10479520479520481.[0m
[32m[I 2022-10-25 11:44:54,508][0m Trial 1 finished with value: 0.11653346653346655 and parameters: {'alpha': 0.9686092965954891, 'solver': 'svd'}. Best is trial 1 with value: 0.11653346653346655.[0m
[32m[I 2022-10-25 11:44:54,535][0m Trial 2 finished with value: 0.10864135864135864 and parameters: {'alpha': 0.16380795303512574, 'solver': 'cholesky'}. Best is trial 1 with value: 0.11653346653346655.[0m
[32m[I 2022-10-25 11:44:54,577][0m Trial 3 finished with value: 0.10479520479520481 and parameters: {'alpha': 0.42936827461555793, 'solver': 'svd'}. Best is trial 1 with value: 0.11653346653346655.[0m
[32m[I 2022-10-25 11:44:54,604][0m Trial 4 finished 

[32m[I 2022-10-25 11:44:57,186][0m Trial 40 finished with value: 0.1170829170829171 and parameters: {'alpha': 0.8886298597758291, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:57,225][0m Trial 41 finished with value: 0.11983016983016984 and parameters: {'alpha': 0.9597425402820339, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:57,262][0m Trial 42 finished with value: 0.11983016983016984 and parameters: {'alpha': 0.9443571813489278, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:57,304][0m Trial 43 finished with value: 0.1170829170829171 and parameters: {'alpha': 0.8962329447605075, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:57,356][0m Trial 44 finished with value: 0.106993006993007 and parameters: {'alpha': 0.2766662131886327, 'solver': 'svd'}. Best is trial 14 with value: 

[32m[I 2022-10-25 11:44:59,510][0m Trial 80 finished with value: 0.106993006993007 and parameters: {'alpha': 0.2606991659329091, 'solver': 'sag'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:59,545][0m Trial 81 finished with value: 0.11983016983016984 and parameters: {'alpha': 0.9851225942361884, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:59,580][0m Trial 82 finished with value: 0.11983016983016984 and parameters: {'alpha': 0.9786224756708932, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:59,616][0m Trial 83 finished with value: 0.11983016983016984 and parameters: {'alpha': 0.9598733521341813, 'solver': 'lsqr'}. Best is trial 14 with value: 0.11983016983016984.[0m
[32m[I 2022-10-25 11:44:59,654][0m Trial 84 finished with value: 0.1170829170829171 and parameters: {'alpha': 0.8549705945931122, 'solver': 'lsqr'}. Best is trial 14 with value:

In [32]:
#Optimisation function for lasso regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lasso,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:45:00,331][0m A new study created in memory with name: no-name-4d1e78bc-1c19-4d85-baaa-5f7067eb5623[0m
[32m[I 2022-10-25 11:45:00,361][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.6574424339674705}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:00,391][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.9607170402026265}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:00,421][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.8305536439318283}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:00,446][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.319428094468734}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:00,475][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.859624481707026}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:00,504][0m Trial 5 finished with value: 0.0 and parameters: {'alpha': 0.3115600

[32m[I 2022-10-25 11:45:02,103][0m Trial 52 finished with value: 0.0 and parameters: {'alpha': 0.15977299022937894}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,133][0m Trial 53 finished with value: 0.0 and parameters: {'alpha': 0.06691583259163135}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,164][0m Trial 54 finished with value: 0.0 and parameters: {'alpha': 0.19455567050467462}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,193][0m Trial 55 finished with value: 0.0 and parameters: {'alpha': 0.07172039600165254}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,220][0m Trial 56 finished with value: 0.0 and parameters: {'alpha': 0.1924281982781365}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,250][0m Trial 57 finished with value: 0.0 and parameters: {'alpha': 0.2592844020008547}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:02,280][0m Trial 58 finished with value:

In [33]:
#Optimisation function for Elastic Net regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ElasticNet,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:45:03,635][0m A new study created in memory with name: no-name-49a5679f-405d-4d34-b69a-ac8522856390[0m
[32m[I 2022-10-25 11:45:03,662][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.7013629444557515, 'l1_ratio': 0.5016005913419714}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:03,687][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.23989035834581207, 'l1_ratio': 0.4911613048288096}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:03,714][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.24450036595878888, 'l1_ratio': 0.8134264147683512}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:03,739][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.20046836750843833, 'l1_ratio': 0.3199768009608649}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:45:03,765][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.8206595686861549, 'l1_ratio'

[32m[I 2022-10-25 11:45:05,061][0m Trial 39 finished with value: 0.06264298849275719 and parameters: {'alpha': 0.33381297383852926, 'l1_ratio': 0.049300866276918216}. Best is trial 12 with value: 0.14400599400599398.[0m
[32m[I 2022-10-25 11:45:05,094][0m Trial 40 finished with value: 0.0 and parameters: {'alpha': 0.39611343036121044, 'l1_ratio': 0.38151416280563766}. Best is trial 12 with value: 0.14400599400599398.[0m
[32m[I 2022-10-25 11:45:05,133][0m Trial 41 finished with value: 0.1345154845154845 and parameters: {'alpha': 0.0634801816636133, 'l1_ratio': 0.007607892852577984}. Best is trial 12 with value: 0.14400599400599398.[0m
[32m[I 2022-10-25 11:45:05,170][0m Trial 42 finished with value: 0.08491515390673572 and parameters: {'alpha': 0.09297191820169626, 'l1_ratio': 0.1112463434020137}. Best is trial 12 with value: 0.14400599400599398.[0m
[32m[I 2022-10-25 11:45:05,203][0m Trial 43 finished with value: 0.06679389445649556 and parameters: {'alpha': 0.20783027781775

[32m[I 2022-10-25 11:45:06,602][0m Trial 77 finished with value: 0.11313686313686315 and parameters: {'alpha': 0.007222284854239095, 'l1_ratio': 0.19164519535828103}. Best is trial 45 with value: 0.2011488511488512.[0m
[32m[I 2022-10-25 11:45:06,638][0m Trial 78 finished with value: 0.0 and parameters: {'alpha': 0.45411910705295305, 'l1_ratio': 0.10469960939726777}. Best is trial 45 with value: 0.2011488511488512.[0m
[32m[I 2022-10-25 11:45:06,675][0m Trial 79 finished with value: 0.0 and parameters: {'alpha': 0.9851610367805994, 'l1_ratio': 0.16068315542310802}. Best is trial 45 with value: 0.2011488511488512.[0m
[32m[I 2022-10-25 11:45:06,721][0m Trial 80 finished with value: 0.11043956043956045 and parameters: {'alpha': 0.03037137663448186, 'l1_ratio': 0.021250281714337696}. Best is trial 45 with value: 0.2011488511488512.[0m
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.

[32m[I 2022-10-25 11:45:07,743][0m Trial 99 finished with value: 0.18236763236763237 and parameters: {'alpha': 0.05741544900056706, 'l1_ratio': 0.08752736813183958}. Best is trial 45 with value: 0.2011488511488512.[0m


In [34]:
#Optimisation function for decision tree regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_dt,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:45:07,761][0m A new study created in memory with name: no-name-d5d3fbff-6e00-4721-a331-bc27aee72ba0[0m
[32m[I 2022-10-25 11:45:07,829][0m Trial 0 finished with value: 0.02123978265650417 and parameters: {'ccp_alpha': 0.009886491585228696, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.02123978265650417.[0m
[32m[I 2022-10-25 11:45:07,889][0m Trial 1 finished with value: 0.0 and parameters: {'ccp_alpha': 0.6772702019368908, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.02123978265650417.[0m
[32m[I 2022-10-25 11:45:07,959][0m Trial 2 finished with value: 0.0 and parameters: {'ccp_alpha': 0.09048955722700391, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.02123978265650417.[0m
[32m[I 2022-10-25 11:45:08,223][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.9145880356073012, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.02123978265650417.[0m
[32m[I 2022-10-25 11:45:08,443][0m Tr

[32m[I 2022-10-25 11:45:10,779][0m Trial 39 finished with value: 0.0 and parameters: {'ccp_alpha': 0.2115121136709656, 'criterion': 'absolute_error'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:11,024][0m Trial 40 finished with value: -0.039764606259209315 and parameters: {'ccp_alpha': 0.0002916864458032126, 'criterion': 'absolute_error'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:11,261][0m Trial 41 finished with value: 0.0 and parameters: {'ccp_alpha': 0.06044724060549159, 'criterion': 'absolute_error'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:11,488][0m Trial 42 finished with value: 0.0 and parameters: {'ccp_alpha': 0.12323623091223662, 'criterion': 'absolute_error'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:11,526][0m Trial 43 finished with value: 0.0 and parameters: {'ccp_alpha': 0.054817483174498784, 'criterion': 'friedman_mse

[32m[I 2022-10-25 11:45:13,762][0m Trial 78 finished with value: -0.05773549265992352 and parameters: {'ccp_alpha': 0.0008314644256306172, 'criterion': 'absolute_error'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:13,810][0m Trial 79 finished with value: 0.0 and parameters: {'ccp_alpha': 0.983442190364686, 'criterion': 'friedman_mse'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:13,849][0m Trial 80 finished with value: 0.0 and parameters: {'ccp_alpha': 0.18092852392300826, 'criterion': 'friedman_mse'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:13,886][0m Trial 81 finished with value: 0.0 and parameters: {'ccp_alpha': 0.025840414100634174, 'criterion': 'friedman_mse'}. Best is trial 22 with value: 0.25859132659901113.[0m
[32m[I 2022-10-25 11:45:13,922][0m Trial 82 finished with value: 0.0 and parameters: {'ccp_alpha': 0.058638560121043806, 'criterion': 'friedman_mse'}. Bes

In [35]:
#Optimisation function for Randome Forest regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_rf,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:45:14,859][0m A new study created in memory with name: no-name-8e716f24-98ee-46ca-9c7c-a74012ebf847[0m
[32m[I 2022-10-25 11:45:16,174][0m Trial 0 finished with value: 0.1031968031968032 and parameters: {'n_estimators': 216, 'min_samples_leaf': 10, 'max_samples': 0.33144018801876385, 'max_features': 0.1003886304149516}. Best is trial 0 with value: 0.1031968031968032.[0m
[32m[I 2022-10-25 11:45:18,422][0m Trial 1 finished with value: 0.2804195804195805 and parameters: {'n_estimators': 297, 'min_samples_leaf': 7, 'max_samples': 0.912070044961912, 'max_features': 0.5177955654932214}. Best is trial 1 with value: 0.2804195804195805.[0m
[32m[I 2022-10-25 11:45:20,911][0m Trial 2 finished with value: 0.06923076923076923 and parameters: {'n_estimators': 432, 'min_samples_leaf': 16, 'max_samples': 0.6836027308968906, 'max_features': 0.18720391603527192}. Best is trial 1 with value: 0.2804195804195805.[0m
[32m[I 2022-10-25 11:45:24,722][0m Trial 3 finished with 

[32m[I 2022-10-25 11:46:57,644][0m Trial 30 finished with value: 0.19585414585414587 and parameters: {'n_estimators': 404, 'min_samples_leaf': 9, 'max_samples': 0.4684398608005898, 'max_features': 0.6172440971730457}. Best is trial 25 with value: 0.2839160839160839.[0m
[32m[I 2022-10-25 11:47:02,680][0m Trial 31 finished with value: 0.26843156843156846 and parameters: {'n_estimators': 504, 'min_samples_leaf': 7, 'max_samples': 0.7865984678366303, 'max_features': 0.8613283301142425}. Best is trial 25 with value: 0.2839160839160839.[0m
[32m[I 2022-10-25 11:47:06,261][0m Trial 32 finished with value: 0.25534465534465534 and parameters: {'n_estimators': 438, 'min_samples_leaf': 5, 'max_samples': 0.7330854957141855, 'max_features': 0.5606661934343393}. Best is trial 25 with value: 0.2839160839160839.[0m
[32m[I 2022-10-25 11:47:15,365][0m Trial 33 finished with value: 0.22667332667332668 and parameters: {'n_estimators': 687, 'min_samples_leaf': 1, 'max_samples': 0.690911218389926,

[32m[I 2022-10-25 11:48:10,822][0m Trial 61 finished with value: 0.06343656343656344 and parameters: {'n_estimators': 240, 'min_samples_leaf': 1, 'max_samples': 0.06279343704177726, 'max_features': 0.45196282688989287}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:48:13,875][0m Trial 62 finished with value: 0.266983016983017 and parameters: {'n_estimators': 376, 'min_samples_leaf': 4, 'max_samples': 0.8113011516321824, 'max_features': 0.614910626007676}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:48:15,736][0m Trial 63 finished with value: 0.051698301698301696 and parameters: {'n_estimators': 313, 'min_samples_leaf': 6, 'max_samples': 0.15227590426716262, 'max_features': 0.5294017836412597}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:48:16,876][0m Trial 64 finished with value: 0.1586913086913087 and parameters: {'n_estimators': 194, 'min_samples_leaf': 8, 'max_samples': 0.2308650175864

[32m[I 2022-10-25 11:49:25,469][0m Trial 91 finished with value: 0.2692807192807193 and parameters: {'n_estimators': 327, 'min_samples_leaf': 5, 'max_samples': 0.3362780277711328, 'max_features': 0.7398420951360443}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:49:27,112][0m Trial 92 finished with value: 0.25914085914085916 and parameters: {'n_estimators': 252, 'min_samples_leaf': 4, 'max_samples': 0.25749281388072714, 'max_features': 0.6822677067013707}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:49:28,834][0m Trial 93 finished with value: 0.2436063936063936 and parameters: {'n_estimators': 268, 'min_samples_leaf': 5, 'max_samples': 0.2863545821345784, 'max_features': 0.5591483136171095}. Best is trial 52 with value: 0.32742257742257747.[0m
[32m[I 2022-10-25 11:49:30,737][0m Trial 94 finished with value: 0.2754745254745255 and parameters: {'n_estimators': 241, 'min_samples_leaf': 7, 'max_samples': 0.709849549545254

In [36]:
#Optimisation function for xgboost regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_xgb,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:49:49,895][0m A new study created in memory with name: no-name-9e25dff8-ecbc-4342-88e6-18685923a001[0m
[32m[I 2022-10-25 11:49:51,132][0m Trial 0 finished with value: 0.043806193806193786 and parameters: {'eta': 0.09579391709544341, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.9399796824166398, 'lambda': 0.8614703546713748}. Best is trial 0 with value: 0.043806193806193786.[0m
[32m[I 2022-10-25 11:49:52,365][0m Trial 1 finished with value: 0.1794705294705295 and parameters: {'eta': 0.04066346849115217, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.9358828583776584, 'lambda': 0.9760292488640142}. Best is trial 1 with value: 0.1794705294705295.[0m
[32m[I 2022-10-25 11:49:53,647][0m Trial 2 finished with value: 0.10179820179820183 and parameters: {'eta': 0.14082193075171454, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8668243551111631, 'lambda': 0.8741638100934089}. Best is trial 1 with value: 0.1794705294705295.[0m
[32m[I 2022

[32m[I 2022-10-25 11:50:28,099][0m Trial 29 finished with value: 0.14575424575424573 and parameters: {'eta': 0.0985963326916175, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.7592262076954719, 'lambda': 0.8492205818030232}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:50:29,395][0m Trial 30 finished with value: 0.17767232767232768 and parameters: {'eta': 0.050617311099066094, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.6665427287798131, 'lambda': 0.8196900645082844}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:50:30,640][0m Trial 31 finished with value: 0.19215784215784218 and parameters: {'eta': 0.0757496606044212, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.6275509078129097, 'lambda': 0.9312026139395156}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:50:31,946][0m Trial 32 finished with value: 0.1831668331668332 and parameters: {'eta': 0.05739486056933168, 'max_d

[32m[I 2022-10-25 11:51:04,166][0m Trial 58 finished with value: 0.15644355644355645 and parameters: {'eta': 0.061103636614477416, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.7107518843784982, 'lambda': 0.9647538066407806}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:05,299][0m Trial 59 finished with value: 0.16693306693306695 and parameters: {'eta': 0.03564151492672407, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.611867569671972, 'lambda': 0.9317231783075485}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:06,495][0m Trial 60 finished with value: 0.22207792207792204 and parameters: {'eta': 0.070091104332033, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.6568799439331039, 'lambda': 0.8186851670571318}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:07,813][0m Trial 61 finished with value: 0.15084915084915082 and parameters: {'eta': 0.1960838595584511, 'max_d

[32m[I 2022-10-25 11:51:40,674][0m Trial 87 finished with value: 0.06568431568431568 and parameters: {'eta': 0.06916806717818666, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.6818500297148028, 'lambda': 0.9430319874210649}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:42,017][0m Trial 88 finished with value: 0.21993006993007 and parameters: {'eta': 0.07895453620508788, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.6987443872539401, 'lambda': 0.9996996364853928}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:43,245][0m Trial 89 finished with value: 0.21523476523476526 and parameters: {'eta': 0.02780339415091186, 'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.6670795516344981, 'lambda': 0.9694302367426209}. Best is trial 27 with value: 0.2753246753246753.[0m
[32m[I 2022-10-25 11:51:44,458][0m Trial 90 finished with value: 0.1563936063936064 and parameters: {'eta': 0.037335832893448544, 'max_

In [37]:
#Optimisation function for lightgbm regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lgbm,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:51:55,838][0m A new study created in memory with name: no-name-c9a5c8d0-cc2d-4281-9c0e-4aa1b34e8ef6[0m
[32m[I 2022-10-25 11:51:55,963][0m Trial 0 finished with value: 0.1504995004995005 and parameters: {'boosting_type': 'goss', 'num_leaves': 13, 'max_depth': 9, 'learning_rate': 0.09703050333054523, 'n_estimators': 147, 'min_child_weight': 0.0036788445445081817}. Best is trial 0 with value: 0.1504995004995005.[0m
[32m[I 2022-10-25 11:51:56,188][0m Trial 1 finished with value: 0.1654845154845155 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 11, 'max_depth': 37, 'learning_rate': 0.08406862924425804, 'n_estimators': 124, 'min_child_weight': 0.0036658729158120422}. Best is trial 1 with value: 0.1654845154845155.[0m
[32m[I 2022-10-25 11:51:56,337][0m Trial 2 finished with value: 0.11492164591880902 and parameters: {'boosting_type': 'goss', 'num_leaves': 3, 'max_depth': 0, 'learning_rate': 0.060666526156697397, 'n_estimators': 102, 'min_child_weight':

[32m[I 2022-10-25 11:52:04,999][0m Trial 26 finished with value: 0.1304662497870424 and parameters: {'boosting_type': 'dart', 'num_leaves': 30, 'max_depth': 22, 'learning_rate': 0.0056357038683335706, 'n_estimators': 112, 'min_child_weight': 0.004993356757056326}. Best is trial 23 with value: 0.21583416583416587.[0m
[32m[I 2022-10-25 11:52:05,343][0m Trial 27 finished with value: 0.19725274725274727 and parameters: {'boosting_type': 'dart', 'num_leaves': 20, 'max_depth': 6, 'learning_rate': 0.06261595725798065, 'n_estimators': 138, 'min_child_weight': 0.0010263610440138242}. Best is trial 23 with value: 0.21583416583416587.[0m
[32m[I 2022-10-25 11:52:05,807][0m Trial 28 finished with value: 0.20499500499500503 and parameters: {'boosting_type': 'dart', 'num_leaves': 38, 'max_depth': 28, 'learning_rate': 0.03693943488398373, 'n_estimators': 183, 'min_child_weight': 0.003338986522730432}. Best is trial 23 with value: 0.21583416583416587.[0m
[32m[I 2022-10-25 11:52:06,136][0m Tr

[32m[I 2022-10-25 11:52:13,111][0m Trial 52 finished with value: 0.20299700299700302 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 24, 'max_depth': 4, 'learning_rate': 0.021264493905788565, 'n_estimators': 161, 'min_child_weight': 0.0046723916025780245}. Best is trial 39 with value: 0.2272227772227772.[0m
[32m[I 2022-10-25 11:52:13,362][0m Trial 53 finished with value: 0.10408587183547939 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 26, 'max_depth': 1, 'learning_rate': 0.01432778469843061, 'n_estimators': 168, 'min_child_weight': 0.004819301403112197}. Best is trial 39 with value: 0.2272227772227772.[0m
[32m[I 2022-10-25 11:52:13,660][0m Trial 54 finished with value: 0.1618881118881119 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 33, 'max_depth': 10, 'learning_rate': 0.005934139705167655, 'n_estimators': 144, 'min_child_weight': 0.004442793429780063}. Best is trial 39 with value: 0.2272227772227772.[0m
[32m[I 2022-10-25 11:52:13,972][0m Trial 

[32m[I 2022-10-25 11:52:21,081][0m Trial 78 finished with value: 0.2263236763236763 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 30, 'max_depth': 15, 'learning_rate': 0.027133529118997475, 'n_estimators': 104, 'min_child_weight': 0.0044085645654395435}. Best is trial 72 with value: 0.23206793206793205.[0m
[32m[I 2022-10-25 11:52:21,305][0m Trial 79 finished with value: 0.23476523476523475 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 32, 'max_depth': 16, 'learning_rate': 0.03375881226162691, 'n_estimators': 86, 'min_child_weight': 0.004383383181403568}. Best is trial 79 with value: 0.23476523476523475.[0m
[32m[I 2022-10-25 11:52:21,499][0m Trial 80 finished with value: 0.23156843156843157 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 32, 'max_depth': 16, 'learning_rate': 0.034020107624259256, 'n_estimators': 64, 'min_child_weight': 0.004199635674548596}. Best is trial 79 with value: 0.23476523476523475.[0m
[32m[I 2022-10-25 11:52:21,705][0m Tri

In [38]:
#Optimisation function for support vector machine  regression with concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_svr,
                x = concat_complementary_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_concat_gRNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:52:25,691][0m A new study created in memory with name: no-name-98ccd585-b3b4-48d2-a9ce-35ad5ea85851[0m
[32m[I 2022-10-25 11:52:25,711][0m Trial 0 finished with value: -0.0056581196581196565 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 4.791170358078583, 'epsilon': 1.0533613374128876}. Best is trial 0 with value: -0.0056581196581196565.[0m
[32m[I 2022-10-25 11:52:25,729][0m Trial 1 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 1, 'C': 3.250522890327928, 'epsilon': 1.9598343637613114}. Best is trial 1 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:25,747][0m Trial 2 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 7, 'C': 0.2208932175173406, 'epsilon': 2.7563281187382382}. Best is trial 1 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:25,766][0m Trial 3 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 3, 'C': 3.6385799828537406, 'epsilon': 2.9444685687167182}. Best is trial 1 wi

[32m[I 2022-10-25 11:52:26,477][0m Trial 34 finished with value: 0.022666666666666665 and parameters: {'kernel': 'rbf', 'degree': 8, 'C': 3.5114581241315075, 'epsilon': 0.23131005873329966}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:26,497][0m Trial 35 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 6, 'C': 2.5905293203051625, 'epsilon': 3.2368375409485544}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:26,518][0m Trial 36 finished with value: 0.14964102564102563 and parameters: {'kernel': 'poly', 'degree': 7, 'C': 2.9431900726494407, 'epsilon': 0.6582438718624712}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:26,539][0m Trial 37 finished with value: 0.014222222222222225 and parameters: {'kernel': 'rbf', 'degree': 6, 'C': 3.422551801815932, 'epsilon': 0.2917600763523026}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:26,5

[32m[I 2022-10-25 11:52:27,346][0m Trial 68 finished with value: 0.01834188034188034 and parameters: {'kernel': 'rbf', 'degree': 5, 'C': 3.578729699190683, 'epsilon': 0.5144751317957715}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:27,365][0m Trial 69 finished with value: 0.14923076923076922 and parameters: {'kernel': 'sigmoid', 'degree': 5, 'C': 0.4461125571874023, 'epsilon': 0.21202169454582162}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:27,383][0m Trial 70 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'degree': 5, 'C': 2.6342674559470867, 'epsilon': 1.6568517921414538}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:27,405][0m Trial 71 finished with value: 0.1183760683760684 and parameters: {'kernel': 'rbf', 'degree': 7, 'C': 2.844244111766819, 'epsilon': 0.08820827435851736}. Best is trial 10 with value: 0.24805128205128205.[0m
[32m[I 2022-10-25 11:52:27,428]

In [39]:
#Add context dependency of 4 bp either side of the target DNA. This represents the PAM site
#Make a array of zeros to add to either side of the gRNA one hot encoded sequence. This signifies the area around
#the target DNA where the gRNA should (hopefully!)not bind.

zeros = np.zeros((4,4))
pam_included_guide_RNA_one_hot_encoded_sequence = []
pam_included_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    pam_included_guide_RNA_one_hot_encoded_sequence = np.vstack((zeros,one_hot_encode_RNA(i)))
    pam_included_guide_RNA_one_hot_encoded_sequences.append(pam_included_guide_RNA_one_hot_encoded_sequence)

#one hot encode full 60 bp target DNA sequence so that they line up as complement to the gRNA sequence 
#with context included

pam_included_target_DNA_sequence = []
pam_included_target_DNA_sequences = []
for i in target_DNA_sequences:
    pam_included_target_DNA_sequence = i[43:19:-1] #This generates the complement to the guide
    pam_included_target_DNA_sequences.append(pam_included_target_DNA_sequence)

pam_included_target_DNA_one_hot_encoded_sequences = []
for i in pam_included_target_DNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_DNA(i)
    pam_included_target_DNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)

    
# Concatenate the one hot encodes sequences
concat_pam_included_guide_RNA_target_DNA = []
concat_pam_included_guide_RNAs_target_DNAs = []
for i in range(len(pam_included_target_DNA_one_hot_encoded_sequences)):
    concat_pam_included_guide_RNA_target_DNA = np.hstack((pam_included_guide_RNA_one_hot_encoded_sequences[i], pam_included_target_DNA_one_hot_encoded_sequences[i])).ravel()
    concat_pam_included_guide_RNAs_target_DNAs.append(concat_pam_included_guide_RNA_target_DNA)

# #Add feature names. Note guide RNA 1-4 denote non pairing as no binding should happen there
feature_names_pam_included_guide_RNA_target_DNA = []
for i in range(24):
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA A' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA U' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA C' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA G' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA A' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA T' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA C' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA G' + str(i+1))

In [40]:
#Add context dependency of 4 bp at the start of the target DNA. This represents the PAM site
#Make a array of zeros to add to either side of the gRNA one hot encoded sequence. This signifies the area around
#the target DNA where the gRNA should (hopefully!)not bind.

zeros = np.zeros((4,4))
pam_included_guide_RNA_one_hot_encoded_sequence = []
pam_included_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    pam_included_guide_RNA_one_hot_encoded_sequence = np.vstack((zeros,one_hot_encode_RNA(i)))
    pam_included_guide_RNA_one_hot_encoded_sequences.append(pam_included_guide_RNA_one_hot_encoded_sequence)

#one hot encode pam included target DNA sequence so that they line up as complement to the gRNA sequence 
#with context included

pam_included_target_DNA_sequence = []
pam_included_target_DNA_sequences = []
for i in target_DNA_sequences:
    pam_included_target_DNA_sequence = i[43:19:-1] #This generates the complement to the guide
    pam_included_target_DNA_sequences.append(pam_included_target_DNA_sequence)

pam_included_target_DNA_one_hot_encoded_sequences = []
for i in pam_included_target_DNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_DNA(i)
    pam_included_target_DNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)

    
# Concatenate the one hot encodes sequences
concat_pam_included_guide_RNA_target_DNA = []
concat_pam_included_guide_RNAs_target_DNAs = []
for i in range(len(pam_included_target_DNA_one_hot_encoded_sequences)):
    concat_pam_included_guide_RNA_target_DNA = np.hstack((pam_included_guide_RNA_one_hot_encoded_sequences[i], pam_included_target_DNA_one_hot_encoded_sequences[i])).ravel()
    concat_pam_included_guide_RNAs_target_DNAs.append(concat_pam_included_guide_RNA_target_DNA)

# #Add feature names. Note guide RNA 1-4 denote non pairing as no binding should happen there
feature_names_pam_included_guide_RNA_target_DNA = []
for i in range(24):
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA A' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA U' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA C' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('guide RNA G' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA A' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA T' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA C' + str(i+1))
    feature_names_pam_included_guide_RNA_target_DNA.append('target DNA G' + str(i+1))

In [41]:
#Linear regression with PAM included concat guide RNA target DNA one hot encoding
model = LinearRegression()
score = cross_validate(model, concat_pam_included_guide_RNAs_target_DNAs, k_values, cv = 10, scoring = spearman_rank_scorer)
linear_regression_concat_pam_included_guide_RNA_target_DNA = np.mean(score['test_score'])


In [42]:
#Optimisation study with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ridge,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:52:29,273][0m A new study created in memory with name: no-name-837699ca-acc4-45d2-920c-ab889d848b4e[0m
[32m[I 2022-10-25 11:52:29,298][0m Trial 0 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.44631863834550334, 'solver': 'auto'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:29,324][0m Trial 1 finished with value: 0.10754245754245755 and parameters: {'alpha': 0.9455256369421677, 'solver': 'auto'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:29,548][0m Trial 2 finished with value: 0.10589410589410589 and parameters: {'alpha': 0.855071761746358, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:29,652][0m Trial 3 finished with value: 0.11223776223776223 and parameters: {'alpha': 0.5974598865390783, 'solver': 'saga'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:29,686][0m Trial 4 finished with

[32m[I 2022-10-25 11:52:32,496][0m Trial 40 finished with value: 0.11523476523476524 and parameters: {'alpha': 0.6246695428049215, 'solver': 'svd'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:32,752][0m Trial 41 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4672533312838451, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:32,957][0m Trial 42 finished with value: 0.11523476523476524 and parameters: {'alpha': 0.5784085487784753, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:33,157][0m Trial 43 finished with value: 0.11103896103896105 and parameters: {'alpha': 0.33293746908364763, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:33,352][0m Trial 44 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4544337401776784, 'solver': 'sag'}. Best is trial 0 with value: 0.11

[32m[I 2022-10-25 11:52:37,367][0m Trial 80 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4440531787656232, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:37,571][0m Trial 81 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4713960653973127, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:37,756][0m Trial 82 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4474949018292047, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:37,930][0m Trial 83 finished with value: 0.11663336663336663 and parameters: {'alpha': 0.4678650501689319, 'solver': 'sag'}. Best is trial 0 with value: 0.11663336663336663.[0m
[32m[I 2022-10-25 11:52:38,123][0m Trial 84 finished with value: 0.11103896103896105 and parameters: {'alpha': 0.3648114523077143, 'solver': 'sag'}. Best is trial 0 with value: 0.116

In [43]:
#Optimisation study with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lasso,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:52:39,470][0m A new study created in memory with name: no-name-dfc7e4b8-4d27-403b-b3e3-ca37fc02fbe3[0m
[32m[I 2022-10-25 11:52:39,496][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.37691693378390057}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:39,518][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.6569296273950692}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:39,541][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.02950735291042683}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:39,564][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.863978162641686}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:39,589][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.8590488362505805}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:39,613][0m Trial 5 finished with value: 0.0 and parameters: {'alpha': 0.5779

[32m[I 2022-10-25 11:52:41,058][0m Trial 50 finished with value: 0.0 and parameters: {'alpha': 0.10580026854272101}. Best is trial 37 with value: 0.14602668495690138.[0m
[32m[I 2022-10-25 11:52:41,106][0m Trial 51 finished with value: 0.09585414585414584 and parameters: {'alpha': 0.0010016017549707452}. Best is trial 37 with value: 0.14602668495690138.[0m
[32m[I 2022-10-25 11:52:41,136][0m Trial 52 finished with value: 0.0 and parameters: {'alpha': 0.04873582468147419}. Best is trial 37 with value: 0.14602668495690138.[0m
[32m[I 2022-10-25 11:52:41,180][0m Trial 53 finished with value: 0.09150849150849152 and parameters: {'alpha': 0.0006254416457249137}. Best is trial 37 with value: 0.14602668495690138.[0m
[32m[I 2022-10-25 11:52:41,210][0m Trial 54 finished with value: 0.0 and parameters: {'alpha': 0.23562057702703645}. Best is trial 37 with value: 0.14602668495690138.[0m
[32m[I 2022-10-25 11:52:41,239][0m Trial 55 finished with value: 0.0 and parameters: {'alpha': 0.

[32m[I 2022-10-25 11:52:42,528][0m Trial 96 finished with value: 0.0 and parameters: {'alpha': 0.04272098289841841}. Best is trial 72 with value: 0.18161838161838162.[0m
[32m[I 2022-10-25 11:52:42,557][0m Trial 97 finished with value: 0.10685593588529578 and parameters: {'alpha': 0.016269527795019747}. Best is trial 72 with value: 0.18161838161838162.[0m
[32m[I 2022-10-25 11:52:42,585][0m Trial 98 finished with value: 0.0 and parameters: {'alpha': 0.732189689083248}. Best is trial 72 with value: 0.18161838161838162.[0m
[32m[I 2022-10-25 11:52:42,615][0m Trial 99 finished with value: 0.0 and parameters: {'alpha': 0.12407971249736588}. Best is trial 72 with value: 0.18161838161838162.[0m


In [44]:
#Optimisation study for Elastic Net with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ElasticNet,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:52:42,620][0m A new study created in memory with name: no-name-5f8953d9-9997-4831-9893-f76f5f3e3676[0m
[32m[I 2022-10-25 11:52:42,647][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.966952224015185, 'l1_ratio': 0.2764815189498053}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 11:52:42,676][0m Trial 1 finished with value: 0.11832043540617221 and parameters: {'alpha': 0.47875707142877677, 'l1_ratio': 0.0278907494768752}. Best is trial 1 with value: 0.11832043540617221.[0m
[32m[I 2022-10-25 11:52:42,710][0m Trial 2 finished with value: 0.18151848151848152 and parameters: {'alpha': 0.029908769447436145, 'l1_ratio': 0.17787854533336755}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:42,743][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.7271139188924192, 'l1_ratio': 0.8857448867290768}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:42,773][0m Trial 4 f

[32m[I 2022-10-25 11:52:43,951][0m Trial 39 finished with value: 0.0 and parameters: {'alpha': 0.6032325142830356, 'l1_ratio': 0.24586264601575092}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:43,984][0m Trial 40 finished with value: 0.0 and parameters: {'alpha': 0.6577787510611934, 'l1_ratio': 0.7944185388971434}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:44,020][0m Trial 41 finished with value: 0.14950049950049954 and parameters: {'alpha': 0.563783440081201, 'l1_ratio': 0.005670938559066113}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:44,054][0m Trial 42 finished with value: 0.0 and parameters: {'alpha': 0.6938212767567868, 'l1_ratio': 0.058914491790708234}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:44,090][0m Trial 43 finished with value: 0.0 and parameters: {'alpha': 0.5813510810163878, 'l1_ratio': 0.10151597070351774}. Best is trial 2 w

[32m[I 2022-10-25 11:52:45,262][0m Trial 78 finished with value: 0.13986013986013987 and parameters: {'alpha': 0.9444052897051423, 'l1_ratio': 0.0045275174611524565}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:45,294][0m Trial 79 finished with value: 0.0 and parameters: {'alpha': 0.7143637345116591, 'l1_ratio': 0.0847957909619868}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:45,327][0m Trial 80 finished with value: 0.0 and parameters: {'alpha': 0.8496462479887529, 'l1_ratio': 0.15145179472270484}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:45,360][0m Trial 81 finished with value: 0.04192522288506027 and parameters: {'alpha': 0.6912032790586197, 'l1_ratio': 0.03341460192554265}. Best is trial 2 with value: 0.18151848151848152.[0m
[32m[I 2022-10-25 11:52:45,392][0m Trial 82 finished with value: 0.0 and parameters: {'alpha': 0.6412208733228997, 'l1_ratio': 0.059074751122992444}.

In [80]:
#Optimisation study for decision tree regressor with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_dt,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 13:58:15,478][0m A new study created in memory with name: no-name-84ef8f54-5867-4bb5-960a-c7486bead64a[0m
[32m[I 2022-10-25 13:58:15,526][0m Trial 0 finished with value: 0.0 and parameters: {'ccp_alpha': 0.2764908888773483, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:15,569][0m Trial 1 finished with value: 0.0 and parameters: {'ccp_alpha': 0.957172676273137, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:15,617][0m Trial 2 finished with value: 0.14447740063020936 and parameters: {'ccp_alpha': 0.00012089146367899595, 'criterion': 'squared_error'}. Best is trial 2 with value: 0.14447740063020936.[0m
[32m[I 2022-10-25 13:58:15,882][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.7069304863734386, 'criterion': 'absolute_error'}. Best is trial 2 with value: 0.14447740063020936.[0m
[32m[I 2022-10-25 13:58:15,924][0m Trial 4 finished with value: 0.0 

[32m[I 2022-10-25 13:58:18,679][0m Trial 39 finished with value: 0.0 and parameters: {'ccp_alpha': 0.12391276100693535, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.15140217929494676.[0m
[32m[I 2022-10-25 13:58:18,725][0m Trial 40 finished with value: 0.0 and parameters: {'ccp_alpha': 0.46881774790184727, 'criterion': 'friedman_mse'}. Best is trial 24 with value: 0.15140217929494676.[0m
[32m[I 2022-10-25 13:58:18,775][0m Trial 41 finished with value: 0.12249034114815838 and parameters: {'ccp_alpha': 0.0003202456298577567, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.15140217929494676.[0m
[32m[I 2022-10-25 13:58:18,820][0m Trial 42 finished with value: 0.0 and parameters: {'ccp_alpha': 0.05016144427443065, 'criterion': 'squared_error'}. Best is trial 24 with value: 0.15140217929494676.[0m
[32m[I 2022-10-25 13:58:18,865][0m Trial 43 finished with value: 0.10598619996887168 and parameters: {'ccp_alpha': 0.0008467881141176516, 'criterion': 's

[32m[I 2022-10-25 13:58:21,170][0m Trial 78 finished with value: 0.0 and parameters: {'ccp_alpha': 0.07204187999513506, 'criterion': 'squared_error'}. Best is trial 63 with value: 0.15839064881760537.[0m
[32m[I 2022-10-25 13:58:21,223][0m Trial 79 finished with value: 0.0 and parameters: {'ccp_alpha': 0.32196024266891915, 'criterion': 'squared_error'}. Best is trial 63 with value: 0.15839064881760537.[0m
[32m[I 2022-10-25 13:58:21,266][0m Trial 80 finished with value: 0.0 and parameters: {'ccp_alpha': 0.032319834624624906, 'criterion': 'squared_error'}. Best is trial 63 with value: 0.15839064881760537.[0m
[32m[I 2022-10-25 13:58:21,311][0m Trial 81 finished with value: 0.03490164518460297 and parameters: {'ccp_alpha': 0.0024949055742603194, 'criterion': 'squared_error'}. Best is trial 63 with value: 0.15839064881760537.[0m
[32m[I 2022-10-25 13:58:21,360][0m Trial 82 finished with value: 0.09853701050450164 and parameters: {'ccp_alpha': 0.00028401709385975196, 'criterion':

In [46]:
#Optimisation study for Random Forest regressor with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_rf,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:52:51,914][0m A new study created in memory with name: no-name-c6264a40-f1bd-4c74-a322-27702375c1f2[0m
[32m[I 2022-10-25 11:52:56,331][0m Trial 0 finished with value: 0.19075924075924075 and parameters: {'n_estimators': 623, 'min_samples_leaf': 11, 'max_samples': 0.5997980277436762, 'max_features': 0.8937148575264643}. Best is trial 0 with value: 0.19075924075924075.[0m
[32m[I 2022-10-25 11:52:59,796][0m Trial 1 finished with value: 0.26803196803196805 and parameters: {'n_estimators': 445, 'min_samples_leaf': 5, 'max_samples': 0.5307625812793917, 'max_features': 0.7737731194188722}. Best is trial 1 with value: 0.26803196803196805.[0m
[32m[I 2022-10-25 11:53:02,752][0m Trial 2 finished with value: 0.04720279720279721 and parameters: {'n_estimators': 511, 'min_samples_leaf': 13, 'max_samples': 0.39560034824709783, 'max_features': 0.13798183286293664}. Best is trial 1 with value: 0.26803196803196805.[0m
[32m[I 2022-10-25 11:53:07,990][0m Trial 3 finished

[32m[I 2022-10-25 11:54:24,125][0m Trial 30 finished with value: 0.27852147852147857 and parameters: {'n_estimators': 179, 'min_samples_leaf': 3, 'max_samples': 0.2033467303020371, 'max_features': 0.9993456389903914}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:54:26,710][0m Trial 31 finished with value: 0.3095404595404595 and parameters: {'n_estimators': 275, 'min_samples_leaf': 3, 'max_samples': 0.3323349321341256, 'max_features': 0.9156307737456187}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:54:27,736][0m Trial 32 finished with value: 0.31063936063936065 and parameters: {'n_estimators': 101, 'min_samples_leaf': 3, 'max_samples': 0.2891980233220233, 'max_features': 0.837530783322852}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:54:29,095][0m Trial 33 finished with value: 0.2013986013986014 and parameters: {'n_estimators': 177, 'min_samples_leaf': 10, 'max_samples': 0.471734218270692

[32m[I 2022-10-25 11:55:46,829][0m Trial 61 finished with value: 0.31118881118881114 and parameters: {'n_estimators': 266, 'min_samples_leaf': 4, 'max_samples': 0.24639188470150414, 'max_features': 0.4581648028252263}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:55:48,061][0m Trial 62 finished with value: 0.24085914085914087 and parameters: {'n_estimators': 159, 'min_samples_leaf': 3, 'max_samples': 0.2871968133342513, 'max_features': 0.2159887100081893}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:55:49,621][0m Trial 63 finished with value: 0.29370629370629375 and parameters: {'n_estimators': 226, 'min_samples_leaf': 2, 'max_samples': 0.20144114494727697, 'max_features': 0.12112785085287629}. Best is trial 23 with value: 0.33091908091908095.[0m
[32m[I 2022-10-25 11:55:52,055][0m Trial 64 finished with value: 0.3228271728271729 and parameters: {'n_estimators': 306, 'min_samples_leaf': 4, 'max_samples': 0.31055840100

[32m[I 2022-10-25 11:57:02,287][0m Trial 92 finished with value: 0.27382617382617386 and parameters: {'n_estimators': 999, 'min_samples_leaf': 3, 'max_samples': 0.2404924021477321, 'max_features': 0.9351286470901736}. Best is trial 85 with value: 0.34065934065934067.[0m
[32m[I 2022-10-25 11:57:05,240][0m Trial 93 finished with value: 0.35454545454545455 and parameters: {'n_estimators': 404, 'min_samples_leaf': 2, 'max_samples': 0.14703966879761768, 'max_features': 0.8364092907468845}. Best is trial 93 with value: 0.35454545454545455.[0m
[32m[I 2022-10-25 11:57:09,692][0m Trial 94 finished with value: 0.3198301698301699 and parameters: {'n_estimators': 412, 'min_samples_leaf': 2, 'max_samples': 0.1530356397049979, 'max_features': 0.7935370608235184}. Best is trial 93 with value: 0.35454545454545455.[0m
[32m[I 2022-10-25 11:57:13,102][0m Trial 95 finished with value: 0.11953046953046953 and parameters: {'n_estimators': 446, 'min_samples_leaf': 1, 'max_samples': 0.0409519546280

In [70]:
#Optimisation study for xgboost regressor with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_xgb,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:15:27,927][0m A new study created in memory with name: no-name-aceaae62-1361-4cc9-aae5-0bb0cafefe7d[0m
[32m[I 2022-10-25 12:15:29,045][0m Trial 0 finished with value: 0.21328671328671328 and parameters: {'eta': 0.1735737452306573, 'max_depth': 11, 'min_child_weight': 6, 'subsample': 0.6889545897067983, 'lambda': 0.9896815761823257}. Best is trial 0 with value: 0.21328671328671328.[0m
[32m[I 2022-10-25 12:15:30,135][0m Trial 1 finished with value: 0.16313686313686315 and parameters: {'eta': 0.19983098432620447, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.8010305202118799, 'lambda': 0.8521115953182926}. Best is trial 0 with value: 0.21328671328671328.[0m
[32m[I 2022-10-25 12:15:31,253][0m Trial 2 finished with value: 0.13791208791208792 and parameters: {'eta': 0.18169528735757884, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.6620657058028458, 'lambda': 0.9551152841496198}. Best is trial 0 with value: 0.21328671328671328.[0m
[32m[I 2022

[32m[I 2022-10-25 12:16:04,620][0m Trial 29 finished with value: 0.09610389610389611 and parameters: {'eta': 0.1718161428224117, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8225854718641811, 'lambda': 0.9927163443800849}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:05,716][0m Trial 30 finished with value: 0.15584415584415584 and parameters: {'eta': 0.1853610021218915, 'max_depth': 11, 'min_child_weight': 6, 'subsample': 0.6787493819347737, 'lambda': 0.9319916913505617}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:06,799][0m Trial 31 finished with value: 0.24260739260739267 and parameters: {'eta': 0.08703810439301365, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.7961072697893229, 'lambda': 0.8649811383265478}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:07,907][0m Trial 32 finished with value: 0.22027972027972026 and parameters: {'eta': 0.08769856734967144, 'max_

[32m[I 2022-10-25 12:16:39,340][0m Trial 58 finished with value: 0.15344655344655342 and parameters: {'eta': 0.0015018020836191134, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.8109208033560895, 'lambda': 0.8696998520105249}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:40,458][0m Trial 59 finished with value: 0.2093906093906094 and parameters: {'eta': 0.05248222986509968, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.6925748478809187, 'lambda': 0.8437636739620128}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:41,858][0m Trial 60 finished with value: 0.1346153846153846 and parameters: {'eta': 0.1426334392497608, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8804797652149527, 'lambda': 0.8194665338254592}. Best is trial 26 with value: 0.2843156843156843.[0m
[32m[I 2022-10-25 12:16:43,150][0m Trial 61 finished with value: 0.18841158841158842 and parameters: {'eta': 0.07051602144542236, 'max

[32m[I 2022-10-25 12:17:16,234][0m Trial 87 finished with value: 0.2586913086913087 and parameters: {'eta': 0.04111842509639319, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7736514929812269, 'lambda': 0.8194871286813247}. Best is trial 85 with value: 0.2917082917082917.[0m
[32m[I 2022-10-25 12:17:17,568][0m Trial 88 finished with value: 0.21598401598401598 and parameters: {'eta': 0.04042950446811656, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7179827455587362, 'lambda': 0.8265476869981451}. Best is trial 85 with value: 0.2917082917082917.[0m
[32m[I 2022-10-25 12:17:18,871][0m Trial 89 finished with value: 0.23961038961038955 and parameters: {'eta': 0.06303671608643245, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7919834658771615, 'lambda': 0.8086416158908672}. Best is trial 85 with value: 0.2917082917082917.[0m
[32m[I 2022-10-25 12:17:20,176][0m Trial 90 finished with value: 0.22622377622377626 and parameters: {'eta': 0.04638935073903756, 'max_

In [48]:
#Optimisation study for lightgbm regressor with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lgbm,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 11:59:43,370][0m A new study created in memory with name: no-name-e8a97848-5c95-478c-99f2-4837df9c87e7[0m
[32m[I 2022-10-25 11:59:43,554][0m Trial 0 finished with value: 0.17897102897102896 and parameters: {'boosting_type': 'dart', 'num_leaves': 5, 'max_depth': 27, 'learning_rate': 0.08098914760475524, 'n_estimators': 59, 'min_child_weight': 0.003875214855652225}. Best is trial 0 with value: 0.17897102897102896.[0m
[32m[I 2022-10-25 11:59:43,873][0m Trial 1 finished with value: 0.23476523476523478 and parameters: {'boosting_type': 'dart', 'num_leaves': 18, 'max_depth': 33, 'learning_rate': 0.07542136128668822, 'n_estimators': 116, 'min_child_weight': 0.002432756892503169}. Best is trial 1 with value: 0.23476523476523478.[0m
[32m[I 2022-10-25 11:59:44,067][0m Trial 2 finished with value: 0.21128871128871127 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 50, 'max_depth': 39, 'learning_rate': 0.05846709346444208, 'n_estimators': 57, 'min_child_weight'

[32m[I 2022-10-25 11:59:53,080][0m Trial 26 finished with value: 0.14654973036820057 and parameters: {'boosting_type': 'dart', 'num_leaves': 23, 'max_depth': 4, 'learning_rate': 0.0014439328437596127, 'n_estimators': 172, 'min_child_weight': 0.0036727497111805115}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 11:59:53,299][0m Trial 27 finished with value: 0.1838161838161838 and parameters: {'boosting_type': 'goss', 'num_leaves': 12, 'max_depth': 10, 'learning_rate': 0.0660281456851942, 'n_estimators': 199, 'min_child_weight': 0.0030961716944348677}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 11:59:53,691][0m Trial 28 finished with value: 0.15214785214785215 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 27, 'max_depth': 3, 'learning_rate': 0.08027914058258967, 'n_estimators': 184, 'min_child_weight': 0.003974706705617523}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 11:59:54,175][0m Tri

[32m[I 2022-10-25 12:00:02,390][0m Trial 52 finished with value: 0.12968281969668632 and parameters: {'boosting_type': 'dart', 'num_leaves': 32, 'max_depth': 1, 'learning_rate': 0.07092414721701046, 'n_estimators': 148, 'min_child_weight': 0.004070944854474688}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 12:00:03,008][0m Trial 53 finished with value: 0.23426573426573433 and parameters: {'boosting_type': 'dart', 'num_leaves': 36, 'max_depth': 34, 'learning_rate': 0.06437552810425243, 'n_estimators': 162, 'min_child_weight': 0.004311714414348788}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 12:00:03,556][0m Trial 54 finished with value: 0.22612387612387613 and parameters: {'boosting_type': 'dart', 'num_leaves': 39, 'max_depth': -1, 'learning_rate': 0.038104671429638776, 'n_estimators': 144, 'min_child_weight': 0.004521610973131898}. Best is trial 21 with value: 0.25144855144855144.[0m
[32m[I 2022-10-25 12:00:04,063][0m Tri

[32m[I 2022-10-25 12:00:14,083][0m Trial 78 finished with value: 0.23641358641358642 and parameters: {'boosting_type': 'dart', 'num_leaves': 11, 'max_depth': 26, 'learning_rate': 0.06646766140828639, 'n_estimators': 138, 'min_child_weight': 0.004726625051289327}. Best is trial 66 with value: 0.2607892107892108.[0m
[32m[I 2022-10-25 12:00:14,505][0m Trial 79 finished with value: 0.2315184815184815 and parameters: {'boosting_type': 'dart', 'num_leaves': 13, 'max_depth': 28, 'learning_rate': 0.07486871320877943, 'n_estimators': 132, 'min_child_weight': 0.004986631028797345}. Best is trial 66 with value: 0.2607892107892108.[0m
[32m[I 2022-10-25 12:00:14,731][0m Trial 80 finished with value: 0.19355644355644358 and parameters: {'boosting_type': 'goss', 'num_leaves': 8, 'max_depth': 24, 'learning_rate': 0.06942368563784439, 'n_estimators': 143, 'min_child_weight': 0.004864969419813104}. Best is trial 66 with value: 0.2607892107892108.[0m
[32m[I 2022-10-25 12:00:15,178][0m Trial 81

In [49]:
#Optimisation study for support vector machine regressor with PAM included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_svr,
                x = concat_pam_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_concat_pam_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:00:24,010][0m A new study created in memory with name: no-name-75b2b0c2-68c8-4fde-a55f-06a66e827bf9[0m
[32m[I 2022-10-25 12:00:24,030][0m Trial 0 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 6, 'C': 2.696089863611953, 'epsilon': 3.917796108667594}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:00:24,048][0m Trial 1 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 8, 'C': 3.5270686512138965, 'epsilon': 4.495616045941583}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:00:24,066][0m Trial 2 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 1, 'C': 0.30425874676016484, 'epsilon': 2.626044526892679}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:00:24,084][0m Trial 3 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 3, 'C': 2.852786490491373, 'epsilon': 4.818319064535537}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:00

[32m[I 2022-10-25 12:00:24,822][0m Trial 35 finished with value: 0.0883629991886592 and parameters: {'kernel': 'poly', 'degree': 8, 'C': 0.2817901168833904, 'epsilon': 1.429521355195373}. Best is trial 5 with value: 0.2126837606837607.[0m
[32m[I 2022-10-25 12:00:24,848][0m Trial 36 finished with value: 0.20302564102564102 and parameters: {'kernel': 'sigmoid', 'degree': 5, 'C': 1.8631599315911742, 'epsilon': 0.23842385020597617}. Best is trial 5 with value: 0.2126837606837607.[0m
[32m[I 2022-10-25 12:00:24,867][0m Trial 37 finished with value: 0.18155555555555555 and parameters: {'kernel': 'poly', 'degree': 10, 'C': 0.8927200034775481, 'epsilon': 0.5433231753260994}. Best is trial 5 with value: 0.2126837606837607.[0m
[32m[I 2022-10-25 12:00:24,886][0m Trial 38 finished with value: 0.1945811965811966 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 2.355692030477366, 'epsilon': 1.003290365517571}. Best is trial 5 with value: 0.2126837606837607.[0m
[32m[I 2022-10-25 12:00

[32m[I 2022-10-25 12:00:30,459][0m Trial 69 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 4, 'C': 3.3051751151865414, 'epsilon': 1.757860072301471}. Best is trial 42 with value: 0.30735042735042734.[0m
[32m[I 2022-10-25 12:00:30,499][0m Trial 70 finished with value: -0.008273504273504262 and parameters: {'kernel': 'linear', 'degree': 8, 'C': 0.8008110028335836, 'epsilon': 0.29869047985066194}. Best is trial 42 with value: 0.30735042735042734.[0m
[32m[I 2022-10-25 12:00:30,525][0m Trial 71 finished with value: 0.20206837606837605 and parameters: {'kernel': 'rbf', 'degree': 10, 'C': 2.917074404576295, 'epsilon': 0.019499826223665057}. Best is trial 42 with value: 0.30735042735042734.[0m
[32m[I 2022-10-25 12:00:30,544][0m Trial 72 finished with value: 0.0 and parameters: {'kernel': 'rbf', 'degree': 10, 'C': 2.785738360132596, 'epsilon': 4.662385837467691}. Best is trial 42 with value: 0.30735042735042734.[0m
[32m[I 2022-10-25 12:00:30,564][0m Trial 73

In [50]:
#Add context dependency of 20 bp either side of the target DNA
#Make a array of zeros to add to either side of the gRNA one hot encoded sequence. This signifies the area around
#the target DNA where the gRNA should (hopefully!) not bind.
zeros = np.zeros((20,4))
context_included_guide_RNA_one_hot_encoded_sequence = []
context_included_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    context_included_guide_RNA_one_hot_encoded_sequence = np.vstack((zeros,one_hot_encode_RNA(i),zeros))
    context_included_guide_RNA_one_hot_encoded_sequences.append(context_included_guide_RNA_one_hot_encoded_sequence)

#one hot encode full 60 bp target DNA sequence so that they line up as complement to the gRNA sequence with context
# included

len(target_DNA_sequences[0])

full_target_DNA_sequence = []
full_target_DNA_sequences = []
for i in target_DNA_sequences:
    full_target_DNA_sequence = i[::-1] #This generates the complement to the guide
    full_target_DNA_sequences.append(full_target_DNA_sequence)

full_target_DNA_one_hot_encoded_sequences = []
for i in full_target_DNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_DNA(i)
    full_target_DNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)

    
# Concatenate the one hot encodes sequences
concat_context_included_guide_RNA_target_DNA = []
concat_context_included_guide_RNAs_target_DNAs = []
for i in range(len(full_target_DNA_one_hot_encoded_sequences)):
    concat_context_included_guide_RNA_target_DNA = np.hstack((context_included_guide_RNA_one_hot_encoded_sequences[i], full_target_DNA_one_hot_encoded_sequences[i])).ravel()
    concat_context_included_guide_RNAs_target_DNAs.append(concat_context_included_guide_RNA_target_DNA)

#Add feature names. Note guide RNA 1-20 and 41-60 denote non pairing as no binding should happen there
feature_names_context_included_guide_RNA_target_DNA = []
for i in range(60):
    feature_names_context_included_guide_RNA_target_DNA.append('guide RNA A' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('guide RNA U' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('guide RNA C' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('guide RNA G' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('target DNA A' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('target DNA T' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('target DNA C' + str(i+1))
    feature_names_context_included_guide_RNA_target_DNA.append('target DNA G' + str(i+1))


In [51]:
#Linear regression with context included concat guide RNA target DNA one hot encoding
model = LinearRegression()
score = cross_validate(model, concat_pam_included_guide_RNAs_target_DNAs, k_values, cv = 10, scoring = spearman_rank_scorer)
linear_regression_concat_context_included_guide_RNA_target_DNA = np.mean(score['test_score'])



In [52]:
#Optimisation study for ridge regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ridge,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:00:33,883][0m A new study created in memory with name: no-name-91e517d4-6ba5-4cb9-add0-51f3c6d5d996[0m
[32m[I 2022-10-25 12:00:34,620][0m Trial 0 finished with value: 0.08386613386613387 and parameters: {'alpha': 0.46997268597049235, 'solver': 'sag'}. Best is trial 0 with value: 0.08386613386613387.[0m
[32m[I 2022-10-25 12:00:34,647][0m Trial 1 finished with value: 0.08246753246753247 and parameters: {'alpha': 0.5535701697394563, 'solver': 'cholesky'}. Best is trial 0 with value: 0.08386613386613387.[0m
[32m[I 2022-10-25 12:00:35,453][0m Trial 2 finished with value: 0.08816183816183816 and parameters: {'alpha': 0.19782373591476088, 'solver': 'saga'}. Best is trial 2 with value: 0.08816183816183816.[0m
[32m[I 2022-10-25 12:00:36,215][0m Trial 3 finished with value: 0.08386613386613387 and parameters: {'alpha': 0.5852675918686879, 'solver': 'sag'}. Best is trial 2 with value: 0.08816183816183816.[0m
[32m[I 2022-10-25 12:00:36,251][0m Trial 4 finished

[32m[I 2022-10-25 12:00:44,954][0m Trial 40 finished with value: 0.09465534465534467 and parameters: {'alpha': 0.9239369929033059, 'solver': 'auto'}. Best is trial 20 with value: 0.11383616383616382.[0m
[32m[I 2022-10-25 12:00:45,448][0m Trial 41 finished with value: 0.11103896103896102 and parameters: {'alpha': 0.8774257939046536, 'solver': 'saga'}. Best is trial 20 with value: 0.11383616383616382.[0m
[32m[I 2022-10-25 12:00:45,887][0m Trial 42 finished with value: 0.11508491508491507 and parameters: {'alpha': 0.7981016809100298, 'solver': 'saga'}. Best is trial 42 with value: 0.11508491508491507.[0m
[32m[I 2022-10-25 12:00:46,271][0m Trial 43 finished with value: 0.10809190809190808 and parameters: {'alpha': 0.795784891322814, 'solver': 'saga'}. Best is trial 42 with value: 0.11508491508491507.[0m
[32m[I 2022-10-25 12:00:46,636][0m Trial 44 finished with value: 0.11383616383616382 and parameters: {'alpha': 0.930300554560187, 'solver': 'saga'}. Best is trial 42 with valu

[32m[I 2022-10-25 12:00:58,643][0m Trial 80 finished with value: 0.09025974025974026 and parameters: {'alpha': 0.8391466177466171, 'solver': 'sag'}. Best is trial 73 with value: 0.11673326673326671.[0m
[32m[I 2022-10-25 12:00:59,028][0m Trial 81 finished with value: 0.10809190809190808 and parameters: {'alpha': 0.8014629515246174, 'solver': 'saga'}. Best is trial 73 with value: 0.11673326673326671.[0m
[32m[I 2022-10-25 12:00:59,450][0m Trial 82 finished with value: 0.11103896103896102 and parameters: {'alpha': 0.8520790824799542, 'solver': 'saga'}. Best is trial 73 with value: 0.11673326673326671.[0m
[32m[I 2022-10-25 12:00:59,890][0m Trial 83 finished with value: 0.0949050949050949 and parameters: {'alpha': 0.6758047733025723, 'solver': 'saga'}. Best is trial 73 with value: 0.11673326673326671.[0m
[32m[I 2022-10-25 12:01:00,335][0m Trial 84 finished with value: 0.11268731268731266 and parameters: {'alpha': 0.8886066686463278, 'solver': 'saga'}. Best is trial 73 with valu

In [53]:
#Optimisation study for lasso regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lasso,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:01:05,225][0m A new study created in memory with name: no-name-64d28204-c4dc-4a97-8acc-4f6a467df124[0m
[32m[I 2022-10-25 12:01:05,253][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.20128490108130448}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:05,278][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.2786532673842528}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:05,302][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.5523592200950218}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:05,327][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.8220749716004823}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:05,352][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.446432159191837}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:05,377][0m Trial 5 finished with value: 0.0 and parameters: {'alpha': 0.43454

[32m[I 2022-10-25 12:01:06,669][0m Trial 36 finished with value: 0.0 and parameters: {'alpha': 0.10378641264139041}. Best is trial 24 with value: 0.09915084915084915.[0m
[32m[I 2022-10-25 12:01:06,703][0m Trial 37 finished with value: 0.0 and parameters: {'alpha': 0.042021660526082395}. Best is trial 24 with value: 0.09915084915084915.[0m
[32m[I 2022-10-25 12:01:06,739][0m Trial 38 finished with value: 0.0 and parameters: {'alpha': 0.20007271735996948}. Best is trial 24 with value: 0.09915084915084915.[0m
[32m[I 2022-10-25 12:01:06,774][0m Trial 39 finished with value: 0.0 and parameters: {'alpha': 0.3114002850357952}. Best is trial 24 with value: 0.09915084915084915.[0m
[32m[I 2022-10-25 12:01:06,808][0m Trial 40 finished with value: 0.0 and parameters: {'alpha': 0.3912838735901517}. Best is trial 24 with value: 0.09915084915084915.[0m
[32m[I 2022-10-25 12:01:06,840][0m Trial 41 finished with value: 0.0 and parameters: {'alpha': 0.043623750241991416}. Best is trial 24

[32m[I 2022-10-25 12:01:08,365][0m Trial 83 finished with value: 0.13572425992551604 and parameters: {'alpha': 0.02599519858781425}. Best is trial 75 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:08,394][0m Trial 84 finished with value: 0.0 and parameters: {'alpha': 0.06296362192806976}. Best is trial 75 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:08,423][0m Trial 85 finished with value: 0.12309282605458309 and parameters: {'alpha': 0.02979528272160608}. Best is trial 75 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:08,454][0m Trial 86 finished with value: 0.0 and parameters: {'alpha': 0.08486313083706032}. Best is trial 75 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:08,480][0m Trial 87 finished with value: 0.0 and parameters: {'alpha': 0.1468385703187062}. Best is trial 75 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:08,510][0m Trial 88 finished with value: 0.0 and parameters: {'alpha': 0.04951

In [54]:
#Optimisation study for elastic NET regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_ElasticNet,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:01:08,845][0m A new study created in memory with name: no-name-234f0499-db35-48ac-a4cc-be4cdf2a0e55[0m
[32m[I 2022-10-25 12:01:08,876][0m Trial 0 finished with value: 0.0 and parameters: {'alpha': 0.16989926076776352, 'l1_ratio': 0.6384666446067736}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:08,904][0m Trial 1 finished with value: 0.0 and parameters: {'alpha': 0.19189786534123054, 'l1_ratio': 0.71390227733765}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:08,929][0m Trial 2 finished with value: 0.0 and parameters: {'alpha': 0.7804895985884754, 'l1_ratio': 0.2814220311077087}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:08,957][0m Trial 3 finished with value: 0.0 and parameters: {'alpha': 0.5053987577126035, 'l1_ratio': 0.5359145462777676}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:01:08,988][0m Trial 4 finished with value: 0.0 and parameters: {'alpha': 0.3882627721887659, 'l1_ratio': 0

[32m[I 2022-10-25 12:01:10,285][0m Trial 40 finished with value: 0.16578421578421582 and parameters: {'alpha': 0.8919640990363519, 'l1_ratio': 0.0036055858670149122}. Best is trial 32 with value: 0.16953046953046955.[0m
[32m[I 2022-10-25 12:01:10,321][0m Trial 41 finished with value: 0.1276223776223776 and parameters: {'alpha': 0.8757784563721437, 'l1_ratio': 0.006487139849174477}. Best is trial 32 with value: 0.16953046953046955.[0m
[32m[I 2022-10-25 12:01:10,356][0m Trial 42 finished with value: 0.0 and parameters: {'alpha': 0.9521754743413225, 'l1_ratio': 0.1494538571940119}. Best is trial 32 with value: 0.16953046953046955.[0m
[32m[I 2022-10-25 12:01:10,390][0m Trial 43 finished with value: 0.0 and parameters: {'alpha': 0.7763694651158833, 'l1_ratio': 0.07777108838666033}. Best is trial 32 with value: 0.16953046953046955.[0m
[32m[I 2022-10-25 12:01:10,425][0m Trial 44 finished with value: 0.0 and parameters: {'alpha': 0.9009707296740349, 'l1_ratio': 0.1955295984805104

[32m[I 2022-10-25 12:01:11,679][0m Trial 79 finished with value: 0.0 and parameters: {'alpha': 0.9987728921552977, 'l1_ratio': 0.1765753011824056}. Best is trial 61 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:11,717][0m Trial 80 finished with value: 0.15489510489510488 and parameters: {'alpha': 0.5408008165287448, 'l1_ratio': 0.002300531353500794}. Best is trial 61 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:11,751][0m Trial 81 finished with value: 0.14575424575424573 and parameters: {'alpha': 0.9339947476496869, 'l1_ratio': 0.005815211038679548}. Best is trial 61 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:11,786][0m Trial 82 finished with value: 0.0 and parameters: {'alpha': 0.872678272324565, 'l1_ratio': 0.048151826421683884}. Best is trial 61 with value: 0.17605482002169884.[0m
[32m[I 2022-10-25 12:01:11,822][0m Trial 83 finished with value: 0.0 and parameters: {'alpha': 0.970883529918326, 'l1_ratio': 0.08253007590388085

In [81]:
#Optimisation study for decision tree regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_dt,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 13:58:37,463][0m A new study created in memory with name: no-name-ab86895d-1770-45ba-a39a-39a1f09103b7[0m
[32m[I 2022-10-25 13:58:37,533][0m Trial 0 finished with value: 0.0 and parameters: {'ccp_alpha': 0.04789332942106117, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:37,999][0m Trial 1 finished with value: -0.007783344873283043 and parameters: {'ccp_alpha': 0.000964295762532319, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:38,075][0m Trial 2 finished with value: 0.0 and parameters: {'ccp_alpha': 0.12140950392198102, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:38,140][0m Trial 3 finished with value: 0.0 and parameters: {'ccp_alpha': 0.5878854821133842, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 13:58:38,209][0m Trial 4 finished with value: 0.0 and parameters: {'ccp_alpha':

[32m[I 2022-10-25 13:58:46,665][0m Trial 41 finished with value: 0.0 and parameters: {'ccp_alpha': 0.7803441775313215, 'criterion': 'friedman_mse'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:46,731][0m Trial 42 finished with value: 0.0 and parameters: {'ccp_alpha': 0.45279731589709427, 'criterion': 'friedman_mse'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:46,800][0m Trial 43 finished with value: 0.0 and parameters: {'ccp_alpha': 0.6532386613418775, 'criterion': 'friedman_mse'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:46,871][0m Trial 44 finished with value: 0.0 and parameters: {'ccp_alpha': 0.7851774440669393, 'criterion': 'friedman_mse'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:46,945][0m Trial 45 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8259146556764878, 'criterion': 'friedman_mse'}. Best is trial 20 with va

[32m[I 2022-10-25 13:58:57,308][0m Trial 81 finished with value: 0.0 and parameters: {'ccp_alpha': 0.11348361179307313, 'criterion': 'absolute_error'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:57,786][0m Trial 82 finished with value: 0.0 and parameters: {'ccp_alpha': 0.9233720978561611, 'criterion': 'absolute_error'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:58,272][0m Trial 83 finished with value: 0.0 and parameters: {'ccp_alpha': 0.8884654447256519, 'criterion': 'absolute_error'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:58,775][0m Trial 84 finished with value: 0.0 and parameters: {'ccp_alpha': 0.1955479624642133, 'criterion': 'absolute_error'}. Best is trial 20 with value: 0.044035457505525344.[0m
[32m[I 2022-10-25 13:58:59,240][0m Trial 85 finished with value: 0.0 and parameters: {'ccp_alpha': 0.9690245101312142, 'criterion': 'absolute_error'}. Best is trial 

In [56]:
#Optimisation study for random forest regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_rf,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:01:33,477][0m A new study created in memory with name: no-name-8654909f-fcc4-463b-a833-d39cf766b616[0m
[32m[I 2022-10-25 12:01:35,232][0m Trial 0 finished with value: 0.27317682317682324 and parameters: {'n_estimators': 235, 'min_samples_leaf': 4, 'max_samples': 0.21117064545382638, 'max_features': 0.42992316517865187}. Best is trial 0 with value: 0.27317682317682324.[0m
[32m[I 2022-10-25 12:01:37,242][0m Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 304, 'min_samples_leaf': 28, 'max_samples': 0.4699794093006707, 'max_features': 0.9816795099820104}. Best is trial 0 with value: 0.27317682317682324.[0m
[32m[I 2022-10-25 12:01:40,802][0m Trial 2 finished with value: 0.09970029970029971 and parameters: {'n_estimators': 492, 'min_samples_leaf': 16, 'max_samples': 0.7448942633078421, 'max_features': 0.11520028789407633}. Best is trial 0 with value: 0.27317682317682324.[0m
[32m[I 2022-10-25 12:01:46,202][0m Trial 3 finished with value: 0.

[32m[I 2022-10-25 12:03:17,134][0m Trial 30 finished with value: 0.2906593406593407 and parameters: {'n_estimators': 171, 'min_samples_leaf': 4, 'max_samples': 0.3217887458942768, 'max_features': 0.8073181855918539}. Best is trial 20 with value: 0.32342657342657344.[0m
[32m[I 2022-10-25 12:03:18,836][0m Trial 31 finished with value: 0.27732267732267735 and parameters: {'n_estimators': 178, 'min_samples_leaf': 4, 'max_samples': 0.32203934502214726, 'max_features': 0.9080227371615174}. Best is trial 20 with value: 0.32342657342657344.[0m
[32m[I 2022-10-25 12:03:21,548][0m Trial 32 finished with value: 0.21738261738261738 and parameters: {'n_estimators': 325, 'min_samples_leaf': 8, 'max_samples': 0.4584360370158722, 'max_features': 0.786126114675211}. Best is trial 20 with value: 0.32342657342657344.[0m
[32m[I 2022-10-25 12:03:23,901][0m Trial 33 finished with value: 0.274975024975025 and parameters: {'n_estimators': 272, 'min_samples_leaf': 6, 'max_samples': 0.424142865227112,

[32m[I 2022-10-25 12:05:06,071][0m Trial 61 finished with value: 0.2 and parameters: {'n_estimators': 280, 'min_samples_leaf': 3, 'max_samples': 0.25735350346573693, 'max_features': 0.8849199437168418}. Best is trial 20 with value: 0.32342657342657344.[0m
[32m[I 2022-10-25 12:05:09,786][0m Trial 62 finished with value: 0.31493506493506496 and parameters: {'n_estimators': 301, 'min_samples_leaf': 2, 'max_samples': 0.3492494470218571, 'max_features': 0.8416844821538227}. Best is trial 20 with value: 0.32342657342657344.[0m
[32m[I 2022-10-25 12:05:11,994][0m Trial 63 finished with value: 0.33811188811188814 and parameters: {'n_estimators': 205, 'min_samples_leaf': 2, 'max_samples': 0.3509449747871047, 'max_features': 0.8038183786405524}. Best is trial 63 with value: 0.33811188811188814.[0m
[32m[I 2022-10-25 12:05:16,446][0m Trial 64 finished with value: 0.2642857142857143 and parameters: {'n_estimators': 311, 'min_samples_leaf': 1, 'max_samples': 0.4001886984009261, 'max_featur

[32m[I 2022-10-25 12:06:25,844][0m Trial 92 finished with value: 0.2066933066933067 and parameters: {'n_estimators': 162, 'min_samples_leaf': 2, 'max_samples': 0.3687102204171713, 'max_features': 0.8683384921119975}. Best is trial 63 with value: 0.33811188811188814.[0m
[32m[I 2022-10-25 12:06:28,414][0m Trial 93 finished with value: 0.21818181818181817 and parameters: {'n_estimators': 264, 'min_samples_leaf': 5, 'max_samples': 0.24896609225418584, 'max_features': 0.7689174453927828}. Best is trial 63 with value: 0.33811188811188814.[0m
[32m[I 2022-10-25 12:06:32,596][0m Trial 94 finished with value: 0.273976023976024 and parameters: {'n_estimators': 332, 'min_samples_leaf': 3, 'max_samples': 0.3386299137551078, 'max_features': 0.9741857867688177}. Best is trial 63 with value: 0.33811188811188814.[0m
[32m[I 2022-10-25 12:06:36,026][0m Trial 95 finished with value: 0.22717282717282714 and parameters: {'n_estimators': 204, 'min_samples_leaf': 1, 'max_samples': 0.306102658111248

In [57]:
#Optimisation study for xgboost regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_xgb,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:06:44,999][0m A new study created in memory with name: no-name-26d008fd-df20-4692-a462-ef2ecd01a5f3[0m
[32m[I 2022-10-25 12:06:48,096][0m Trial 0 finished with value: 0.12002997002997004 and parameters: {'eta': 0.103678956494547, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.6594940292498195, 'lambda': 0.9454596607038982}. Best is trial 0 with value: 0.12002997002997004.[0m
[32m[I 2022-10-25 12:06:50,642][0m Trial 1 finished with value: 0.15019980019980023 and parameters: {'eta': 0.06663277021525137, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7182675847590021, 'lambda': 0.9348638859390913}. Best is trial 1 with value: 0.15019980019980023.[0m
[32m[I 2022-10-25 12:06:53,379][0m Trial 2 finished with value: 0.20939060939060936 and parameters: {'eta': 0.03044816777115862, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.6805374397509508, 'lambda': 0.8986250261629402}. Best is trial 2 with value: 0.20939060939060936.[0m
[32m[I 2022-

[32m[I 2022-10-25 12:07:58,116][0m Trial 28 finished with value: 0.1838161838161838 and parameters: {'eta': 0.04681611987296828, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.9988200411842681, 'lambda': 0.9189652462053811}. Best is trial 14 with value: 0.32902097902097904.[0m
[32m[I 2022-10-25 12:08:00,456][0m Trial 29 finished with value: 0.1728771228771229 and parameters: {'eta': 0.1093804861654694, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.9385537925826271, 'lambda': 0.9568726108583292}. Best is trial 14 with value: 0.32902097902097904.[0m
[32m[I 2022-10-25 12:08:02,788][0m Trial 30 finished with value: 0.06878121878121879 and parameters: {'eta': 0.07112400202358181, 'max_depth': 9, 'min_child_weight': 6, 'subsample': 0.958695310137345, 'lambda': 0.9829916867314076}. Best is trial 14 with value: 0.32902097902097904.[0m
[32m[I 2022-10-25 12:08:04,879][0m Trial 31 finished with value: 0.3620879120879121 and parameters: {'eta': 0.0004191170918532721, 'ma

[32m[I 2022-10-25 12:09:01,734][0m Trial 57 finished with value: 0.14340659340659342 and parameters: {'eta': 0.022310356930063377, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.9220436665948005, 'lambda': 0.9224157309676908}. Best is trial 31 with value: 0.3620879120879121.[0m
[32m[I 2022-10-25 12:09:04,043][0m Trial 58 finished with value: 0.22952047952047955 and parameters: {'eta': 0.11643113482239063, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.9917886229256456, 'lambda': 0.9468237788959853}. Best is trial 31 with value: 0.3620879120879121.[0m
[32m[I 2022-10-25 12:09:06,059][0m Trial 59 finished with value: 0.17387612387612386 and parameters: {'eta': 0.007112036268684276, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.7559937414391464, 'lambda': 0.9355221195994499}. Best is trial 31 with value: 0.3620879120879121.[0m
[32m[I 2022-10-25 12:09:08,517][0m Trial 60 finished with value: 0.16668331668331665 and parameters: {'eta': 0.017631007001517797, '

[32m[I 2022-10-25 12:10:05,338][0m Trial 86 finished with value: 0.31903096903096906 and parameters: {'eta': 4.708521733040458e-05, 'max_depth': 9, 'min_child_weight': 6, 'subsample': 0.8652841147188021, 'lambda': 0.9301248008163052}. Best is trial 83 with value: 0.40044955044955044.[0m
[32m[I 2022-10-25 12:10:08,308][0m Trial 87 finished with value: 0.11348651348651351 and parameters: {'eta': 0.014400308197511317, 'max_depth': 9, 'min_child_weight': 6, 'subsample': 0.8598986052054046, 'lambda': 0.9275631635457037}. Best is trial 83 with value: 0.40044955044955044.[0m
[32m[I 2022-10-25 12:10:10,881][0m Trial 88 finished with value: 0.1083916083916084 and parameters: {'eta': 0.029164594221549357, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.8483522138953159, 'lambda': 0.9208480624241698}. Best is trial 83 with value: 0.40044955044955044.[0m
[32m[I 2022-10-25 12:10:13,259][0m Trial 89 finished with value: 0.21208791208791208 and parameters: {'eta': 0.0056642807908096

In [58]:
#Optimisation study for lightgbm regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_lgbm,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:10:37,065][0m A new study created in memory with name: no-name-60177ff1-848f-406a-91fd-b2f312bb162a[0m
[32m[I 2022-10-25 12:10:37,366][0m Trial 0 finished with value: 0.017982017982017984 and parameters: {'boosting_type': 'dart', 'num_leaves': 38, 'max_depth': 2, 'learning_rate': 0.057492770133513386, 'n_estimators': 151, 'min_child_weight': 0.004801501740872429}. Best is trial 0 with value: 0.017982017982017984.[0m
[32m[I 2022-10-25 12:10:37,756][0m Trial 1 finished with value: 0.06958041958041958 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 25, 'max_depth': 44, 'learning_rate': 0.07431698293146137, 'n_estimators': 160, 'min_child_weight': 0.002291961190204474}. Best is trial 1 with value: 0.06958041958041958.[0m
[32m[I 2022-10-25 12:10:38,084][0m Trial 2 finished with value: 0.07481995912283337 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 27, 'max_depth': 24, 'learning_rate': 0.006170292880016786, 'n_estimators': 87, 'min_child_we

[32m[I 2022-10-25 12:10:46,115][0m Trial 26 finished with value: 0.07767232767232768 and parameters: {'boosting_type': 'goss', 'num_leaves': 17, 'max_depth': 14, 'learning_rate': 0.013633533314040908, 'n_estimators': 167, 'min_child_weight': 0.002339257433841669}. Best is trial 10 with value: 0.0875124875124875.[0m
[32m[I 2022-10-25 12:10:46,394][0m Trial 27 finished with value: 0.0887891250451793 and parameters: {'boosting_type': 'goss', 'num_leaves': 10, 'max_depth': 26, 'learning_rate': 0.057606198612536415, 'n_estimators': 146, 'min_child_weight': 0.002138735077938282}. Best is trial 27 with value: 0.0887891250451793.[0m
[32m[I 2022-10-25 12:10:46,900][0m Trial 28 finished with value: 0.0537962037962038 and parameters: {'boosting_type': 'dart', 'num_leaves': 11, 'max_depth': 27, 'learning_rate': 0.059050403430083255, 'n_estimators': 145, 'min_child_weight': 0.001277187218173287}. Best is trial 27 with value: 0.0887891250451793.[0m
[32m[I 2022-10-25 12:10:47,189][0m Trial

[32m[I 2022-10-25 12:10:55,034][0m Trial 52 finished with value: 0.05498293815763659 and parameters: {'boosting_type': 'goss', 'num_leaves': 2, 'max_depth': 7, 'learning_rate': 0.06278499203472324, 'n_estimators': 132, 'min_child_weight': 0.0019695970213458405}. Best is trial 44 with value: 0.11633366633366633.[0m
[32m[I 2022-10-25 12:10:55,300][0m Trial 53 finished with value: 0.07587412587412587 and parameters: {'boosting_type': 'goss', 'num_leaves': 8, 'max_depth': -1, 'learning_rate': 0.0711147283283438, 'n_estimators': 118, 'min_child_weight': 0.0034457914023602212}. Best is trial 44 with value: 0.11633366633366633.[0m
[32m[I 2022-10-25 12:10:55,592][0m Trial 54 finished with value: 0.08401598401598402 and parameters: {'boosting_type': 'goss', 'num_leaves': 4, 'max_depth': 2, 'learning_rate': 0.04746263668571338, 'n_estimators': 124, 'min_child_weight': 0.0026986626657765126}. Best is trial 44 with value: 0.11633366633366633.[0m
[32m[I 2022-10-25 12:10:55,854][0m Trial 

[32m[I 2022-10-25 12:11:04,160][0m Trial 78 finished with value: 0.06570389076773045 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 11, 'max_depth': 1, 'learning_rate': 0.0061409647852736565, 'n_estimators': 138, 'min_child_weight': 0.0027026252964219124}. Best is trial 63 with value: 0.1985649213353188.[0m
[32m[I 2022-10-25 12:11:04,549][0m Trial 79 finished with value: 0.08166833166833168 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 3, 'max_depth': 4, 'learning_rate': 0.012498071511561095, 'n_estimators': 149, 'min_child_weight': 0.0029440290105193882}. Best is trial 63 with value: 0.1985649213353188.[0m
[32m[I 2022-10-25 12:11:04,921][0m Trial 80 finished with value: 0.05759240759240759 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 8, 'max_depth': 0, 'learning_rate': 0.011126656076518544, 'n_estimators': 121, 'min_child_weight': 0.002379611438425214}. Best is trial 63 with value: 0.1985649213353188.[0m
[32m[I 2022-10-25 12:11:05,338][0m Trial

In [59]:
#Optimisation study for support vector machine regressor with context included concat guide RNA target DNA one hot encoding
study = optuna.create_study(direction = 'maximize')
optimization_function = partial(
                objective_svr,
                x = concat_context_included_guide_RNAs_target_DNAs,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_concat_context_included_guide_RNA_target_DNA = study.best_value

[32m[I 2022-10-25 12:11:11,280][0m A new study created in memory with name: no-name-1d91caf0-2e2b-4d40-b744-92165a016099[0m
[32m[I 2022-10-25 12:11:11,300][0m Trial 0 finished with value: 0.0 and parameters: {'kernel': 'linear', 'degree': 8, 'C': 0.11930204717976844, 'epsilon': 4.376573473107153}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:11:11,319][0m Trial 1 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 1, 'C': 0.3182525306868511, 'epsilon': 3.3522006960047346}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:11:11,338][0m Trial 2 finished with value: -0.04702564102564103 and parameters: {'kernel': 'sigmoid', 'degree': 10, 'C': 4.085013297543582, 'epsilon': 0.5501646671520094}. Best is trial 0 with value: 0.0.[0m
[32m[I 2022-10-25 12:11:11,355][0m Trial 3 finished with value: 0.0 and parameters: {'kernel': 'sigmoid', 'degree': 5, 'C': 1.6666586098496983, 'epsilon': 4.7446836346166075}. Best is trial 0 with value: 0.0.[0m


[32m[I 2022-10-25 12:11:12,011][0m Trial 34 finished with value: 0.07388034188034187 and parameters: {'kernel': 'poly', 'degree': 6, 'C': 1.0068900105663363, 'epsilon': 0.6579460307039992}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,028][0m Trial 35 finished with value: 0.06454700854700855 and parameters: {'kernel': 'poly', 'degree': 6, 'C': 1.4518847041120804, 'epsilon': 0.951315838447419}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,046][0m Trial 36 finished with value: 0.06497435897435896 and parameters: {'kernel': 'poly', 'degree': 7, 'C': 1.8880941637147273, 'epsilon': 0.47346880617150405}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,078][0m Trial 37 finished with value: 0.07504273504273506 and parameters: {'kernel': 'sigmoid', 'degree': 8, 'C': 2.3320854204856425, 'epsilon': 0.013169944773224834}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10

[32m[I 2022-10-25 12:11:12,737][0m Trial 68 finished with value: 0.0 and parameters: {'kernel': 'poly', 'degree': 4, 'C': 0.4185878679981115, 'epsilon': 3.4899433923578433}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,759][0m Trial 69 finished with value: 0.019726495726495732 and parameters: {'kernel': 'linear', 'degree': 3, 'C': 1.6194807394850248, 'epsilon': 0.15091188684445428}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,776][0m Trial 70 finished with value: -0.04615710827971796 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 0.9685797219120831, 'epsilon': 0.42853246184436267}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11:12,802][0m Trial 71 finished with value: 0.12632478632478633 and parameters: {'kernel': 'linear', 'degree': 6, 'C': 1.2113393317771122, 'epsilon': 0.039885374905602664}. Best is trial 33 with value: 0.1508034188034188.[0m
[32m[I 2022-10-25 12:11

In [124]:
import plotly.graph_objects as go
features=['One hot encoded guide RNA sequence', 
          'One hot encoded target DNA complementary guide RNA sequence', 
          'One hot encoded concatenated guide RNA and target DNA sequence',
          'One hot encoded concatenated guide RNA and target DNA sequence (PAM included)',
          'One hot encoded concatenated guide RNA and target DNA sequence (20 bp target DNA context included)'
         
         ]

fig = go.Figure(data=[
    go.Bar(name='Linear Regression', x=features, y=[linear_regression_gRNA, linear_regression_complementary_gRNA, linear_regression_concat_gRNA_target_DNA, linear_regression_concat_pam_included_guide_RNA_target_DNA, linear_regression_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Ridge Regression', x=features, y=[ridge_gRNA, ridge_complementary_gRNA, ridge_concat_gRNA_target_DNA, ridge_concat_pam_included_guide_RNA_target_DNA, ridge_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Lasso Regression', x=features, y=[lasso_gRNA, lasso_complementary_gRNA, lasso_concat_gRNA_target_DNA, lasso_concat_pam_included_guide_RNA_target_DNA,lasso_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Elastic Net Regression', x=features, y=[ElasticNet_gRNA, ElasticNet_complementary_gRNA, ElasticNet_concat_gRNA_target_DNA, ElasticNet_concat_pam_included_guide_RNA_target_DNA,ElasticNet_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Decision Tree Regressor', x=features, y=[dt_gRNA, dt_complementary_gRNA, dt_concat_gRNA_target_DNA, dt_concat_pam_included_guide_RNA_target_DNA,dt_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Random Forest Regressor', x=features, y=[rf_gRNA, rf_complementary_gRNA, rf_concat_gRNA_target_DNA, rf_concat_pam_included_guide_RNA_target_DNA, rf_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='xgboost Regressor', x=features, y=[xgb_gRNA, xgb_complementary_gRNA, xgb_concat_gRNA_target_DNA, xgb_concat_pam_included_guide_RNA_target_DNA, xgb_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='lightgbm Regressor', x=features, y=[lgbm_gRNA, lgbm_complementary_gRNA, lgbm_concat_gRNA_target_DNA, lgbm_concat_pam_included_guide_RNA_target_DNA, lgbm_concat_context_included_guide_RNA_target_DNA]),
    go.Bar(name='Support Vector Machine Regressor', x=features, y=[svr_gRNA, svr_complementary_gRNA, svr_concat_gRNA_target_DNA, svr_concat_pam_included_guide_RNA_target_DNA, svr_concat_context_included_guide_RNA_target_DNA])
])
# Change the bar mode
fig.update_layout(height = 2000, width = 2000, barmode='group', plot_bgcolor = 'white', yaxis_title = 'Spearman Rank Correlation  Coefficient')
fig.show()