This notebook takes in sequence features and runs them through a different models. All model hyperparameters are 
selected using optuna. Models are run across 10 folds and benchmarking results are plotted
for mean squared error and pearson's correlation coefficient



In [45]:
#import libraries
import pandas as pd
import numpy as np 
import sklearn

In [46]:
#Load up guide RNA And target DNA as dataframes
df_guide_RNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\full_guide_RNA_sequences.csv")
df_target_DNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\target_DNA_sequences.csv")

# Load k_value_sigmoid from experiment on 24_05_2022 
from numpy import genfromtxt
k_values = genfromtxt('k_values_sigmoid.csv', delimiter=',')
print(k_values)

[ 0.11984317  0.13894219  0.15437609  0.06970868  0.13297348  0.12425794
  0.18793106  0.27095051  0.16818124  0.1347531   0.29226458  0.12293203
  0.38708549  0.19778096  0.3622934   0.34890034  0.20684573  0.11417769
 -0.07936754  0.31408586  0.18672599  0.13433442  0.1847865   0.24778192
  0.42079963  0.35078867  0.09680571  0.5202575   0.22697745  0.3248318
  0.1702902   0.24634137  0.14220321 -2.51561011  0.17162254  0.04222993
  0.24852536  0.296885    0.27256568  0.32140478  0.21027936  0.1127019
  0.20870236  0.28398043  0.32744511  0.2282549   0.14870497  0.32030304
  0.2445419   0.08811278  0.44583873  0.15249895  0.23265821  0.22829356
  0.2489641   0.16632398  0.27098826  0.18329734  0.11389634  0.29132781
  0.35664269 -0.2822598   0.14580803  0.06875822  0.09644608  0.10429324
  0.16959348  0.32049426  0.3353713   0.18005612  0.15875678  0.29960359
  0.41943501  0.45389137  0.29449169  0.47311382  0.28739385  0.33115047
  0.3254987   0.44876175  0.15296381  0.1752204   0.0

In [47]:
def one_hot_encode_DNA(DNA_sequence):
    mapping = dict(zip("atcg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in DNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [48]:
def one_hot_encode_RNA(gRNA_sequence):
    mapping = dict(zip("aucg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in gRNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [49]:
guide_RNA_sequences = df_guide_RNA['Sequence']
guide_RNA_one_hot_encoded_sequences = []
for i in guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence.ravel())

#generate feature names
feature_names_gRNA = []
for i in range(41):
    feature_names_gRNA.append('guide RNA A' + str(i+1))
    feature_names_gRNA.append('guide RNA U' + str(i+1))
    feature_names_gRNA.append('guide RNA C' + str(i+1))
    feature_names_gRNA.append('guide RNA G' + str(i+1))

In [50]:
#Generalised model function

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

def train_model(model,input_features, outputs, input_feature_names):
    '''
    This function takes in input features and outputs as kvalue for an experiment. 
    It then trains a model on these features. Hyperparameter optimisation is performed using
    optuna, training and validation is performed with cross validation    
    '''
    #Set out 20% of data to act as a hold out for training.
    input_train, input_test, output_train, output_test = train_test_split(
        input_features,outputs, test_size=0.2, random_state=0)
    
    #Model selection
    model = model
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    pearson_rank_results = pearsonr(output_test, output_predictions)
    r2 = r2_score(output_test,output_predictions)
    return(model_importance,mse, pearson_rank_results,r2)
    

In [51]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

model = LinearRegression()
score = cross_validate(model, guide_RNA_one_hot_encoded_sequences, k_values, cv = 5, scoring = 'neg_mean_squared_error')
score_mean = -np.mean(score['test_score'])
print(score_mean)

0.21836289016980878


In [52]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge
from functools import *

def objective_ridge(trial,x,y):
    
    alpha = trial.suggest_float('alpha', 0, 1)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    
    
    model = Ridge(
        alpha = alpha,
        solver = solver,
        )
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_ridge,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_gRNA = study.best_value



[32m[I 2022-10-21 10:45:37,826][0m A new study created in memory with name: no-name-fbccdd73-1269-462c-b798-130ac519bdd5[0m
[32m[I 2022-10-21 10:45:37,844][0m Trial 0 finished with value: 0.19499736405084533 and parameters: {'alpha': 0.384270504439465, 'solver': 'sparse_cg'}. Best is trial 0 with value: 0.19499736405084533.[0m
[32m[I 2022-10-21 10:45:37,949][0m Trial 1 finished with value: 0.17662801308450177 and parameters: {'alpha': 0.8631921176563843, 'solver': 'sag'}. Best is trial 1 with value: 0.17662801308450177.[0m
[32m[I 2022-10-21 10:45:38,049][0m Trial 2 finished with value: 0.17277115134154625 and parameters: {'alpha': 0.9924987421061795, 'solver': 'sag'}. Best is trial 2 with value: 0.17277115134154625.[0m
[32m[I 2022-10-21 10:45:38,212][0m Trial 3 finished with value: 0.19446815131623882 and parameters: {'alpha': 0.39554571236504543, 'solver': 'sag'}. Best is trial 2 with value: 0.17277115134154625.[0m
[32m[I 2022-10-21 10:45:38,265][0m Trial 4 finished w

In [53]:
import optuna
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Lasso
from functools import *

def objective_lasso(trial,x,y):
    alpha = trial.suggest_float('alpha', 0, 1)
    
    model = Lasso(
        alpha = alpha
        )
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_lasso,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_gRNA = study.best_value


[32m[I 2022-10-21 10:46:23,874][0m A new study created in memory with name: no-name-5758eca2-0837-4291-86b9-fe9719a5db49[0m
[32m[I 2022-10-21 10:46:23,883][0m Trial 0 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.8516612922642071}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:46:23,891][0m Trial 1 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.27103718856025627}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:46:23,898][0m Trial 2 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.8605523759458531}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:46:23,907][0m Trial 3 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.913324751719118}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:46:23,915][0m Trial 4 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.081726435489925

In [54]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import ElasticNet

def objective_ElasticNet(trial,x,y):
    alpha = trial.suggest_float('alpha', 0, 1)
    l1_ratio = trial.suggest_float('l1_ratio',0,1)
    
    model = ElasticNet(
        alpha = alpha,
        l1_ratio = l1_ratio
        )
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_ElasticNet,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElastiNet_gRNA = study.best_value




[32m[I 2022-10-21 10:47:00,031][0m A new study created in memory with name: no-name-f371614e-19a8-430a-a0b1-e26c5a98135e[0m
[32m[I 2022-10-21 10:47:00,040][0m Trial 0 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.25007493663050573, 'l1_ratio': 0.7313912505296785}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:47:00,049][0m Trial 1 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.11342841562372197, 'l1_ratio': 0.6825242884968589}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:47:00,058][0m Trial 2 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.5343376511763454, 'l1_ratio': 0.11250440401963702}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:47:00,069][0m Trial 3 finished with value: 0.07699178081659527 and parameters: {'alpha': 0.03433368894508382, 'l1_ratio': 0.5295405310874881}. Best is trial 0 with value: 0.07605086973235141.[

In [62]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from functools import *

def objective_dt(trial,x,y):
    
    ccp_alpha = trial.suggest_float('ccp_alpha', 0, 1)
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'friedman_mse', 'absolute_error'])
    
    model =  DecisionTreeRegressor(
        ccp_alpha = ccp_alpha,
        criterion = criterion,
        random_state = 42)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_dt,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_gRNA = study.best_value

[32m[I 2022-10-21 10:57:31,415][0m A new study created in memory with name: no-name-24042d41-cf5e-4a62-8efc-8b24753beda4[0m
[32m[I 2022-10-21 10:57:31,517][0m Trial 0 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 0.9319549654527023, 'criterion': 'absolute_error'}. Best is trial 0 with value: 0.07576461605299843.[0m
[32m[I 2022-10-21 10:57:31,541][0m Trial 1 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 0.12398987578806075, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.07576461605299843.[0m
[32m[I 2022-10-21 10:57:31,563][0m Trial 2 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 0.5748583869329502, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.07576461605299843.[0m
[32m[I 2022-10-21 10:57:31,587][0m Trial 3 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 0.7645848574514825, 'criterion': 'squared_error'}. Best is trial 0 with value: 0.07576461605299

In [18]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

def objective_rf(trial,x,y):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 30)
    max_samples = trial.suggest_float("max_samples", 0.01, 1.0)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    
    model = RandomForestRegressor(
        n_estimators = n_estimators,
        min_samples_leaf = min_samples_leaf,
        max_samples = max_samples,
        max_features = max_features, 
        random_state = 42)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_rf,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_gRNA = study.best_value

[32m[I 2022-10-19 14:54:10,345][0m A new study created in memory with name: no-name-ef165ae9-5867-43a4-b3d1-1cd278a8d96c[0m
[32m[I 2022-10-19 14:54:16,892][0m Trial 0 finished with value: 0.07675764695932201 and parameters: {'n_estimators': 696, 'min_samples_leaf': 10, 'max_samples': 0.34236702726754664, 'max_features': 0.03053280938976001}. Best is trial 0 with value: 0.07675764695932201.[0m
[32m[I 2022-10-19 14:54:24,978][0m Trial 1 finished with value: 0.07655609828372802 and parameters: {'n_estimators': 863, 'min_samples_leaf': 17, 'max_samples': 0.42331528993699313, 'max_features': 0.3578074046428498}. Best is trial 1 with value: 0.07655609828372802.[0m
[32m[I 2022-10-19 14:54:32,423][0m Trial 2 finished with value: 0.07667334794081072 and parameters: {'n_estimators': 765, 'min_samples_leaf': 18, 'max_samples': 0.39056807161093854, 'max_features': 0.2888341782729423}. Best is trial 1 with value: 0.07655609828372802.[0m
[32m[I 2022-10-19 14:54:34,236][0m Trial 3 finis

In [14]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from xgboost import XGBRegressor 

def objective_xgb(trial,x,y):
    eta = trial.suggest_float("eta", 0, 0.2)
    max_depth = trial.suggest_int("max_depth", 7, 11)
    min_child_weight = trial.suggest_int("min_child_weight", 2, 6)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    reg_lambda = trial.suggest_float("lambda", 0.8, 1.0)
    
    model = XGBRegressor(
        eta = eta,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        subsample = subsample,
        reg_lambda = reg_lambda,
        nthread = 1)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_xgb,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_gRNA = study.best_value

[32m[I 2022-10-19 14:49:28,722][0m A new study created in memory with name: no-name-e8eb9fec-90e7-4e4d-a3d1-3a9a069a4068[0m
[32m[I 2022-10-19 14:49:30,623][0m Trial 0 finished with value: 0.09190494776876856 and parameters: {'eta': 0.027495917812330406, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.9356158387000062, 'lambda': 0.9685790687928537}. Best is trial 0 with value: 0.09190494776876856.[0m
[32m[I 2022-10-19 14:49:32,528][0m Trial 1 finished with value: 0.10973632595568635 and parameters: {'eta': 0.13467560438158555, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.977320590527107, 'lambda': 0.8304719075603851}. Best is trial 0 with value: 0.09190494776876856.[0m
[32m[I 2022-10-19 14:49:34,184][0m Trial 2 finished with value: 0.08862041787453145 and parameters: {'eta': 0.044137016366609674, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.6082287513858248, 'lambda': 0.9394233494968811}. Best is trial 2 with value: 0.08862041787453145.[0m
[32m[I 202

In [17]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from lightgbm import LGBMRegressor

def objective_lgbm(trial,x,y):
    boosting_type = trial.suggest_categorical("boosting_type", ['gbdt', 'dart', 'goss'])
    num_leaves = trial.suggest_int("num_leaves", 2, 50)
    max_depth = trial.suggest_int("max_depth", -1, 50)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    min_child_weight = trial.suggest_float("min_child_weight", 0.001, 0.005)
        
    model = LGBMRegressor(
        boosting_type = boosting_type,
        num_leaves = num_leaves,
        max_depth = max_depth,
        learning_rate = learning_rate,
        n_estimators = n_estimators,
        min_child_weight = min_child_weight,
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_lgbm,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_gRNA = study.best_value

[32m[I 2022-10-19 14:53:18,710][0m A new study created in memory with name: no-name-a34671e6-4efe-46db-a026-195f4451e090[0m
[32m[I 2022-10-19 14:53:18,841][0m Trial 0 finished with value: 0.08071008872239678 and parameters: {'boosting_type': 'dart', 'num_leaves': 5, 'max_depth': 21, 'learning_rate': 0.005734318809517211, 'n_estimators': 74, 'min_child_weight': 0.003776426404437692}. Best is trial 0 with value: 0.08071008872239678.[0m
[32m[I 2022-10-19 14:53:18,937][0m Trial 1 finished with value: 0.07680530963624424 and parameters: {'boosting_type': 'goss', 'num_leaves': 50, 'max_depth': 8, 'learning_rate': 0.044495514296015844, 'n_estimators': 76, 'min_child_weight': 0.004427787358480699}. Best is trial 1 with value: 0.07680530963624424.[0m
[32m[I 2022-10-19 14:53:19,082][0m Trial 2 finished with value: 0.07904347810650937 and parameters: {'boosting_type': 'dart', 'num_leaves': 3, 'max_depth': 8, 'learning_rate': 0.03251906473866682, 'n_estimators': 73, 'min_child_weight': 

In [13]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from sklearn.svm import SVR

def objective_svr(trial,x,y):
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    degree = trial.suggest_int("degree", 1, 10)
    C = trial.suggest_float("C", 0, 5)
    epsilon = trial.suggest_float("epsilon", 0, 5)
    
        
    model = SVR(
        kernel = kernel,
        degree = degree,
        C = C,
        epsilon = epsilon
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_svr,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_gRNA = study.best_value

[32m[I 2022-10-19 18:17:01,340][0m A new study created in memory with name: no-name-1dbf6793-ab47-43fc-af05-fc5f6e9f9e46[0m
[32m[I 2022-10-19 18:17:01,361][0m Trial 0 finished with value: 1.2047362142133398 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 3.022632875724411, 'epsilon': 3.2048251889986643}. Best is trial 0 with value: 1.2047362142133398.[0m
[32m[I 2022-10-19 18:17:01,385][0m Trial 1 finished with value: 0.08019004365017482 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 1.1475622362476896, 'epsilon': 0.055910266778556594}. Best is trial 1 with value: 0.08019004365017482.[0m
[32m[I 2022-10-19 18:17:01,402][0m Trial 2 finished with value: 1.2047362142133398 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 0.6114053351710419, 'epsilon': 1.9243457775956319}. Best is trial 1 with value: 0.08019004365017482.[0m
[32m[I 2022-10-19 18:17:01,432][0m Trial 3 finished with value: 0.09194829482143656 and parameters: {'kernel': 'rbf', 'degree': 4, 'C': 3

In [18]:
#One hot encode only the 20 gRNA bases that interact with the target DNA
complementary_guide_RNA_sequences = []
for i in guide_RNA_sequences:
    complementary_guide_RNA_sequence = i[21:41]
    complementary_guide_RNA_sequences.append(complementary_guide_RNA_sequence)

#One hot encode them
complementary_guide_RNA_one_hot_encoded_sequences = []
for i in complementary_guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i).ravel()
    complementary_guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence)

#generate feature names
feature_names_complementary_gRNA = []
for i in range(20):
    feature_names_complementary_gRNA.append('guide RNA A' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA U' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA C' + str(i+1))
    feature_names_complementary_gRNA.append('guide RNA G' + str(i+1))

In [40]:
model = LinearRegression()
score = cross_validate(model, complementary_guide_RNA_one_hot_encoded_sequences, k_values, cv = 5, scoring = 'neg_mean_squared_error')
score_mean = -np.mean(score['test_score'])
linear_regression_complementary_gRNA = score_mean
linear_regression_complementary_gRNA

0.23937805191439362

In [41]:
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_ridge,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_complementary_gRNA = study.best_value

[32m[I 2022-10-21 10:44:32,756][0m A new study created in memory with name: no-name-5d24517b-0656-45eb-8088-8b3c34838de8[0m
[32m[I 2022-10-21 10:44:32,787][0m Trial 0 finished with value: 0.1788140460866425 and parameters: {'alpha': 0.7689143113528994, 'solver': 'saga'}. Best is trial 0 with value: 0.1788140460866425.[0m
[32m[I 2022-10-21 10:44:32,798][0m Trial 1 finished with value: 0.17952142127725873 and parameters: {'alpha': 0.7733514192347101, 'solver': 'auto'}. Best is trial 0 with value: 0.1788140460866425.[0m
[32m[I 2022-10-21 10:44:32,813][0m Trial 2 finished with value: 0.19770467955079038 and parameters: {'alpha': 0.31865918691020667, 'solver': 'lsqr'}. Best is trial 0 with value: 0.1788140460866425.[0m
[32m[I 2022-10-21 10:44:32,825][0m Trial 3 finished with value: 0.17345282400868942 and parameters: {'alpha': 0.9686490114380973, 'solver': 'cholesky'}. Best is trial 3 with value: 0.17345282400868942.[0m
[32m[I 2022-10-21 10:44:32,838][0m Trial 4 finished wi

In [42]:
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_lasso,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lasso_complementary_gRNA = study.best_value

[32m[I 2022-10-21 10:44:39,364][0m A new study created in memory with name: no-name-47e75886-1c23-4d63-ae5c-c24aea8d135f[0m
[32m[I 2022-10-21 10:44:39,371][0m Trial 0 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.13860243753468438}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:39,379][0m Trial 1 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.8133508828699847}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:39,389][0m Trial 2 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.6155536553921392}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:39,397][0m Trial 3 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.5769101483981451}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:39,405][0m Trial 4 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.51301278645520

In [43]:
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_ElasticNet,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ElasticNet_complementary_gRNA = study.best_value

[32m[I 2022-10-21 10:44:49,899][0m A new study created in memory with name: no-name-08b412b1-dbd0-4bbb-8096-fa907c32d82a[0m
[32m[I 2022-10-21 10:44:49,909][0m Trial 0 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.9821956938875107, 'l1_ratio': 0.10772130833366445}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:49,935][0m Trial 1 finished with value: 0.18302909765585584 and parameters: {'alpha': 0.0009105393164136055, 'l1_ratio': 0.6007350717355755}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:49,941][0m Trial 2 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.7535819421062441, 'l1_ratio': 0.6573705512435053}. Best is trial 0 with value: 0.07605086973235141.[0m
[32m[I 2022-10-21 10:44:49,949][0m Trial 3 finished with value: 0.07605086973235141 and parameters: {'alpha': 0.18370192568227517, 'l1_ratio': 0.8090638013050026}. Best is trial 0 with value: 0.07605086973235141.

In [44]:
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_dt,
                x = complementary_guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_gRNA = study.best_value

[32m[I 2022-10-21 10:45:26,383][0m A new study created in memory with name: no-name-f6ac92c5-6da0-476b-9c8a-5c53a51ba03f[0m


NameError: name 'objective_dt' is not defined