This notebook takes in sequence features and runs them through a different models. All model hyperparameters are 
selected using optuna. Models are run across 10 folds and benchmarking results are plotted
for mean squared error and pearson's correlation coefficient



In [2]:
#import libraries
import pandas as pd
import numpy as np 
import sklearn

In [3]:
#Load up guide RNA And target DNA as dataframes
df_guide_RNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\full_guide_RNA_sequences.csv")
df_target_DNA = pd.read_csv(r"C:\Users\dasak\OneDrive\Documents\GitHub\cas12-collateral-cleavage-prediction\target_DNA_sequences.csv")

# Load k_value_sigmoid from experiment on 24_05_2022 
from numpy import genfromtxt
k_values = genfromtxt('k_values_sigmoid.csv', delimiter=',')
print(k_values)

[ 0.11984317  0.13894219  0.15437609  0.06970868  0.13297348  0.12425794
  0.18793106  0.27095051  0.16818124  0.1347531   0.29226458  0.12293203
  0.38708549  0.19778096  0.3622934   0.34890034  0.20684573  0.11417769
 -0.07936754  0.31408586  0.18672599  0.13433442  0.1847865   0.24778192
  0.42079963  0.35078867  0.09680571  0.5202575   0.22697745  0.3248318
  0.1702902   0.24634137  0.14220321 -2.51561011  0.17162254  0.04222993
  0.24852536  0.296885    0.27256568  0.32140478  0.21027936  0.1127019
  0.20870236  0.28398043  0.32744511  0.2282549   0.14870497  0.32030304
  0.2445419   0.08811278  0.44583873  0.15249895  0.23265821  0.22829356
  0.2489641   0.16632398  0.27098826  0.18329734  0.11389634  0.29132781
  0.35664269 -0.2822598   0.14580803  0.06875822  0.09644608  0.10429324
  0.16959348  0.32049426  0.3353713   0.18005612  0.15875678  0.29960359
  0.41943501  0.45389137  0.29449169  0.47311382  0.28739385  0.33115047
  0.3254987   0.44876175  0.15296381  0.1752204   0.0

In [4]:
def one_hot_encode_DNA(DNA_sequence):
    mapping = dict(zip("atcg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in DNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [5]:
def one_hot_encode_RNA(gRNA_sequence):
    mapping = dict(zip("aucg", range(4)))    
    one_hot_encoded_sequence = [mapping[i] for i in gRNA_sequence]
    return np.eye(4)[one_hot_encoded_sequence]

In [6]:
guide_RNA_sequences = df_guide_RNA['Sequence']
guide_RNA_one_hot_encoded_sequences = []
for i in guide_RNA_sequences:
    one_hot_encoded_sequence = one_hot_encode_RNA(i)
    guide_RNA_one_hot_encoded_sequences.append(one_hot_encoded_sequence.ravel())

#generate feature names
feature_names_gRNA = []
for i in range(41):
    feature_names_gRNA.append('guide RNA A' + str(i+1))
    feature_names_gRNA.append('guide RNA U' + str(i+1))
    feature_names_gRNA.append('guide RNA C' + str(i+1))
    feature_names_gRNA.append('guide RNA G' + str(i+1))

In [9]:
#Generalised model function

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

def train_model(model,input_features, outputs, input_feature_names):
    '''
    This function takes in input features and outputs as kvalue for an experiment. 
    It then trains a model on these features. Hyperparameter optimisation is performed using
    optuna, training and validation is performed with cross validation    
    '''
    #Set out 20% of data to act as a hold out for training.
    input_train, input_test, output_train, output_test = train_test_split(
        input_features,outputs, test_size=0.2, random_state=0)
    
    #Model selection
    model = model
    model = model.fit(input_train, output_train)
    model_importance = pd.DataFrame({'Feature_names':input_feature_names,'Importance': model.feature_importances_})
    output_predictions = model.predict(input_test)
    mse = mean_squared_error(output_test, output_predictions)
    pearson_rank_results = pearsonr(output_test, output_predictions)
    r2 = r2_score(output_test,output_predictions)
    return(model_importance,mse, pearson_rank_results,r2)
    

In [10]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression

model = LinearRegression()
score = cross_validate(model, guide_RNA_one_hot_encoded_sequences, k_values, cv = 5, scoring = 'neg_mean_squared_error')
score_mean = -np.mean(score['test_score'])
print(score_mean)

0.21836289016980878


In [21]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

def objective_ridge(trial,x,y):
    
    alpha = trial.suggest_float('alpha', 0, 100)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    
    
    model = Ridge(
        alpha = alpha,
        solver = solver,
        )
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'r2')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_ridge,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
ridge_gRNA = study.best_value



[32m[I 2022-10-19 18:28:42,087][0m A new study created in memory with name: no-name-3d67989f-08c0-4684-a193-60e593e89ec5[0m
[32m[I 2022-10-19 18:28:42,110][0m Trial 0 finished with value: 0.2549453776614864 and parameters: {'alpha': 46.37560450288348, 'solver': 'lsqr'}. Best is trial 0 with value: 0.2549453776614864.[0m
[32m[I 2022-10-19 18:28:42,145][0m Trial 1 finished with value: 0.16469951426047688 and parameters: {'alpha': 65.0632222161251, 'solver': 'sag'}. Best is trial 1 with value: 0.16469951426047688.[0m
[32m[I 2022-10-19 18:28:42,174][0m Trial 2 finished with value: 2.590738405857814 and parameters: {'alpha': 3.606218026857788, 'solver': 'svd'}. Best is trial 1 with value: 0.16469951426047688.[0m
[32m[I 2022-10-19 18:28:42,227][0m Trial 3 finished with value: 0.22792980728849677 and parameters: {'alpha': 50.74448095503135, 'solver': 'sag'}. Best is trial 1 with value: 0.16469951426047688.[0m
[32m[I 2022-10-19 18:28:42,247][0m Trial 4 finished with value: 1.7

[32m[I 2022-10-19 18:28:43,781][0m Trial 40 finished with value: 0.13447032475308623 and parameters: {'alpha': 75.70226025737267, 'solver': 'sag'}. Best is trial 15 with value: 0.09256864974571426.[0m
[32m[I 2022-10-19 18:28:43,809][0m Trial 41 finished with value: 0.09361774819792264 and parameters: {'alpha': 99.15070379046284, 'solver': 'auto'}. Best is trial 15 with value: 0.09256864974571426.[0m
[32m[I 2022-10-19 18:28:43,855][0m Trial 42 finished with value: 0.09982248912702689 and parameters: {'alpha': 94.47936918001216, 'solver': 'svd'}. Best is trial 15 with value: 0.09256864974571426.[0m
[32m[I 2022-10-19 18:28:43,906][0m Trial 43 finished with value: 0.10597313308639425 and parameters: {'alpha': 90.33493476955762, 'solver': 'sag'}. Best is trial 15 with value: 0.09256864974571426.[0m
[32m[I 2022-10-19 18:28:43,977][0m Trial 44 finished with value: 0.09788254350852461 and parameters: {'alpha': 95.87975557854024, 'solver': 'saga'}. Best is trial 15 with value: 0.0

[32m[I 2022-10-19 18:28:45,305][0m Trial 80 finished with value: 0.10285181422771421 and parameters: {'alpha': 92.38849717728334, 'solver': 'svd'}. Best is trial 55 with value: 0.09253738064831937.[0m
[32m[I 2022-10-19 18:28:45,352][0m Trial 81 finished with value: 0.09269219334839703 and parameters: {'alpha': 99.97277289583081, 'solver': 'sag'}. Best is trial 55 with value: 0.09253738064831937.[0m
[32m[I 2022-10-19 18:28:45,403][0m Trial 82 finished with value: 0.09255714625013849 and parameters: {'alpha': 99.96349833344209, 'solver': 'sag'}. Best is trial 55 with value: 0.09253738064831937.[0m
[32m[I 2022-10-19 18:28:45,448][0m Trial 83 finished with value: 0.0960812999832645 and parameters: {'alpha': 97.21039270012648, 'solver': 'sag'}. Best is trial 55 with value: 0.09253738064831937.[0m
[32m[I 2022-10-19 18:28:45,502][0m Trial 84 finished with value: 0.09830862030434333 and parameters: {'alpha': 95.53624788843402, 'solver': 'sag'}. Best is trial 55 with value: 0.0925

In [12]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from functools import *

def objective_dt(trial,x,y):
    
    ccp_alpha = trial.suggest_float('ccp_alpha', 0, 10)
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'friedman_mse', 'absolute_error'])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 100)
    min_samples_split = trial.suggest_int ('min_samples_split',20,100)
    
    model =  DecisionTreeRegressor(
        ccp_alpha = ccp_alpha,
        criterion = criterion,
        min_samples_leaf = min_samples_leaf,
        min_samples_split = min_samples_split,
        random_state = 42)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_dt,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
dt_gRNA = study.best_value

[32m[I 2022-10-19 18:08:34,737][0m A new study created in memory with name: no-name-1949a8ee-b5f4-49e3-afbf-c0d15c90a158[0m
[32m[I 2022-10-19 18:08:34,757][0m Trial 0 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 1.5505243149602743, 'criterion': 'squared_error', 'min_samples_leaf': 72, 'min_samples_split': 42}. Best is trial 0 with value: 0.07605086973235142.[0m
[32m[I 2022-10-19 18:08:34,776][0m Trial 1 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 6.543203471293488, 'criterion': 'absolute_error', 'min_samples_leaf': 78, 'min_samples_split': 77}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:34,798][0m Trial 2 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 5.0368180567441225, 'criterion': 'squared_error', 'min_samples_leaf': 17, 'min_samples_split': 49}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:34,818][0m Trial 3 finished with value: 

[32m[I 2022-10-19 18:08:35,763][0m Trial 30 finished with value: 0.07605086973235142 and parameters: {'ccp_alpha': 4.3878424985691, 'criterion': 'squared_error', 'min_samples_leaf': 23, 'min_samples_split': 99}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:35,809][0m Trial 31 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 2.6884058128221255, 'criterion': 'absolute_error', 'min_samples_leaf': 45, 'min_samples_split': 61}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:35,875][0m Trial 32 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 6.008672650971545, 'criterion': 'absolute_error', 'min_samples_leaf': 3, 'min_samples_split': 91}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:35,915][0m Trial 33 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 3.2764844467617644, 'criterion': 'absolute_error', 'min_samples_leaf': 90, '

[32m[I 2022-10-19 18:08:37,443][0m Trial 61 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 0.8557085664627893, 'criterion': 'absolute_error', 'min_samples_leaf': 6, 'min_samples_split': 100}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:37,508][0m Trial 62 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 7.247221603984431, 'criterion': 'absolute_error', 'min_samples_leaf': 7, 'min_samples_split': 100}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:37,599][0m Trial 63 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 3.166898323197957, 'criterion': 'absolute_error', 'min_samples_leaf': 2, 'min_samples_split': 44}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:37,634][0m Trial 64 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 8.622302568059348, 'criterion': 'absolute_error', 'min_samples_leaf': 79,

[32m[I 2022-10-19 18:08:39,119][0m Trial 92 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 9.67782431381227, 'criterion': 'absolute_error', 'min_samples_leaf': 71, 'min_samples_split': 84}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:39,214][0m Trial 93 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 4.030481157135222, 'criterion': 'absolute_error', 'min_samples_leaf': 4, 'min_samples_split': 42}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:39,337][0m Trial 94 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 0.41133862619955563, 'criterion': 'absolute_error', 'min_samples_leaf': 7, 'min_samples_split': 52}. Best is trial 1 with value: 0.07576461605299843.[0m
[32m[I 2022-10-19 18:08:39,493][0m Trial 95 finished with value: 0.07576461605299843 and parameters: {'ccp_alpha': 3.521996974324071, 'criterion': 'absolute_error', 'min_samples_leaf': 2, '

In [18]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

def objective_rf(trial,x,y):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 30)
    max_samples = trial.suggest_float("max_samples", 0.01, 1.0)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    
    model = RandomForestRegressor(
        n_estimators = n_estimators,
        min_samples_leaf = min_samples_leaf,
        max_samples = max_samples,
        max_features = max_features,
        n_jobs = 1, 
        random_state = 42)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_rf,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
rf_gRNA = study.best_value

[32m[I 2022-10-19 14:54:10,345][0m A new study created in memory with name: no-name-ef165ae9-5867-43a4-b3d1-1cd278a8d96c[0m
[32m[I 2022-10-19 14:54:16,892][0m Trial 0 finished with value: 0.07675764695932201 and parameters: {'n_estimators': 696, 'min_samples_leaf': 10, 'max_samples': 0.34236702726754664, 'max_features': 0.03053280938976001}. Best is trial 0 with value: 0.07675764695932201.[0m
[32m[I 2022-10-19 14:54:24,978][0m Trial 1 finished with value: 0.07655609828372802 and parameters: {'n_estimators': 863, 'min_samples_leaf': 17, 'max_samples': 0.42331528993699313, 'max_features': 0.3578074046428498}. Best is trial 1 with value: 0.07655609828372802.[0m
[32m[I 2022-10-19 14:54:32,423][0m Trial 2 finished with value: 0.07667334794081072 and parameters: {'n_estimators': 765, 'min_samples_leaf': 18, 'max_samples': 0.39056807161093854, 'max_features': 0.2888341782729423}. Best is trial 1 with value: 0.07655609828372802.[0m
[32m[I 2022-10-19 14:54:34,236][0m Trial 3 finis

[32m[I 2022-10-19 14:56:28,701][0m Trial 30 finished with value: 0.07664743244487554 and parameters: {'n_estimators': 277, 'min_samples_leaf': 13, 'max_samples': 0.11071271541610211, 'max_features': 0.6106877709890951}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:56:29,702][0m Trial 31 finished with value: 0.0765300786622415 and parameters: {'n_estimators': 104, 'min_samples_leaf': 8, 'max_samples': 0.3032419919700463, 'max_features': 0.9383367122548871}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:56:31,029][0m Trial 32 finished with value: 0.07694603180866294 and parameters: {'n_estimators': 111, 'min_samples_leaf': 4, 'max_samples': 0.49100138613100963, 'max_features': 0.8356640462718096}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:56:32,149][0m Trial 33 finished with value: 0.07673277957023436 and parameters: {'n_estimators': 144, 'min_samples_leaf': 16, 'max_samples': 0.2144332285

[32m[I 2022-10-19 14:58:28,269][0m Trial 60 finished with value: 0.07636114872524692 and parameters: {'n_estimators': 568, 'min_samples_leaf': 6, 'max_samples': 0.7464738336726074, 'max_features': 0.05939224812902466}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:58:31,756][0m Trial 61 finished with value: 0.07673130244963183 and parameters: {'n_estimators': 402, 'min_samples_leaf': 7, 'max_samples': 0.012307803723202976, 'max_features': 0.24722106147920167}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:58:36,315][0m Trial 62 finished with value: 0.07648384216101059 and parameters: {'n_estimators': 493, 'min_samples_leaf': 9, 'max_samples': 0.05788689982984951, 'max_features': 0.27551063094506034}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 14:58:40,365][0m Trial 63 finished with value: 0.07636993776186651 and parameters: {'n_estimators': 438, 'min_samples_leaf': 3, 'max_samples': 0.1085535

[32m[I 2022-10-19 15:00:34,122][0m Trial 90 finished with value: 0.07666511718461891 and parameters: {'n_estimators': 246, 'min_samples_leaf': 8, 'max_samples': 0.11926359108139117, 'max_features': 0.7596919206833147}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 15:00:35,108][0m Trial 91 finished with value: 0.07649970406828722 and parameters: {'n_estimators': 106, 'min_samples_leaf': 11, 'max_samples': 0.08311090854938492, 'max_features': 0.1939908382488979}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 15:00:36,731][0m Trial 92 finished with value: 0.07661433624325856 and parameters: {'n_estimators': 161, 'min_samples_leaf': 11, 'max_samples': 0.10334801431015711, 'max_features': 0.25938129579917396}. Best is trial 27 with value: 0.07615850818706663.[0m
[32m[I 2022-10-19 15:00:37,927][0m Trial 93 finished with value: 0.0768556601283373 and parameters: {'n_estimators': 132, 'min_samples_leaf': 12, 'max_samples': 0.0137859

In [14]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from xgboost import XGBRegressor 

def objective_xgb(trial,x,y):
    eta = trial.suggest_float("eta", 0, 0.2)
    max_depth = trial.suggest_int("max_depth", 7, 11)
    min_child_weight = trial.suggest_int("min_child_weight", 2, 6)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    reg_lambda = trial.suggest_float("lambda", 0.8, 1.0)
    
    model = XGBRegressor(
        eta = eta,
        max_depth = max_depth,
        min_child_weight = min_child_weight,
        subsample = subsample,
        reg_lambda = reg_lambda,
        nthread = 1)
    
    #Use 5 way cross validation for training.
    score = cross_validate(model, x, y, cv = 10, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_xgb,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
xgb_gRNA = study.best_value

[32m[I 2022-10-19 14:49:28,722][0m A new study created in memory with name: no-name-e8eb9fec-90e7-4e4d-a3d1-3a9a069a4068[0m
[32m[I 2022-10-19 14:49:30,623][0m Trial 0 finished with value: 0.09190494776876856 and parameters: {'eta': 0.027495917812330406, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.9356158387000062, 'lambda': 0.9685790687928537}. Best is trial 0 with value: 0.09190494776876856.[0m
[32m[I 2022-10-19 14:49:32,528][0m Trial 1 finished with value: 0.10973632595568635 and parameters: {'eta': 0.13467560438158555, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.977320590527107, 'lambda': 0.8304719075603851}. Best is trial 0 with value: 0.09190494776876856.[0m
[32m[I 2022-10-19 14:49:34,184][0m Trial 2 finished with value: 0.08862041787453145 and parameters: {'eta': 0.044137016366609674, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.6082287513858248, 'lambda': 0.9394233494968811}. Best is trial 2 with value: 0.08862041787453145.[0m
[32m[I 202

[32m[I 2022-10-19 14:50:20,090][0m Trial 28 finished with value: 0.08222636561125093 and parameters: {'eta': 0.017983385697771734, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.7565888610409927, 'lambda': 0.9189032796787104}. Best is trial 28 with value: 0.08222636561125093.[0m
[32m[I 2022-10-19 14:50:21,894][0m Trial 29 finished with value: 0.08641292137002783 and parameters: {'eta': 0.03556560230129805, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.7499011148354482, 'lambda': 0.9218768636820243}. Best is trial 28 with value: 0.08222636561125093.[0m
[32m[I 2022-10-19 14:50:23,401][0m Trial 30 finished with value: 0.08355949549022165 and parameters: {'eta': 0.016328648311328374, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.8183811003341308, 'lambda': 0.8671458234833642}. Best is trial 28 with value: 0.08222636561125093.[0m
[32m[I 2022-10-19 14:50:25,083][0m Trial 31 finished with value: 0.08302411469032482 and parameters: {'eta': 0.025543963140848253

[32m[I 2022-10-19 14:51:09,455][0m Trial 57 finished with value: 0.08385241771703608 and parameters: {'eta': 0.04105442989352977, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.6373999983375106, 'lambda': 0.9050318565845158}. Best is trial 47 with value: 0.08187977186677563.[0m
[32m[I 2022-10-19 14:51:11,252][0m Trial 58 finished with value: 0.08915623578132613 and parameters: {'eta': 0.058859474028454734, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.6189161647009037, 'lambda': 0.9239087405768681}. Best is trial 47 with value: 0.08187977186677563.[0m
[32m[I 2022-10-19 14:51:13,032][0m Trial 59 finished with value: 0.08384250370490529 and parameters: {'eta': 0.015608409375941535, 'max_depth': 11, 'min_child_weight': 5, 'subsample': 0.6247962079601884, 'lambda': 0.8992406921716338}. Best is trial 47 with value: 0.08187977186677563.[0m
[32m[I 2022-10-19 14:51:14,688][0m Trial 60 finished with value: 0.09656096528075934 and parameters: {'eta': 0.124264497626718

[32m[I 2022-10-19 14:51:56,482][0m Trial 86 finished with value: 0.08070757376280942 and parameters: {'eta': 0.02252197989152814, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.611491151171016, 'lambda': 0.8901658602743404}. Best is trial 85 with value: 0.08038608542559922.[0m
[32m[I 2022-10-19 14:51:58,109][0m Trial 87 finished with value: 0.08492148844620867 and parameters: {'eta': 0.013142757512770924, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.6038998569327994, 'lambda': 0.8987852924168247}. Best is trial 85 with value: 0.08038608542559922.[0m
[32m[I 2022-10-19 14:51:59,762][0m Trial 88 finished with value: 0.08097550673792829 and parameters: {'eta': 0.02470119387818088, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.6156986574374507, 'lambda': 0.8910666757491847}. Best is trial 85 with value: 0.08038608542559922.[0m
[32m[I 2022-10-19 14:52:01,171][0m Trial 89 finished with value: 0.08441983784646419 and parameters: {'eta': 0.03790176106959599

In [17]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from lightgbm import LGBMRegressor

def objective_lgbm(trial,x,y):
    boosting_type = trial.suggest_categorical("boosting_type", ['gbdt', 'dart', 'goss'])
    num_leaves = trial.suggest_int("num_leaves", 2, 50)
    max_depth = trial.suggest_int("max_depth", -1, 50)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    min_child_weight = trial.suggest_float("min_child_weight", 0.001, 0.005)
        
    model = LGBMRegressor(
        boosting_type = boosting_type,
        num_leaves = num_leaves,
        max_depth = max_depth,
        learning_rate = learning_rate,
        n_estimators = n_estimators,
        min_child_weight = min_child_weight,
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_lgbm,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
lgbm_gRNA = study.best_value

[32m[I 2022-10-19 14:53:18,710][0m A new study created in memory with name: no-name-a34671e6-4efe-46db-a026-195f4451e090[0m
[32m[I 2022-10-19 14:53:18,841][0m Trial 0 finished with value: 0.08071008872239678 and parameters: {'boosting_type': 'dart', 'num_leaves': 5, 'max_depth': 21, 'learning_rate': 0.005734318809517211, 'n_estimators': 74, 'min_child_weight': 0.003776426404437692}. Best is trial 0 with value: 0.08071008872239678.[0m
[32m[I 2022-10-19 14:53:18,937][0m Trial 1 finished with value: 0.07680530963624424 and parameters: {'boosting_type': 'goss', 'num_leaves': 50, 'max_depth': 8, 'learning_rate': 0.044495514296015844, 'n_estimators': 76, 'min_child_weight': 0.004427787358480699}. Best is trial 1 with value: 0.07680530963624424.[0m
[32m[I 2022-10-19 14:53:19,082][0m Trial 2 finished with value: 0.07904347810650937 and parameters: {'boosting_type': 'dart', 'num_leaves': 3, 'max_depth': 8, 'learning_rate': 0.03251906473866682, 'n_estimators': 73, 'min_child_weight': 

[32m[I 2022-10-19 14:53:22,533][0m Trial 26 finished with value: 0.07658447686711084 and parameters: {'boosting_type': 'goss', 'num_leaves': 40, 'max_depth': 2, 'learning_rate': 0.07821716609153083, 'n_estimators': 57, 'min_child_weight': 0.004134958279174605}. Best is trial 23 with value: 0.07643553702223575.[0m
[32m[I 2022-10-19 14:53:22,675][0m Trial 27 finished with value: 0.07753013514046882 and parameters: {'boosting_type': 'dart', 'num_leaves': 27, 'max_depth': -1, 'learning_rate': 0.05488397012403589, 'n_estimators': 50, 'min_child_weight': 0.004136174028005985}. Best is trial 23 with value: 0.07643553702223575.[0m
[32m[I 2022-10-19 14:53:22,796][0m Trial 28 finished with value: 0.08215564149317059 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 39, 'max_depth': -1, 'learning_rate': 0.07709663249455254, 'n_estimators': 59, 'min_child_weight': 0.004074510426828912}. Best is trial 23 with value: 0.07643553702223575.[0m
[32m[I 2022-10-19 14:53:22,886][0m Trial 2

[32m[I 2022-10-19 14:53:25,569][0m Trial 52 finished with value: 0.076565537634268 and parameters: {'boosting_type': 'goss', 'num_leaves': 17, 'max_depth': 37, 'learning_rate': 0.08420365143215955, 'n_estimators': 81, 'min_child_weight': 0.001763981331309495}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:25,692][0m Trial 53 finished with value: 0.07658248852661552 and parameters: {'boosting_type': 'goss', 'num_leaves': 19, 'max_depth': 35, 'learning_rate': 0.04883627903129517, 'n_estimators': 99, 'min_child_weight': 0.0017526511667766314}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:25,822][0m Trial 54 finished with value: 0.07678075402122812 and parameters: {'boosting_type': 'goss', 'num_leaves': 25, 'max_depth': 47, 'learning_rate': 0.08654769674222847, 'n_estimators': 82, 'min_child_weight': 0.001749306035994561}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:25,944][0m Trial 5

[32m[I 2022-10-19 14:53:29,194][0m Trial 78 finished with value: 0.07704696427575639 and parameters: {'boosting_type': 'goss', 'num_leaves': 2, 'max_depth': 48, 'learning_rate': 0.07506272040052263, 'n_estimators': 134, 'min_child_weight': 0.002705088276933265}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:29,310][0m Trial 79 finished with value: 0.07658889826212041 and parameters: {'boosting_type': 'goss', 'num_leaves': 9, 'max_depth': 50, 'learning_rate': 0.08111272162686596, 'n_estimators': 120, 'min_child_weight': 0.003205526445994452}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:29,463][0m Trial 80 finished with value: 0.0769086625723771 and parameters: {'boosting_type': 'goss', 'num_leaves': 11, 'max_depth': 47, 'learning_rate': 0.06566783504913473, 'n_estimators': 150, 'min_child_weight': 0.0033026955989106377}. Best is trial 42 with value: 0.07633597313140873.[0m
[32m[I 2022-10-19 14:53:29,582][0m Trial

In [13]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_validate
from functools import *
from sklearn.svm import SVR

def objective_svr(trial,x,y):
    kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    degree = trial.suggest_int("degree", 1, 10)
    C = trial.suggest_float("C", 0, 5)
    epsilon = trial.suggest_float("epsilon", 0, 5)
    
        
    model = SVR(
        kernel = kernel,
        degree = degree,
        C = C,
        epsilon = epsilon
        )
    
    #Use 10 way cross validation for training.
    score = cross_validate(model, x, y, cv = 5, scoring = 'neg_mean_squared_error')
    score_mean = -np.mean(score['test_score'])
    return(score_mean)
    
study = optuna.create_study(direction = 'minimize')
optimization_function = partial(
                objective_svr,
                x = guide_RNA_one_hot_encoded_sequences,
                y = k_values
            )
study.optimize(optimization_function, n_trials=100) 
svr_gRNA = study.best_value

[32m[I 2022-10-19 18:17:01,340][0m A new study created in memory with name: no-name-1dbf6793-ab47-43fc-af05-fc5f6e9f9e46[0m
[32m[I 2022-10-19 18:17:01,361][0m Trial 0 finished with value: 1.2047362142133398 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 3.022632875724411, 'epsilon': 3.2048251889986643}. Best is trial 0 with value: 1.2047362142133398.[0m
[32m[I 2022-10-19 18:17:01,385][0m Trial 1 finished with value: 0.08019004365017482 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 1.1475622362476896, 'epsilon': 0.055910266778556594}. Best is trial 1 with value: 0.08019004365017482.[0m
[32m[I 2022-10-19 18:17:01,402][0m Trial 2 finished with value: 1.2047362142133398 and parameters: {'kernel': 'linear', 'degree': 5, 'C': 0.6114053351710419, 'epsilon': 1.9243457775956319}. Best is trial 1 with value: 0.08019004365017482.[0m
[32m[I 2022-10-19 18:17:01,432][0m Trial 3 finished with value: 0.09194829482143656 and parameters: {'kernel': 'rbf', 'degree': 4, 'C': 3

[32m[I 2022-10-19 18:17:02,645][0m Trial 33 finished with value: 0.5287582261357797 and parameters: {'kernel': 'poly', 'degree': 10, 'C': 1.4180083384974123, 'epsilon': 0.8940599580718258}. Best is trial 29 with value: 0.07837676218895448.[0m
[32m[I 2022-10-19 18:17:02,692][0m Trial 34 finished with value: 0.08042399255458624 and parameters: {'kernel': 'poly', 'degree': 9, 'C': 1.5962471585565279, 'epsilon': 0.06517497066321264}. Best is trial 29 with value: 0.07837676218895448.[0m
[32m[I 2022-10-19 18:17:02,731][0m Trial 35 finished with value: 1.2047362142133398 and parameters: {'kernel': 'linear', 'degree': 8, 'C': 0.3110505132144382, 'epsilon': 2.061965631621682}. Best is trial 29 with value: 0.07837676218895448.[0m
[32m[I 2022-10-19 18:17:02,768][0m Trial 36 finished with value: 0.18048652671993848 and parameters: {'kernel': 'poly', 'degree': 3, 'C': 1.0989573528313947, 'epsilon': 0.5405261582035672}. Best is trial 29 with value: 0.07837676218895448.[0m
[32m[I 2022-10

[32m[I 2022-10-19 18:17:03,983][0m Trial 67 finished with value: 0.09645826005464372 and parameters: {'kernel': 'sigmoid', 'degree': 7, 'C': 2.1657605762324015, 'epsilon': 0.22760256656882163}. Best is trial 59 with value: 0.0752170905869182.[0m
[32m[I 2022-10-19 18:17:04,019][0m Trial 68 finished with value: 1.2047362142133398 and parameters: {'kernel': 'sigmoid', 'degree': 5, 'C': 1.5831842099008018, 'epsilon': 3.282325304624277}. Best is trial 59 with value: 0.0752170905869182.[0m
[32m[I 2022-10-19 18:17:04,060][0m Trial 69 finished with value: 0.11386362906318226 and parameters: {'kernel': 'sigmoid', 'degree': 6, 'C': 4.721866477395784, 'epsilon': 0.45398767503455506}. Best is trial 59 with value: 0.0752170905869182.[0m
[32m[I 2022-10-19 18:17:04,098][0m Trial 70 finished with value: 0.08407755486666826 and parameters: {'kernel': 'sigmoid', 'degree': 8, 'C': 1.3368869273003454, 'epsilon': 0.23190550639501006}. Best is trial 59 with value: 0.0752170905869182.[0m
[32m[I 