In [1]:
# Importing necessary libraries

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [2]:
# Load the full gene-expression and TF-expression tables (tab-separated, first row is the header).
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs where they exist.
GENE_EXPRESSION_PATH = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv'
TF_EXPRESSION_PATH = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv'

# Both files are read with the same parser options.
tsv_read_options = {'sep': '\t', 'header': 0}

gene_expression = pd.read_csv(GENE_EXPRESSION_PATH, **tsv_read_options)
tf_expression = pd.read_csv(TF_EXPRESSION_PATH, **tsv_read_options)

In [3]:
# Build train / test / validation partitions (70% / 20% / 10%) and convert them to numpy arrays.
# TF expression is used as the feature matrix and gene expression as the target matrix.
x = tf_expression
y = gene_expression

# Features and targets side by side in one frame (columns concatenated).
combined_data = pd.concat([x, y], axis=1)

# Stage 1: keep 70% of the rows for training; the remaining 30% is held back for stage 2.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: divide the held-back 30% into test (2/3 of it, i.e. 20% overall)
# and validation (1/3 of it, i.e. 10% overall).
x_test, x_val, y_test, y_val = train_test_split(
    x_temp,
    y_temp,
    test_size=1/3,
    random_state=42,
)

# Replace each partition with its numpy representation in a single step;
# x_temp / y_temp are intentionally left as DataFrames.
x_train, y_train, x_val, y_val, x_test, y_test = [
    frame.to_numpy()
    for frame in (x_train, y_train, x_val, y_val, x_test, y_test)
]

In [4]:
# Baseline: ElasticNet with default hyperparameters, fitted on the unscaled training data.
# Recorded test-set R^2 for this untuned model: 0.00432651511965587
# (no tuning at all, so the uncompetitive result vs. MLR is not surprising).

from sklearn.linear_model import ElasticNet

elas_reg = ElasticNet()
elas_reg.fit(x_train, y_train)

baseline_r2 = elas_reg.score(x_test, y_test)
print('Score ', baseline_r2)

Score  0.00432651511965587


In [None]:
# Tuning ElasticNet, step 1: standardise the input features.
# The scaler's statistics are learned from the training features only and then
# applied unchanged to the test features.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import MultiTaskElasticNetCV

scaler = StandardScaler().fit(x_train)
X_train_std = scaler.transform(x_train)
X_test_std = scaler.transform(x_test)

In [None]:
# Tuning ElasticNet, step 2: cross-validated search over the penalty strength (alpha)
# and the L1/L2 mix (l1_ratio) with MultiTaskElasticNetCV.

alpha_grid = np.logspace(-4, 1, 50)          # 50 log-spaced values from 1e-4 to 1e1
l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9]

elas = MultiTaskElasticNetCV(
    alphas=alpha_grid,
    l1_ratio=l1_ratio_grid,
    cv=5,
    n_jobs=-1,
    max_iter=10000,
)
elas.fit(X_train_std, y_train)

# Report the held-out R^2 and the hyperparameters selected by cross-validation.
test_r2 = elas.score(X_test_std, y_test)
print("Test R2:", test_r2)
print("alpha_:", elas.alpha_, "l1_ratio_:", elas.l1_ratio_)

Trying a GitHub script for hyperparameter tuning with Optuna

In [6]:
import optuna
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import cross_val_score, train_test_split
from xgboost import XGBRegressor
import lightgbm as lgb
import joblib
import json

class Objective:
    """
    Objective function for hyperparameter optimization.

    An instance is callable with an Optuna trial: it samples hyperparameters
    for the configured algorithm, builds the estimator and returns the negated
    mean 5-fold cross-validation score, so that a study created with
    direction='minimize' ends up maximising the scikit-learn score.

    Parameters:
    - X: Features.
    - y: Target variable.
    - algorithm: Algorithm to optimize; one of SUPPORTED_ALGORITHMS.
    - scoring: Scoring metric for optimization (passed to cross_val_score).
    """

    # Algorithm names understood by _build_model.
    SUPPORTED_ALGORITHMS = ('lightgbm', 'xgboost', 'random_forest',
                            'extra_trees', 'ridge', 'lasso', 'elastic_net')

    def __init__(self, X, y, algorithm, scoring):
        self.X = X
        self.y = y
        self.algorithm = algorithm
        self.scoring = scoring

    def __call__(self, trial):
        """
        Callable method for optimization.
        Parameters:
        - trial: Optuna trial object.
        Returns:
        - Objective value for optimization (negated mean cross-validation score).
        Raises:
        - ValueError: if the configured algorithm is not supported.
        """
        model = self._build_model(trial)
        return -cross_val_score(model, self.X, self.y, n_jobs=-1, cv=5, verbose=0, scoring=self.scoring).mean()

    def _build_model(self, trial):
        """
        Sample hyperparameters from the trial and build the matching estimator.

        Uses trial.suggest_float (with log=True for log-uniform ranges) instead
        of the deprecated suggest_uniform / suggest_loguniform helpers; the
        parameter names and ranges are unchanged.

        Parameters:
        - trial: Optuna trial object.
        Returns:
        - An unfitted estimator configured with the sampled hyperparameters.
        Raises:
        - ValueError: if self.algorithm is not one of SUPPORTED_ALGORITHMS.
        """

        if self.algorithm == 'lightgbm':
            tree_learner = trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting'])
            extra_trees = trial.suggest_categorical('extra_trees', [True, False])
            num_leaves = trial.suggest_int('num_leaves', 31, 150)
            n_estimators = trial.suggest_int('n_estimators', 100, 1000)
            min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 50, 200)
            max_depth = trial.suggest_int('max_depth', 2, 20)
            subsample = trial.suggest_float('subsample', 0.5, 1)
            feature_fraction = trial.suggest_float('feature_fraction', 0.5, 1)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
            # NOTE(review): bagging_freq is sampled (and so appears in the study's
            # parameters) but is not passed to the model below - confirm intent.
            bagging_freq = trial.suggest_int("bagging_freq", 1, 5)
            reg_alpha = trial.suggest_float('reg_alpha', 1, 10.0)
            reg_lambda = trial.suggest_float('reg_lambda', 1, 10.0)

            return lgb.LGBMRegressor(objective='regression',
                                     boosting='goss',
                                     metric='l2',
                                     verbosity=-1,
                                     num_leaves=num_leaves,
                                     extra_trees=extra_trees,
                                     min_data_in_leaf=min_data_in_leaf,
                                     max_depth=max_depth,
                                     feature_fraction=feature_fraction,
                                     subsample=subsample,
                                     n_estimators=n_estimators,
                                     learning_rate=learning_rate,
                                     reg_alpha=reg_alpha,
                                     tree_learner=tree_learner,
                                     reg_lambda=reg_lambda,
                                     n_jobs=-1,
                                     random_state=42)

        if self.algorithm == 'xgboost':
            n_estimators = trial.suggest_int('n_estimators', 50, 500)
            # NOTE(review): min_child_samples is sampled but not passed to the
            # model below (it was commented out in the constructor call).
            min_child_samples = trial.suggest_int('min_child_samples', 20, 200)
            max_depth = trial.suggest_int('max_depth', 1, 12)
            subsample = trial.suggest_float('subsample', 0.1, 1)
            colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1)
            learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2, log=True)
            reg_alpha = trial.suggest_float('reg_alpha', 1.0, 8.0)
            reg_lambda = trial.suggest_float('reg_lambda', 1.0, 8.0)

            return XGBRegressor(objective='reg:squarederror',
                                n_jobs=-1,
                                eval_metric='rmse',
                                random_state=42,
                                n_estimators=n_estimators,
                                max_depth=max_depth,
                                subsample=subsample,
                                colsample_bytree=colsample_bytree,
                                learning_rate=learning_rate,
                                reg_alpha=reg_alpha,
                                reg_lambda=reg_lambda)

        if self.algorithm == 'random_forest':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 1, 32)
            max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
            min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

            return RandomForestRegressor(oob_score=True,
                                         warm_start=True,
                                         n_estimators=n_estimators,
                                         max_depth=max_depth,
                                         min_samples_leaf=min_samples_leaf,
                                         max_features=max_features,
                                         min_samples_split=min_samples_split,
                                         n_jobs=-1,
                                         random_state=42)

        if self.algorithm == 'extra_trees':
            n_estimators = trial.suggest_int('n_estimators', 50, 300)
            max_depth = trial.suggest_int('max_depth', 1, 32)
            max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
            min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

            return ExtraTreesRegressor(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       min_samples_leaf=min_samples_leaf,
                                       max_features=max_features,
                                       min_samples_split=min_samples_split,
                                       n_jobs=-1,
                                       random_state=42)

        if self.algorithm == 'ridge':
            alpha = trial.suggest_float('alpha', 1e-8, 10.0, log=True)
            return Ridge(alpha=alpha, random_state=42)

        if self.algorithm == 'lasso':
            alpha = trial.suggest_float('alpha', 1e-8, 10.0, log=True)
            return Lasso(alpha=alpha, random_state=42)

        if self.algorithm == 'elastic_net':
            alpha = trial.suggest_float('alpha', 1e-8, 10.0, log=True)
            l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
            return ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

        # Fail loudly: previously this path returned False, which would be
        # handed to the study as an objective value instead of signalling
        # the misconfiguration.
        raise ValueError(
            f"Unsupported algorithm {self.algorithm!r}; expected one of: "
            + ", ".join(self.SUPPORTED_ALGORITHMS)
        )


class BayesOptimize:
    """
    Bayesian hyperparameter optimization using Optuna.

    Creates a minimising Optuna study on construction and, when optimize() is
    called, runs it against an Objective built from the stored data/settings.

    Parameters:
    - X: Features.
    - y: Target variable.
    - algorithm: Algorithm to optimize.
    - scoring: Scoring metric for optimization.
    - n_trials: Number of optimization trials.
    - seed: Optional integer seed for the study's sampler. When given, a
      seeded TPESampler is used so repeated runs propose the same
      hyperparameters; when None (default) Optuna's default sampler is used,
      exactly as before.
    """

    def __init__(self, X, y, algorithm, scoring, n_trials=100, seed=None):
        self.X = X
        self.y = y
        self.algorithm = algorithm
        self.scoring = scoring
        self.n_trials = n_trials
        self.seed = seed
        # Only build an explicit sampler when a seed was requested; passing
        # sampler=None lets create_study pick its default sampler.
        sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
        # The objective returns a negated score, hence direction='minimize'.
        self.study = optuna.create_study(direction='minimize', sampler=sampler)

    def optimize(self):
        """
        Run the hyperparameter optimization.
        Returns:
        - Best hyperparameters found during optimization.
        """
        objective = Objective(self.X, self.y, self.algorithm, self.scoring)
        self.study.optimize(objective, n_trials=self.n_trials)
        best_params = self.study.best_trial.params
        return best_params

if __name__ == "__main__":
    # Example run: tune ElasticNet on the training split, scored by R^2,
    # using the default number of trials.
    algorithm = 'elastic_net'
    scoring = 'r2'

    optimizer = BayesOptimize(
        X=x_train,
        y=y_train,
        algorithm=algorithm,
        scoring=scoring,
    )
    best_params = optimizer.optimize()

    # Header line followed by the best hyperparameters as indented JSON.
    print("Best parameters:", json.dumps(best_params, indent=4), sep="\n")

[I 2025-12-08 15:50:13,778] A new study created in memory with name: no-name-97c2a178-1265-4419-b19c-59e869cf24bc
  alpha = trial.suggest_loguniform('alpha', 1e-8, 10.0)
  l1_ratio = trial.suggest_uniform('l1_ratio', 0.0, 1.0)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(

KeyboardInterrupt: 