MLR SCRIPT ON FULL DATASETS

In [2]:
# Importing necessary libraries

import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [3]:
# Loading in gene/TF expression full data

gene_expression = pd.read_csv(('/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv'), sep='\t', header=0)
tf_expression = pd.read_csv(('/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv'), sep='\t', header=0)

In [5]:
# Split into training, testing and validation sets and into numpy arrays + combining dataframes
x = tf_expression
y = gene_expression

combined_data = pd.concat([x, y], axis=1)

# First split: 70% train and 30% temp (test + val)
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Second split: split the temp set into 20% test and 10% val (which is 2/3 and 1/3 of temp)
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp, test_size=1/3, random_state=42)

# For training set
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()

# For validation set
x_val = x_val.to_numpy()
y_val = y_val.to_numpy()

x_test = x_test.to_numpy()
y_test = y_test.to_numpy()

In [6]:
# out-of-the-box r2 score LinearRegression:
# train score: 0.90870454039286
# test score:  0.7934193600256717 
# difference of 0.12 between train and test is understandable given the high-dimensionality and noise in gene expression data

reg_test = LinearRegression(
                    n_jobs=-1,
).fit(x_train, y_train)

In [None]:
print('Score: ', reg_test.score(x_train, y_train))
print('Score: ', reg_test.score(x_test, y_test))

Score:  0.7934193600256717
Score:  0.90870454039286


In [None]:
# conducting a k-fold validation to check for overfitting
# splitting training set into 10-folds
# train sets: ~10038 targets
# test sets: ~1116 targets

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

cnt = 0
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
for train_index, test_index in kf.split(x_train, y_train):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, \
    Test set:{len(test_index)}')
    cnt += 1

Fold:0, Train set: 10038,     Test set:1116
Fold:1, Train set: 10038,     Test set:1116
Fold:2, Train set: 10038,     Test set:1116
Fold:3, Train set: 10038,     Test set:1116
Fold:4, Train set: 10039,     Test set:1115
Fold:5, Train set: 10039,     Test set:1115
Fold:6, Train set: 10039,     Test set:1115
Fold:7, Train set: 10039,     Test set:1115
Fold:8, Train set: 10039,     Test set:1115
Fold:9, Train set: 10039,     Test set:1115


In [None]:
# running the 10-fold validation 
#----- MLR model cross validation ------
#Scores: [0.78811923 0.79602325 0.7830914  0.78294147 0.78665107 0.7712694
# 0.77717178 0.7840341  0.79817987 0.76680642]
#Mean: 0.7834287993694773 -> similar to the holdout training run
#StandardDeviation: 0.009341412328795035 -> very consistent between folds

# Unlikely that MLR is overfitting based on these results -> good sanity check for reference

def cross_validation(reg_model, training_set, training_target, cv):
    scores = cross_val_score(
      reg_model, training_set,
      training_target,
      scoring="r2", cv=cv)
    r2_scores = scores
    print("Scores:", r2_scores)
    print("Mean:", r2_scores.mean())
    print("StandardDeviation:", r2_scores.std())

print("----- MLR model cross validation ------")
lin_reg = LinearRegression()
cross_validation(lin_reg, x_train, y_train, kf)
print("")

----- MLR model cross validation ------
Scores: [0.78811923 0.79602325 0.7830914  0.78294147 0.78665107 0.7712694
 0.77717178 0.7840341  0.79817987 0.76680642]
Mean: 0.7834287993694773
StandardDeviation: 0.009341412328795035



In [10]:
import joblib
import json

# saving model + metadata

metadata = {
    'sklearn_version': '1.7.2',
    'model_type': 'LinearRegression',
    'n_features': reg_test.n_features_in_,
    'coef_shape': reg_test.coef_.shape,
    'intercept_shape': reg_test.intercept_.shape,
}

joblib.dump(reg_test, "/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/MLR/MLR_model.joblib")
with open("model_metadata.json", "w") as f:
    json.dump(metadata, f)

ELASTICNET ON FULL DATASETS + TUNING  

In [None]:
# out-of-the-box r2 score with ElasticNet: 0.00432651511965587
# no tuning at all so not surprised by uncompetitive results vs. MLR

from sklearn.linear_model import ElasticNet
elas_reg = ElasticNet().fit(x_train,y_train)
print('Score ', elas_reg.score(x_test,y_test))    


Score  0.00432651511965587


In [12]:
# tuning performance of ElasticNet

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import MultiTaskElasticNetCV

scaler = StandardScaler()
X_train_std = scaler.fit_transform(x_train)
X_test_std  = scaler.transform(x_test)

In [1]:
# L1/L2 penalty tuning with ElasticNetCV

elas = MultiTaskElasticNetCV(
    alphas=np.logspace(-4, 1, 50),
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    cv=5,
    n_jobs=-1,
    max_iter=10000,
)
elas.fit(X_train_std, y_train)
print("Test R2:", elas.score(X_test_std, y_test))
print("alpha_:", elas.alpha_, "l1_ratio_:", elas.l1_ratio_)

NameError: name 'MultiTaskElasticNetCV' is not defined