In [None]:
import json
import os
import sys
from pathlib import Path

# 1. Find the repo root dynamically.
# Walks up from the current working directory and takes the nearest ancestor
# that contains a README.md. Path.parents does not include the directory
# itself, so the working directory is appended as a last-resort candidate:
# this makes the notebook usable when launched from the repo root, while every
# lookup that already succeeded still resolves to the same directory.
_cwd = Path.cwd()
_root = next(
    (p for p in (*_cwd.parents, _cwd) if (p / "README.md").exists()),
    None,
)
if _root is None:
    raise FileNotFoundError(
        f"Could not locate the repo root: no README.md found in {_cwd} or any of its parents"
    )
REPO_ROOT = str(_root)

# 2. Add to sys.path so standard 'import' statements work
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

# 3. Load the data root from the JSON config stored next to the README
_config_path = _root / "data_config.json"
with open(_config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

try:
    DATA_ROOT = config["DATA_ROOT"]
except KeyError:
    # Same exception type as before, but naming the file that needs fixing.
    raise KeyError(f'"DATA_ROOT" entry is missing from {_config_path}') from None

print(f"Data is being pulled from: {DATA_ROOT}")
print(f"Repo root identified as: {REPO_ROOT}")

In [None]:
############### RUN DATA PREPROCESSING ###############
# Runs the preprocessing script through IPython's %run magic. $REPO_ROOT is
# expanded by IPython from the REPO_ROOT variable set in the setup cell, and the
# quotes keep the space in "data preprocessing" inside a single path argument.
# NOTE(review): x_train, y_train, x_test and y_test are used by later cells but
# are not defined in any visible cell, so presumably this script creates them
# in the notebook namespace — confirm against the script.

%run "$REPO_ROOT/run/data preprocessing/model_boilerplate_remote.py"

In [12]:
# Shapes of the train/test inputs and targets, one per line
# (order: x_train, y_train, x_test, y_test).
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep="\n")

(12748, 1197)
(12748, 16100)
(3187, 1197)
(3187, 16100)


In [2]:
# out-of-the-box r2 score LinearRegression:
# train score: 0.90870454039286
# test score:  0.7934193600256717 
# difference of 0.12 between train and test is understandable given the high-dimensionality and noise in gene expression data

# after applying the centering boilerplate (only difference between MLR v1 and v2 is column-wise centering)
# train score: 0.90870454039286
# test score:  0.7934193600256717 

# after switching to the final, uncentered data preprocessing, which adds a network check ensuring that the nodes match the input expression data columns
# train score:  0.9041830139739396
# test score:  0.7986089431408789


from sklearn.linear_model import LinearRegression

# Baseline multiple linear regression fitted on the whole training split.
# n_jobs=-1 asks scikit-learn to use every available core.
# Kept as a single assignment so the cell produces no displayed output.
reg_test = LinearRegression(n_jobs=-1).fit(x_train, y_train)

In [14]:
# Score of the fitted model on the training split first, then on the held-out
# test split (LinearRegression.score reports R^2).
for _features, _targets in ((x_train, y_train), (x_test, y_test)):
    print('Score: ', reg_test.score(_features, _targets))

Score:  0.9041830139739396
Score:  0.7986089431408789


In [None]:
# conducting a k-fold validation to check for overfitting
# splitting the training set into 10 folds
# train folds: 11473-11474 samples each
# validation folds: 1274-1275 samples each

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shuffled 10-fold splitter over the training data. The fixed seed keeps the
# folds reproducible; `kf` is reused by the cross-validation cell.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=888) # changed from 42 to 888 to match training seed for RNN 13/01/26

# Sanity check: report how many samples land in each fold.
# The message is a single-line f-string on purpose: a backslash continuation
# inside the string literal would embed the next line's indentation in the output.
for cnt, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
    print(f'Fold:{cnt}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:0, Train set: 11473,     Test set:1275
Fold:1, Train set: 11473,     Test set:1275
Fold:2, Train set: 11473,     Test set:1275
Fold:3, Train set: 11473,     Test set:1275
Fold:4, Train set: 11473,     Test set:1275
Fold:5, Train set: 11473,     Test set:1275
Fold:6, Train set: 11473,     Test set:1275
Fold:7, Train set: 11473,     Test set:1275
Fold:8, Train set: 11474,     Test set:1274
Fold:9, Train set: 11474,     Test set:1274


In [None]:
# running the 10-fold validation 
#----- MLR model cross validation ------
#Scores: [0.78811923 0.79602325 0.7830914  0.78294147 0.78665107 0.7712694
# 0.77717178 0.7840341  0.79817987 0.76680642]
#Mean: 0.7834287993694773 -> similar to the holdout training run
#StandardDeviation: 0.009341412328795035 -> very consistent between folds

# unlikely that MLR is overfitting based on these results -> good sanity check for reference

# redid same analysis with column-wise centered data and got the exact same results 
#----- MLR model cross validation ------
#Scores: [0.78811923 0.79602325 0.7830914  0.78294147 0.78665107 0.7712694
# 0.77717178 0.7840341  0.79817987 0.76680642]
#Mean: 0.7834287993694773
#StandardDeviation: 0.00934141232879502

# redid same analysis once again, with uncentered data like the first run, but set random seed to 888
#----- MLR model cross validation ------
#Scores: [0.79322189 0.79384094 0.80615516 0.77715731 0.78278073 0.78325776
# 0.79921787 0.77486414 0.75888273 0.7807757 ]
#Mean: 0.7850154223167911
#StandardDeviation: 0.01291359135998445


# same as before but used index_col=0 and removed 'TF' which was a junk filler column 15/01/26
#----- MLR model cross validation ------
#Scores: [0.79306553 0.79355143 0.80595273 0.77689887 0.78254468 0.78294832
# 0.79897946 0.77465899 0.75861371 0.78061954]
#Mean: 0.7847833249078063
#StandardDeviation: 0.012924317642343348


def cross_validation(reg_model, training_set, training_target, cv):
    """Cross-validate ``reg_model`` with R^2 scoring and print a short summary.

    Parameters
    ----------
    reg_model
        Estimator handed to ``cross_val_score``.
    training_set
        Feature matrix used for cross-validation.
    training_target
        Target values aligned with ``training_set``.
    cv
        Cross-validation strategy forwarded unchanged to ``cross_val_score``
        (the notebook passes a ``KFold`` instance).

    Returns
    -------
    The per-fold scores exactly as returned by ``cross_val_score``. They are
    returned, in addition to being printed, so callers can reuse the numbers
    instead of copying them from the cell output.
    """
    r2_scores = cross_val_score(
        reg_model, training_set, training_target, scoring="r2", cv=cv
    )

    # Printed summary: raw per-fold scores, their mean, and their spread.
    print("Scores:", r2_scores)
    print("Mean:", r2_scores.mean())
    print("StandardDeviation:", r2_scores.std())

    return r2_scores

# A fresh LinearRegression is cross-validated here rather than the
# already-fitted reg_test; the folds come from the `kf` splitter.
lin_reg = LinearRegression()

print("----- MLR model cross validation ------")
cross_validation(lin_reg, x_train, y_train, kf)
print()  # blank line after the report

----- MLR model cross validation ------


In [3]:
import joblib
import json

import sklearn

# saving model + metadata

# Both artifacts are written to the same directory, so it is spelled out once.
# NOTE(review): this is an absolute, user-specific path even though DATA_ROOT is
# loaded from data_config.json in the setup cell — consider deriving it from
# there. The directory is named MLR_v3 while the model file says v4; confirm
# which one is intended.
_save_dir = "/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/MLR/MLR_v3"

metadata = {
    # Recorded from the running library rather than typed by hand, so the
    # metadata always matches the version that actually pickled the model.
    'sklearn_version': sklearn.__version__,
    'model_type': 'LinearRegression',
    # int() guards against a NumPy integer, which json cannot serialise.
    'n_features': int(reg_test.n_features_in_),
    # Shape tuples are written to JSON as arrays.
    'coef_shape': reg_test.coef_.shape,
    'intercept_shape': reg_test.intercept_.shape,
}

# Persist the fitted estimator first, then its JSON metadata.
joblib.dump(reg_test, f"{_save_dir}/MLR_model_v4(uncentered[FINAL]).joblib")
with open(f"{_save_dir}/model_metadata_uncentered[FINAL].json", "w", encoding="utf-8") as f:
    json.dump(metadata, f)