In [3]:
from dataset import SNPmarkersDataset
from xgboost import XGBRegressor
from scipy.stats import pearsonr
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time
import json
import pandas as pd
from utils import print_elapsed_time

In [None]:
selected_phenotypes = ["ep_res","de_res","FESSEp_res","FESSEa_res"]

train_dataset = SNPmarkersDataset(mode="local_train", skip_check = True)
train_dataset.set_phenotypes = selected_phenotypes
validation_dataset = SNPmarkersDataset(mode="validation", skip_check = True)
validation_dataset.set_phenotypes = selected_phenotypes

X_train = train_dataset.get_all_SNP()
Y_train_cpu = pd.DataFrame([train_dataset.phenotypes[pheno] for pheno in selected_phenotypes]).transpose()
for pheno in Y_train_cpu:
    Y_train_cpu[pheno] /= train_dataset.pheno_std[pheno]
Y_train_gpu = Y_train_cpu

X_validation = validation_dataset.get_all_SNP()
Y_validation = pd.DataFrame([validation_dataset.phenotypes[pheno] for pheno in selected_phenotypes]).transpose()

for pheno in Y_validation:
    Y_validation[pheno] /= validation_dataset.pheno_std[pheno]

              ep_res    de_res  FESSEp_res  FESSEa_res
id                                                    
BBB2024_1   2.340485  1.048687    4.869720    3.754475
BBB2024_2  -0.029124  4.097066    1.512663    0.915872
BBB2024_3   3.009160  1.033428    3.086161    2.775228
BBB2024_4   3.133567  7.720563    4.487325    4.062251
BBB2024_5   3.449609  4.448934    5.071065    4.734881
BBB2024_6   0.394976 -4.779438    0.166496    0.263310
BBB2024_7   2.906651 -1.434209    1.090841    3.148582
BBB2024_8   1.517752  0.903852   -0.878456   -0.016035
BBB2024_9  -0.086624 -0.349057    1.429496    0.997888
BBB2024_10  2.075847  3.703129    2.241961    1.641621
              ep_res    de_res  FESSEp_res  FESSEa_res
id                                                    
BBB2024_1   1.083635  0.263945    2.975387    2.285565
BBB2024_2  -0.013484  1.031195    0.924234    0.557544
BBB2024_3   1.393229  0.260105    1.885637    1.689441
BBB2024_4   1.450829  1.943196    2.741745    2.472926
BBB2024_5 

In [7]:
sub_sampling = [1, 0.5]
learning_rates = [0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.001, 0.0005, 0.0001]
max_depth = [12, 11, 10, 9, 8, 7, 6]

nb_phenotypes = len(selected_phenotypes)
MAE_results = np.zeros((nb_phenotypes, len(sub_sampling), len(learning_rates), len(max_depth)))
correlation_results = np.zeros((nb_phenotypes, len(sub_sampling), len(learning_rates), len(max_depth)))

start_time = time.time()
iteration_counter = 0
for i,sub_sampling_value in enumerate(sub_sampling):
    for j,learning_rates_value in enumerate(learning_rates):
        for k,depth in enumerate(max_depth):
            model = XGBRegressor(n_estimators=1,
                                subsample=sub_sampling_value,
                                learning_rate=learning_rates_value,
                                max_depth= depth,
                                n_jobs = -1,
                                random_state=2307, 
                                device="cpu")
            model = model.fit(X_train, Y_train_gpu)
            validation_predictions = model.predict(X_validation)

            for m in range(nb_phenotypes):
                MAE_results[m,i,j,k] = mean_absolute_error(Y_validation.iloc[:, m] * validation_dataset.pheno_std[selected_phenotypes[m]], validation_predictions[:, m] * validation_dataset.pheno_std[selected_phenotypes[m]]) 
                correlation_results[m,i,j,k] = pearsonr(Y_validation.iloc[:, m], validation_predictions[:, m]).statistic
                
            iteration_counter += 1
            
            print("////////////////////////////////////////////")
            print(f"Iteration {iteration_counter}/{len(sub_sampling) * len(learning_rates) * len(max_depth)} finished")
            print("Hyper parameters tested:")
            print(f"    - sub_sampling: {sub_sampling_value}")
            print(f"    - learning_rate: {learning_rates_value}")
            print(f"    - depth: {depth}")
            print(f"Elapsed time from start: {print_elapsed_time(start_time)}")
            print(f"Results:")
            print(f"    - MAE : {MAE_results[:,i,j,k]}")
            print(f"    - Correlation : {correlation_results[:,i,j,k]}")
            break
        break
    break

print("////////////////////////////////////////////")
print(f"Computation finished in {int((time.time() - start_time) // 3600)}h {int(((time.time() - start_time) % 3600) // 60)}m {int((time.time() - start_time) % 60)}s")


////////////////////////////////////////////
Iteration 1/126 finished
Hyper parameters tested:
    - sub_sampling: 1
    - learning_rate: 0.5
    - depth: 12
Elapsed time from start: 0d 0h 0m 23s
Results:
    - MAE : [ 4.41364255 18.02678972  2.69455864  2.90826075]
    - Correlation : [0.13671239 0.04846106 0.04651116 0.03983687]
////////////////////////////////////////////
Computation finished in 0h 0m 23s


In [None]:
with open("tmp.json", "w") as f:
    results = {
        "dim_0_values": Y_validation.columns.to_list(),
        "dim_0_label": "phenotypes",
        "dim_1_values": sub_sampling,
        "dim_1_label": "sub_sampling",
        "dim_2_values": learning_rates,
        "dim_2_label": "learning_rates",
        "dim_3_values": max_depth,
        "dim_3_label": "max_depth",
        "correlation": correlation_results.tolist(),
        "MAE": MAE_results.tolist()
    }
    json.dump(results, f)