In [1]:
from dataset import SNPmarkersDataset
from xgboost import XGBRegressor
from scipy.stats import pearsonr
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time
import json
import pandas as pd

In [2]:
# Phenotype columns used as regression targets (one model output per entry).
selected_phenotypes = ["ep_res","de_res","FESSEp_res","FESSEa_res"]


def _phenotype_targets(dataset, phenotype_names):
    """Return a DataFrame with one column per requested phenotype.

    Each ``dataset.phenotypes[name]`` entry is stacked as a row and the result
    is transposed so that phenotypes end up as columns.
    """
    stacked_rows = [dataset.phenotypes[name] for name in phenotype_names]
    return pd.DataFrame(stacked_rows).T


# NOTE(review): `set_phenotypes` is *assigned* here, not called. This presumably
# relies on SNPmarkersDataset exposing it as a property setter -- if it is a
# plain method, the assignment only rebinds the attribute. Confirm in dataset.py.
train_dataset = SNPmarkersDataset(mode="local_train")
train_dataset.set_phenotypes = selected_phenotypes
validation_dataset = SNPmarkersDataset(mode="validation")
validation_dataset.set_phenotypes = selected_phenotypes

# Training split: SNP feature matrix and phenotype targets.
# `Y_train_gpu` is an alias of the very same DataFrame object as `Y_train_cpu`.
X_train = train_dataset.get_all_SNP()
Y_train_cpu = Y_train_gpu = _phenotype_targets(train_dataset, selected_phenotypes)

# Validation split, built the same way.
X_validation = validation_dataset.get_all_SNP()
Y_validation = _phenotype_targets(validation_dataset, selected_phenotypes)

In [None]:
# Hyper-parameter grid explored for the multi-output XGBoost regressor.
sub_sampling = [1, 0.5]
learning_rates = [0.5, 0.4, 0.3, 0.2, 0.1, 0.01, 0.001, 0.0005, 0.0001]
max_depth = [12, 11, 10, 9, 8, 7, 6]

# Number of grid points to evaluate before stopping.
#   1    -> timing smoke test: a single fit (replaces the stray `break`s).
#   None -> sweep the whole grid.
# Must be None or >= 1 (the limit is checked after each completed iteration).
MAX_COMBINATIONS = 1

nb_phenotypes = Y_validation.shape[-1]
grid_shape = (len(sub_sampling), len(learning_rates), len(max_depth))
total_combinations = len(sub_sampling) * len(learning_rates) * len(max_depth)

# Axis 0 indexes the phenotype; axes 1-3 follow the three grid lists above.
# Grid points that are never evaluated (see MAX_COMBINATIONS) keep their
# initial 0.0 in both arrays and are written to the JSON file as such.
MAE_results = np.zeros((nb_phenotypes, *grid_shape))
correlation_results = np.zeros((nb_phenotypes, *grid_shape))

start_time = time.time()
iteration_counter = 0
# np.ndindex walks the grid in C order (last index fastest), i.e. the same
# order as three nested loops over sub_sampling / learning_rates / max_depth.
for i, j, k in np.ndindex(*grid_shape):
    sub_sampling_value = sub_sampling[i]
    learning_rates_value = learning_rates[j]
    depth = max_depth[k]

    model = XGBRegressor(n_estimators=1000,
                         subsample=sub_sampling_value,
                         learning_rate=learning_rates_value,
                         max_depth=depth,
                         n_jobs=-1,
                         random_state=2307,
                         device="cpu")
    # Targets come from the data-loading cell; training runs on CPU
    # (device="cpu") regardless of the variable's name.
    model = model.fit(X_train, Y_train_gpu)
    # Only validation predictions are scored, so the training set is not
    # predicted here.
    validation_predictions = model.predict(X_validation)

    # One prediction column per phenotype: score each column separately.
    for m in range(nb_phenotypes):
        MAE_results[m,i,j,k] = mean_absolute_error(Y_validation.iloc[:, m], validation_predictions[:, m])
        correlation_results[m,i,j,k] = pearsonr(Y_validation.iloc[:, m], validation_predictions[:, m]).statistic

    iteration_counter += 1

    # Take one clock reading so the minutes and seconds parts always agree.
    elapsed_minutes, elapsed_seconds = divmod(int(time.time() - start_time), 60)

    print("////////////////////////////////////////////")
    print(f"Iteration {iteration_counter}/{total_combinations} finished")
    print("Hyper parameters tested:")
    print(f"    - sub_sampling: {sub_sampling_value}")
    print(f"    - learning_rate: {learning_rates_value}")
    print(f"    - depth: {depth}")
    print(f"Elapsed time from start: {elapsed_minutes}m {elapsed_seconds}s")
    print("Results:")
    print(f"    - MAE : {MAE_results[:,i,j,k]}")
    print(f"    - Correlation : {correlation_results[:,i,j,k]}")

    if MAX_COMBINATIONS is not None and iteration_counter >= MAX_COMBINATIONS:
        break

total_minutes, total_seconds = divmod(int(time.time() - start_time), 60)
total_hours, total_minutes = divmod(total_minutes, 60)
print("////////////////////////////////////////////")
print(f"Computation finished in {total_hours}h {total_minutes}m {total_seconds}s")

# Build the payload before opening the file so that a failure while preparing
# it cannot leave a truncated tmp.json behind.
results = {
    "dim_0_values": Y_validation.columns.to_list(),
    "dim_0_label": "phenotypes",
    "dim_1_values": sub_sampling,
    "dim_1_label": "sub_sampling",
    "dim_2_values": learning_rates,
    "dim_2_label": "learning_rates",
    "dim_3_values": max_depth,
    "dim_3_label": "max_depth",
    "correlation": correlation_results.tolist(),
    "MAE": MAE_results.tolist()
}
with open("tmp.json", "w") as f:
    json.dump(results, f)

////////////////////////////////////////////
Iteration 1/126 finished
Hyper parameters tested:
    - sub_sampling: 1
    - learning_rate: 0.5
    - depth: 12
Elapsed time from start: 4m 17s
Results:
    - MAE : [1.55523461 3.22876916 1.28514196 1.37045504]
    - Correlation : [0.1803991  0.16892071 0.07338055 0.0811551 ]
////////////////////////////////////////////
Computation finished in 0h 4m 17s
