In [1]:
from dataset import SNPmarkersDataset
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
import numpy as np
from sklearn.metrics import mean_absolute_error
import time
import pandas as pd

In [2]:
# Phenotype columns used as regression targets throughout the notebook.
selected_phenotypes = ["ep_res", "de_res", "FESSEp_res", "FESSEa_res"]


def _phenotype_frame(dataset):
    """Stack the selected phenotypes of `dataset` into one DataFrame.

    One entry per name in `selected_phenotypes` is read from
    `dataset.phenotypes`; the stacked result is transposed so that each
    phenotype ends up along the column axis, in the order given by
    `selected_phenotypes`.
    """
    per_phenotype = [dataset.phenotypes[name] for name in selected_phenotypes]
    return pd.DataFrame(per_phenotype).T


# NOTE(review): `set_phenotypes` is *assigned* here, not called. That only
# restricts the dataset if SNPmarkersDataset exposes it as a property setter;
# otherwise this just attaches a plain attribute — confirm in dataset.py.
train_dataset = SNPmarkersDataset(mode="local_train")
train_dataset.set_phenotypes = selected_phenotypes
validation_dataset = SNPmarkersDataset(mode="validation")
validation_dataset.set_phenotypes = selected_phenotypes

# Training split: SNP matrix as features, selected phenotypes as targets.
X_train = train_dataset.get_all_SNP()
Y_train = _phenotype_frame(train_dataset)

# Validation split, built the same way.
X_validation = validation_dataset.get_all_SNP()
Y_validation = _phenotype_frame(validation_dataset)

In [5]:
# Hyper-parameter grid explored by the random-forest search.
max_depth = [15]

# Two `max_features` candidates, both passed straight to RandomForestRegressor:
# an absolute count (integer square root of the number of SNP columns) and a
# fraction of the columns (0.5 %).
n_snp_columns = X_train.shape[-1]
max_features = [int(np.sqrt(n_snp_columns)), 0.005]

nb_phenotypes = len(selected_phenotypes)

# One score per (phenotype, max_depth index, max_features index).
results_shape = (nb_phenotypes, len(max_depth), len(max_features))
MAE_results = np.zeros(results_shape)
correlation_results = np.zeros(results_shape)

In [7]:
# Grid search: fit one multi-output random forest per (max_depth, max_features)
# pair on the training split, then score every phenotype on the validation
# split. Scores are stored in MAE_results / correlation_results at
# [phenotype index, max_depth index, max_features index].
nb_runs = len(max_features) * len(max_depth)

for i, max_depth_value in enumerate(max_depth):
    for j, max_feature in enumerate(max_features):
        model = RandomForestRegressor(
            n_estimators=4,
            max_depth=max_depth_value,
            max_features=max_feature,
            random_state=2307,  # fixed seed
            n_jobs=-1,
        ).fit(X_train, Y_train)

        # predict() returns a 1-D array when only one target was fitted; force
        # a (n_samples, n_targets) layout so the column indexing below also
        # works with a single selected phenotype.
        predictions = np.asarray(model.predict(X_validation))
        predictions = predictions.reshape(len(predictions), -1)

        print("////////////////////////////////////////////")
        print(f"Iteration {i * len(max_features) + (j+1)}/{nb_runs}")

        # Report the feature count the fitted trees actually resolved instead
        # of re-deriving it by hand. The previous exact-type checks
        # (`type(x) == int` / `== float`) left this variable undefined or stale
        # for any other spec (NumPy scalars, "sqrt", "log2", None) and skipped
        # the lower bound of 1 that scikit-learn applies to fractions.
        # The variable name is kept for compatibility with other cells; the
        # value is a number of features, not a number of trees.
        max_nb_of_tree = model.estimators_[0].max_features_
        print(f"Max depth value tested: {max_depth_value}, max nb of features used per tree: {max_nb_of_tree}")

        for k in range(nb_phenotypes):
            observed = Y_validation.iloc[:, k]
            predicted = predictions[:, k]

            MAE_results[k][i][j] = mean_absolute_error(observed, predicted)
            correlation_results[k][i][j] = pearsonr(observed, predicted).statistic
            print("--------------------------------------------")
            print(f"Pearson correlation for {selected_phenotypes[k]}: {correlation_results[k][i][j]:.5f}")
            print(f"MAE results for {selected_phenotypes[k]}: {MAE_results[k][i][j]:.5f}")


////////////////////////////////////////////
Iteration 1/2
Max depth value tested: 15, max nb of features used per tree: 190
--------------------------------------------
Pearson correlation for ep_res: 0.08271
MAE results for ep_res: 1.61798
--------------------------------------------
Pearson correlation for de_res: 0.06519
MAE results for de_res: 3.27636
--------------------------------------------
Pearson correlation for FESSEp_res: 0.07817
MAE results for FESSEp_res: 1.25472
--------------------------------------------
Pearson correlation for FESSEa_res: 0.06520
MAE results for FESSEa_res: 1.35821
////////////////////////////////////////////
Iteration 2/2
Max depth value tested: 15, max nb of features used per tree: 181
--------------------------------------------
Pearson correlation for ep_res: 0.05823
MAE results for ep_res: 1.67724
--------------------------------------------
Pearson correlation for de_res: 0.03112
MAE results for de_res: 3.33628
--------------------------------