In [1]:
from dataset import SNPmarkersDataset
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
import numpy as np
from sklearn.metrics import mean_absolute_error
import time
import pandas as pd

In [2]:
train_dataset = SNPmarkersDataset(mode="local_train", skip_check= True)
validation_dataset = SNPmarkersDataset(mode="validation", skip_check= True)
phenotypes = list(train_dataset.phenotypes.keys())

train_dataset.set_phenotypes = phenotypes[0]
X_train = train_dataset.get_all_SNP()

In [3]:
max_depth = [15]
max_features = [int(np.sqrt(X_train.shape[-1])), 0.005]
MAE_results = np.zeros((len(max_depth), len(max_features)))
correlation_results = np.zeros((len(max_depth), len(max_features)))  

In [9]:
print(np.array(train_dataset.phenotypes["ep_res"]).ravel()[0:5])
print((np.array(train_dataset.phenotypes["ep_res"]).ravel()/ train_dataset.pheno_std[phenotype])[0:5])

[ 2.340485 -0.029124  3.00916   3.133567  3.449609]
[ 1.08363485 -0.01348429  1.3932286   1.45082853  1.59715467]


In [7]:
for phenotype in phenotypes:
    train_dataset.set_phenotypes = phenotype
    validation_dataset.set_phenotypes = phenotype

    X_train = train_dataset.get_all_SNP()
    Y_train = np.array(train_dataset.phenotypes[phenotype]).ravel() / train_dataset.pheno_std[phenotype]
    
    X_validation = validation_dataset.get_all_SNP()
    Y_validation = np.array(validation_dataset.phenotypes[phenotype]).ravel()

    print(X_train.shape)
    print(Y_train.shape)
    print(X_validation.shape)
    print(Y_validation.shape)
    
    for i,max_depth_value in enumerate(max_depth):
        for j, max_feature in enumerate(max_features):
            model = RandomForestRegressor(n_estimators=4, max_depth=max_depth_value, max_features=max_feature, random_state=2307, n_jobs=-1).fit(X_train, Y_train)
            predictions = model.predict(X_validation)
            print("////////////////////////////////////////////")
            print(f"Iteration {i * len(max_features) + (j+1)}/{len(max_features) * len(max_depth)}")
            if type(max_feature) == int:
                max_nb_of_tree = max_feature
            elif type(max_feature) == float:
                max_nb_of_tree = int(max_feature*X_train.shape[-1])
            print(f"Max depth value tested: {max_depth_value}, max nb of features used per tree: {max_nb_of_tree}")

            MAE_results[i][j] = mean_absolute_error(Y_validation, predictions)
            correlation_results[i][j] = pearsonr(Y_validation, predictions).statistic
            print("--------------------------------------------")
            print(f"Pearson correlation for {phenotype}: {correlation_results[i][j]:.5f}")
            print(f"MAE results for {phenotype}: {MAE_results[i][j]:.5f}")


(1000, 36304)
(1000,)
(1000, 36304)
(1000,)
////////////////////////////////////////////
Iteration 1/8
Max depth value tested: 15, max nb of features used per tree: 190
--------------------------------------------
Pearson correlation for ep_res: 0.07183
MAE results for ep_res: 2.36750
////////////////////////////////////////////
Iteration 2/8
Max depth value tested: 15, max nb of features used per tree: 181
--------------------------------------------
Pearson correlation for ep_res: 0.12907
MAE results for ep_res: 2.40059
////////////////////////////////////////////
Iteration 3/8
Max depth value tested: 15, max nb of features used per tree: 190


IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:
max_depth = list()
avg_depth = []
for tree in model.estimators_:
    max_depth.append(tree.tree_.max_depth)

max_depth

[15, 15, 15, 15]

In [None]:
pd.DataFrame(MAE_results, 
                    index=[f"max_depth = {i}" for i in max_depth], 
                    columns=[f"max_features = {i}" for i in max_features]
                    )

In [None]:
pd.DataFrame(correlation_results, 
                    index=[f"max_depth = {i}" for i in max_depth], 
                    columns=[f"max_features = {i}" for i in max_features]
                    )