In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import ttest_rel
import statsmodels.api as sm
import seaborn as sns
import pickle



from utils import *

%load_ext autoreload
%autoreload 2
sns.set_context("talk")
conditions = ['hf', 'pn', 'copd']


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
### LOAD IN ###
Xs = []
ys = []
for cond in conditions:
    y = pd.read_csv(DM_PATH / f"{cond}_y_90.csv", index_col=False)
    ys.append(y.squeeze())

    X = pd.read_csv(DM_PATH / f"{cond}_X_race_90.csv", index_col=False)
    Xs.append(X)

In [31]:
rfpal = sns.color_palette("mako", 4)
rfpal = rfpal.as_hex()
rfpal

## Hyperparams

In [32]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid to search through
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [10, 20, 40],
    "min_samples_leaf": [2, 4, 8, 16, 32],
    "max_features": ["auto", "sqrt"],
}

In [33]:
# Initialize the random forest model
hf_model = RandomForestRegressor(random_state=87)

# Perform GridSearchCV for hyperparameter tuning
hf_grid_search = GridSearchCV(
    estimator=hf_model, param_grid=param_grid, scoring="neg_mean_squared_error", cv=5
)
hf_grid_search.fit(Xs[0], ys[0])

1500 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.1

In [34]:
# Print the best hyperparameters and the corresponding RMSE
print("Best Hyperparameters for HF 90th Model :", hf_grid_search.best_params_)
print("Best RMSE for HF 90th Model :", np.sqrt(-hf_grid_search.best_score_))

Best Hyperparameters for HF 90th Model : {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 20, 'n_estimators': 50}
Best RMSE for HF 90th Model : 2.6083954922637753


In [35]:
# Initialize the random forest model
pn_model = RandomForestRegressor(random_state=87)

# Perform GridSearchCV for hyperparameter tuning
pn_grid_search = GridSearchCV(
    estimator=pn_model, param_grid=param_grid, scoring="neg_mean_squared_error", cv=5
)
pn_grid_search.fit(Xs[1], ys[1])

1500 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.1

In [36]:
# Print the best hyperparameters and the corresponding RMSE
print("Best Hyperparameters for PN 90th Model :", pn_grid_search.best_params_)
print("Best RMSE for PN 90th Model :", np.sqrt(-pn_grid_search.best_score_))

Best Hyperparameters for PN 90th Model : {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 32, 'min_samples_split': 10, 'n_estimators': 100}
Best RMSE for PN 90th Model : 2.1099437518748028


In [37]:
# Initialize the random forest model
copd_model = RandomForestRegressor(random_state=87)

# Perform GridSearchCV for hyperparameter tuning
copd_grid_search = GridSearchCV(
    estimator=copd_model, param_grid=param_grid, scoring="neg_mean_squared_error", cv=5
)
copd_grid_search.fit(Xs[2], ys[2])

1500 fits failed out of a total of 3000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/jessbreda/anaconda3/envs/ds4a_py310/lib/python3.1

In [38]:
# Print the best hyperparameters and the corresponding RMSE
print("Best Hyperparameters for COPD 90th Model :", copd_grid_search.best_params_)
print("Best RMSE for COPD 90th Model :", np.sqrt(-copd_grid_search.best_score_))

Best Hyperparameters for COPD 90th Model : {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best RMSE for COPD 90th Model : 2.450842450830314


In [39]:
# Create a new dictionary to store all three dictionaries with appropriate keys
combined_dict = {
    "hf": hf_grid_search.best_params_,
    "pn": pn_grid_search.best_params_,
    "copd": copd_grid_search.best_params_,
}

# Save the combined dictionary as a pickle file
with open(RESULTS_PATH / "rf_hyperparams_90th_all_conds.pickle", "wb") as file:
    pickle.dump(combined_dict, file)

In [40]:
importances_dfs = []
for cond, X, y in zip(conditions, Xs, ys):
    # fit
    rf_model = RandomForestRegressor(random_state=87, **combined_dict[cond])
    rf_model.fit(X, y)

    # get importances
    importances = rf_model.feature_importances_
    feature_importance_df = pd.DataFrame(
        {
            "feature": X.columns,
            "importance": importances,
            "condition": [cond] * len(X.columns),
        }
    )
    importances_dfs.append(feature_importance_df)

In [41]:
pd.concat(importances_dfs).to_csv(
    RESULTS_PATH / "rf_all_conds_feature_importances_90th.csv"
)