In [1]:
import datetime
import os

import joblib
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import load_dataset as ld

In [None]:
# Names of the planetary and stellar parameters handled in this notebook.
# 'radius' is listed last -- presumably the prediction target; verify
# against the column order produced by the dataset loader.
feature_names = [
    'mass',
    'semi_major_axis',
    'eccentricity',
    'star_metallicity',
    'star_radius',
    'star_teff',
    'star_mass',
    'radius',
]

In [None]:
def split_dataset(dataset, n_solar=8):
    """Split ``dataset`` into its leading rows and its last ``n_solar`` rows.

    The local names suggest the trailing rows are the Solar System planets
    and the leading rows are exoplanets -- presumably that is how the
    dataset is ordered; verify against the loader.

    Parameters
    ----------
    dataset : sliceable, sized sequence (e.g. a pandas DataFrame or a list)
        Rows to split; must support ``len()`` and slice indexing.
    n_solar : int, optional
        Number of trailing rows that form the second part (default 8).

    Returns
    -------
    (dataset_exo, dataset_sol) : tuple
        ``dataset_exo`` holds everything except the last ``n_solar`` rows,
        ``dataset_sol`` holds the last ``n_solar`` rows.  If ``dataset`` has
        fewer than ``n_solar`` rows, ``dataset_exo`` is empty and
        ``dataset_sol`` is the whole dataset.

    Raises
    ------
    ValueError
        If ``n_solar`` is negative.
    """
    if n_solar < 0:
        raise ValueError("n_solar must be non-negative")

    # Use an explicit split position rather than a negative slice so that
    # n_solar == 0 works (``dataset[:-0]`` would be empty, not everything).
    split_at = max(len(dataset) - n_solar, 0)
    dataset_exo = dataset[:split_at]
    dataset_sol = dataset[split_at:]
    return dataset_exo, dataset_sol

In [None]:
def build_random_forest(dataset):
    """Unfinished stub: slice ``dataset`` into head and tail, return nothing.

    Only the slicing is implemented so far: everything except the last
    eight rows, and the last eight rows (presumably exoplanets and Solar
    System planets respectively -- TODO confirm).  No model is built and
    the function implicitly returns ``None``.
    """
    # Both slices are currently unused placeholders for the future body.
    exoplanet_rows, solar_rows = dataset[:-8], dataset[-8:]

In [None]:
def random_forest_regression(dataset,
                             model=None,
                             fit=False,
                             n_exoplanets=501):
    """Train and evaluate a random forest that predicts the last column.

    The first ``n_exoplanets`` rows of ``dataset`` are treated as the
    exoplanet sample and the remaining rows as the Solar System sample.
    Each sample is split separately into a training (75%) and a testing
    (25%) set with ``random_state=0`` and the pieces are concatenated.
    Three outliers are then dropped by index label: 'HATS-12 b' from the
    test set, 'K2-95 b' and 'Kepler-11 g' from the training set.

    If ``fit`` is True, a grid search selects the hyperparameters and an
    *unfitted* RandomForestRegressor configured with them is saved with
    joblib under 'bem_output/'.  Otherwise an estimator is loaded with
    joblib from ``model``.  In both cases the estimator is then (re)fitted
    on the training set before the scores are printed.

    INPUTS: dataset = pandas dataframe indexed by planet name, with the
                      features in all columns but the last and the target
                      (radius) in the last column
            model = path of a joblib file holding a random forest model;
                    only used when fit is False.  When omitted, the
                    notebook-level name ``saved_pickle_model`` is used
                    if it is defined
            fit = boolean, run the hyperparameter grid search (True)
                  or load a saved model (False)
            n_exoplanets = number of leading rows that form the exoplanet
                           sample (default 501)
    OUTPUTS: regr = the random forest regression model, fitted on the
                    training set
             y_test_predict = radius predictions of the test set
             train_test_values = [X_train, X_test, y_train, y_test] as
                                 numpy arrays
             train_test_sets = the same four sets as pandas objects
    RAISES: ValueError if fit is False and no model path is available
            KeyError if one of the three outliers is not in the set it is
            dropped from
    """

    # Resolve the saved-model path first so that a missing path fails fast,
    # before any splitting work is done.  The fallback is looked up at call
    # time: a default argument would be evaluated when the function is
    # defined and raise NameError if the name does not exist yet.
    if not fit and model is None:
        try:
            model = saved_pickle_model
        except NameError:
            raise ValueError("No saved model to load: pass `model=<path>` "
                             "or call with `fit=True`.") from None

    # Preparing the training and test sets
    # ------------------------------------
    # Exoplanet and Solar system dataset
    dataset_exo = dataset[:n_exoplanets]
    dataset_sol = dataset[n_exoplanets:]

    # Separating the data into dependent and independent variables
    features = dataset_exo.iloc[:, :-1]   # every column but the last
    labels = dataset_exo.iloc[:, -1]      # last column (radius)

    # Splitting the exoplanet sample into the Training set and Test set
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.25,
                                                        random_state=0)

    # Same split, applied separately to the Solar System sample
    features_sol = dataset_sol.iloc[:, :-1]
    labels_sol = dataset_sol.iloc[:, -1]

    X_train_sol, X_test_sol, y_train_sol, y_test_sol = train_test_split(features_sol,
                                                                        labels_sol,
                                                                        test_size=0.25,
                                                                        random_state=0)

    # pd.concat replaces DataFrame.append / Series.append, which were
    # removed in pandas 2.0.
    X_train = pd.concat([X_train, X_train_sol])
    y_train = pd.concat([y_train, y_train_sol])
    X_test = pd.concat([X_test, X_test_sol])
    y_test = pd.concat([y_test, y_test_sol])

    # Outliers in the sample
    # Remove HATS-12 b from the test set
    X_test = X_test.drop(['HATS-12 b'])
    y_test = y_test.drop(labels=['HATS-12 b'])
    print('\nHATS-12 b removes from test set\n')

    # Remove K2-95 b from the training set
    X_train = X_train.drop(['K2-95 b'])
    y_train = y_train.drop(labels=['K2-95 b'])
    print('\nK2-95 b removes from training set\n')

    # Remove Kepler-11 g from the training set
    X_train = X_train.drop(['Kepler-11 g'])
    y_train = y_train.drop(labels=['Kepler-11 g'])
    print('\nKepler-11 g removes from training set\n')

    train_test_values = [X_train.values, X_test.values,
                         y_train.values, y_test.values]
    train_test_sets = [X_train, X_test, y_train, y_test]

    # Fitting the hyperparameters of the random forest model
    # with the grid search method
    # ------------------------------------------------------
    if fit:
        # Setting up the grid of hyperparameters
        # NOTE(review): the estimator inside the grid search is not seeded,
        # so the selected hyperparameters may differ between runs.
        rf = GridSearchCV(RandomForestRegressor(),
                          param_grid={'n_estimators': np.arange(80, 200),
                                      'max_depth': np.arange(4, 10),
                                      'max_features': np.arange(3, 6),
                                      'min_samples_split': np.arange(4, 5)},
                          cv=3, verbose=1, n_jobs=-1)

        # Fitting training set - finding best hyperparameters
        rf.fit(X_train, y_train)

        # Best hyperparameters found by the grid search
        print(rf.best_params_)

        # Random forest model with the best hyperparameters
        regr = RandomForestRegressor(n_estimators=rf.best_params_['n_estimators'],
                                     max_depth=rf.best_params_['max_depth'],
                                     max_features=rf.best_params_['max_features'],
                                     min_samples_split=rf.best_params_['min_samples_split'],
                                     random_state=0, oob_score=True)

        # Saving the (still unfitted) random forest model in a file.
        # makedirs(exist_ok=True) avoids the check-then-create race.
        outdir = 'bem_output'
        os.makedirs(outdir, exist_ok=True)

        # NOTE(review): the ':' in the timestamp is not a valid file name
        # character on Windows.
        name_Rf = 'r2_' + str(round(rf.best_score_, 2)) + '_' + str(datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")) + '.pkl'
        name_Rf = os.path.join(outdir, name_Rf)

        joblib.dump(regr, name_Rf)
        print('RF model save in : ', name_Rf)

    else:
        # Loading the random forest model saved
        # NOTE: joblib.load unpickles arbitrary objects -- only load files
        # from a trusted source.
        print('Loading random forest model: ', model)
        regr = joblib.load(model)

    # Fit the best random forest model to the training set
    # ----------------------------------------------------
    regr.fit(X_train, y_train)

    # Predict the radius for the training and testing sets
    y_train_predict = regr.predict(X_train)
    y_test_predict = regr.predict(X_test)

    # Scores of the random forest
    test_score = r2_score(y_test, y_test_predict)
    pearson = pearsonr(y_test, y_test_predict)
    print(f'Test set, R-2 score: {test_score:>5.3}')
    print(f'\nTest set, Pearson correlation: {pearson[0]:.3}')

    # Root mean squared errors of the train and test set
    print('Root mean squared errors')
    print('Train set: ', np.sqrt(np.mean((y_train-y_train_predict)**2)),
          '\nTest set:  ', np.sqrt(np.mean((y_test-y_test_predict)**2)))

    # Feature importance: pair each importance with the feature column it
    # belongs to (the label column is not part of X_train).
    print('\nFeature importance')
    for name, value in zip(X_train.columns.tolist(), regr.feature_importances_):
        print(name, ':  \t', value)

    return regr, y_test_predict, train_test_values, train_test_sets