In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

In [9]:
filename ='data/star_classification.csv'
sdss17_df_raw = pd.read_csv(filename)

num_rows = np.shape(sdss17_df_raw)[0]
num_cols = np.shape(sdss17_df_raw)[1]

sdss17_df = sdss17_df_raw.drop(['obj_ID', 'alpha', 'delta', 'run_ID', 'rerun_ID', 'cam_col', 'field_ID', 'fiber_ID', 'spec_obj_ID', 'MJD', 'plate'], axis = 1)

# Remove Very Bright Objects
sdss17_df_no_anomaly = sdss17_df[sdss17_df['u'] > -1]

num_rows = np.shape(sdss17_df_no_anomaly)[0]
num_cols = np.shape(sdss17_df_no_anomaly)[1]

sdss17_features = sdss17_df_no_anomaly.drop('class', axis=1, inplace=False)
sdss17_class_categorical = sdss17_df_no_anomaly['class']
classes = sdss17_class_categorical.unique() # redundant - should be created earlier

X_train, X_test, y_train, y_test = train_test_split(
    sdss17_features,
    sdss17_class_categorical,
    test_size=0.33,
    random_state=1000)

In [4]:
scorers = {
            'f1_score': make_scorer(f1_score, average='macro'),
            'precision_score': make_scorer(precision_score, average='macro'),
            'recall_score': make_scorer(recall_score, average='macro'),
            'accuracy_score': make_scorer(accuracy_score)
          }

def multi_gridsearch(pipes, parameters):
    """
    Apply Grid Search to different pipelines of the same model.
    
    pipes: a list Pipeline objects
    parameters: the parameter grid for Grid Search
    """
    for pipe in pipes:
        print(pipe)
        grid_pipeline = GridSearchCV(pipe, parameters, verbose=1, cv=3, scoring=scorers, refit="precision_score")
        grid_pipeline.fit(X_train,y_train)
        print(grid_pipeline.best_params_, end='\n\n')

In [5]:

smote = Pipeline([('SMOTE', SMOTE(random_state=0)),
                  ('rfc', RandomForestClassifier(random_state=0))
                 ])

roversample = Pipeline([('Oversample', RandomOverSampler(random_state=0)),
                        ('rfc', RandomForestClassifier(random_state=0))
                       ])



smoteenn = Pipeline([('SMOTEENN', SMOTEENN(random_state=0)),
                  ('rfc', RandomForestClassifier(random_state=0))
                 ])

param_grid = {
  'rfc__n_estimators':[500, 1000],
  'rfc__min_samples_split': [2, 4]
  }

pipeline_list= [smote, roversample, smoteenn]
multi_gridsearch(pipeline_list, param_grid)

Pipeline(steps=[('SMOTE', SMOTE()), ('rfc', RandomForestClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'rfc__min_samples_split': 4, 'rfc__n_estimators': 500}

Pipeline(steps=[('Oversample', RandomOverSampler()),
                ('rfc', RandomForestClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'rfc__min_samples_split': 2, 'rfc__n_estimators': 1000}

Pipeline(steps=[('SMOTEENN', SMOTEENN()), ('rfc', RandomForestClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'rfc__min_samples_split': 4, 'rfc__n_estimators': 1000}



In [6]:

smote = Pipeline([('SMOTE', SMOTE(random_state=0)),
                  ('et', ExtraTreesClassifier(random_state=0))
                 ])

roversample = Pipeline([('Oversample', RandomOverSampler(random_state=0)),
                        ('et', ExtraTreesClassifier(random_state=0))
                       ])



smoteenn = Pipeline([('SMOTEENN', SMOTE(random_state=0)),
                  ('et', ExtraTreesClassifier(random_state=0))
                 ])



param_grid = {
  'et__n_estimators':[500, 1000],
  'et__min_samples_split': [2, 4]
  }

pipeline_list=[smote, roversample, smoteenn]

multi_gridsearch(pipeline_list, param_grid)

Pipeline(steps=[('SMOTE', SMOTE()), ('et', ExtraTreesClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'et__min_samples_split': 4, 'et__n_estimators': 500}

Pipeline(steps=[('Oversample', RandomOverSampler()),
                ('et', ExtraTreesClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'et__min_samples_split': 2, 'et__n_estimators': 1000}

Pipeline(steps=[('SMOTEENN', SMOTE()), ('et', ExtraTreesClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'et__min_samples_split': 2, 'et__n_estimators': 500}



In [11]:
brf = Pipeline([('brf', BalancedRandomForestClassifier())])


param_grid = {
  'brf__n_estimators':[500, 1000],
  'brf__min_samples_split': [2, 4]
  }

pipeline_list=[brf]

multi_gridsearch(pipeline_list, param_grid)

Pipeline(steps=[('brf', BalancedRandomForestClassifier())])
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'brf__min_samples_split': 2, 'brf__n_estimators': 1000}

