In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.metrics import mean_absolute_error, accuracy_score
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from modeling import select_best_classifier

### Load Data

In [6]:
# Load the previously prepared splits for the machine-learning models.
# Every file is read as comma-delimited text; features (X_*) and targets (y_*)
# live in separate files for the train / validation / test splits.
X_train, X_val, X_test = (
    np.genfromtxt(features_path, delimiter=',')
    for features_path in (
        './data_preprocessed/X_train',
        './data_preprocessed/X_val',
        './data_preprocessed/X_test',
    )
)

y_train, y_val, y_test = (
    np.genfromtxt(target_path, delimiter=',')
    for target_path in (
        './data_preprocessed/y_train',
        './data_preprocessed/y_val',
        './data_preprocessed/y_test',
    )
)

# Sanity check: print the shape of every loaded array
# (features on the first line, targets on the second).
print(*(features.shape for features in (X_train, X_val, X_test)))
print(*(targets.shape for targets in (y_train, y_val, y_test)))

(4250, 140) (375, 140) (375, 140)
(4250,) (375,) (375,)


### Scaling

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler

class MultipleColumnScaler(BaseEstimator, TransformerMixin):
    '''Scale several columns with ONE shared scaler, keeping the original ratio between them.

    A regular scikit-learn scaler learns separate statistics per column. This
    wrapper instead stacks all columns of ``X`` into a single column, fits the
    wrapped scaler on it once, and then applies that single set of statistics
    to every value, so the relative magnitudes between columns are preserved.

    Parameters
    ----------
    scaler : object
        Scaler instance exposing ``fit`` and ``transform`` (and
        ``inverse_transform`` if ``inverse_transform`` of this wrapper is
        used), e.g. ``MinMaxScaler`` or ``StandardScaler``. It is fitted in
        place and stays reachable as the ``scaler`` attribute.
    '''
    def __init__(self, scaler):
        self.scaler = scaler

    def fit(self, X, y=None):
        '''Fit the wrapped scaler on all values of ``X`` treated as one feature.

        Parameters
        ----------
        X : 2-D array-like
            Data whose columns share one scale.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self : MultipleColumnScaler
            The fitted wrapper (not the inner scaler), so that
            ``fit(X).transform(X)``, ``fit_transform`` and pipelines dispatch
            to this class's column-aware methods.
        '''
        X = np.asarray(X)
        # Column-major flattening puts column 0 first, then column 1, ... -
        # the same ordering as concatenating the columns one by one, but
        # without re-copying the growing array on every iteration.
        self.scaler.fit(X.reshape(-1, 1, order='F'))
        return self

    def transform(self, X, y=None):
        '''Scale every value of ``X`` with the shared statistics.

        Returns a new array with the same shape as ``X``.
        '''
        X = np.asarray(X)
        # The wrapped scaler only knows a single feature, so feed it one long
        # column and fold the result back into the original layout.
        return self.scaler.transform(X.reshape(-1, 1)).reshape(X.shape)

    def inverse_transform(self, X, y=None):
        '''Undo ``transform``: map scaled values back to the original scale.

        Requires the wrapped scaler to implement ``inverse_transform``.
        Returns a new array with the same shape as ``X``.
        '''
        X = np.asarray(X)
        return self.scaler.inverse_transform(X.reshape(-1, 1)).reshape(X.shape)

In [8]:
# Build a shared min-max scaler mapping values into [-1, 1] and fit it on the
# training features only; X_scaler is bound to whatever fit() returns.
X_scaler = MultipleColumnScaler(MinMaxScaler(feature_range=(-1, 1))).fit(X_train)

# Apply the train-fitted scaling to every split, in train / val / test order.
X_train_scaled, X_val_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [9]:
# Inverse scaling: map each scaled split back to the original value range.
X_train_inv, X_val_inv, X_test_inv = (
    X_scaler.inverse_transform(scaled)
    for scaled in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Check the correctness of the scaling round trip: the mean absolute error
# between the original data and the inverse-transformed data is printed per split.
for _split_name, _raw, _restored in (
    ('Train', X_train, X_train_inv),
    ('Val', X_val, X_val_inv),
    ('Test', X_test, X_test_inv),
):
    _round_trip_error = mean_absolute_error(_raw.flatten(), _restored.flatten())
    print(f'{_split_name} set inverse transformation error: {_round_trip_error}')

Train set inverse transformation error: 2.5378206490315346e-17
Val set inverse transformation error: 2.483230499483388e-17
Test set inverse transformation error: 2.540636574741984e-17


### RandomForest Classifier

In [12]:
# Hyper-parameter candidates handed to the random grid search below.
params_grid = dict(
    n_estimators=[100, 200, 300, 400, 600, 800, 1000, 1200, 1400, 1600],
    max_depth=list(range(3, 22, 2)),  # odd depths 3, 5, ..., 21
    criterion=['gini', 'entropy'],
)

# Guard kept so the search can be run safely with multiprocessing on Windows.
if __name__ == '__main__':

    # Project helper from `modeling`; per the original note it selects the best
    # classifier using multiprocessing (a single worker is requested here).
    # NOTE(review): n_iter presumably is the number of sampled parameter
    # combinations and the returned score presumably is measured on the
    # validation split - confirm against `modeling.select_best_classifier`.
    best_model, best_scoring = select_best_classifier(
        estimator=RandomForestClassifier,
        params_grid=params_grid,
        n_iter=20,
        random_state=42,
        X_train=X_train_scaled,
        y_train=y_train,
        X_val=X_val_scaled,
        y_val=y_val,
        verbose=1,
        max_workers=1,
    )

    # Show the selected model together with its score.
    print(best_model, best_scoring)

["RandomForestClassifier{'n_estimators': 800, 'max_depth': 21, 'criterion': 'gini'}"
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=21, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)] 0.992
