# I.  Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, classification_report
from sklearn.feature_selection import RFECV, SequentialFeatureSelector, SelectFromModel

# II. Defaults

In [2]:
# Random seed; reused below as the classifiers' random_state (original note: "12 ears").
seed = 12
# Name of the target column read from the label frames.
y = 'OverallPoF'

# Keyword arguments shared by every cross-validated component
# (GridSearchCV, RFECV and SequentialFeatureSelector all receive them).
cv_params = dict(
    cv = 10,               # number of folds
    n_jobs = -1,           # run on all CPU cores
    scoring = 'f1_macro',
)

# Fixed LinearSVC arguments, unpacked into every classifier instance.
base_params = dict(
    dual = False,
    random_state = seed,
    max_iter = 23220451 * 12,  # very large iteration cap (presumably to avoid early stopping -- confirm)
    class_weight = 'balanced',
)

# Candidate values explored by the grid searches.
hyper_params = dict(
    penalty = [ 'l1', 'l2' ],
    C = [ 0.01, 0.1, 1, 10, 100 ],
)

# Directory of the sampling split to load; uncomment exactly one alternative.
sample = '00 - Dataset/03 - Sampling Splits/00 - Original/'
# sample = '00 - Dataset/03 - Sampling Splits/01 - NM2/'
# sample = '00 - Dataset/03 - Sampling Splits/02 - SMOTE/'
# sample = '00 - Dataset/03 - Sampling Splits/03 - SMOTEENN/'
# sample = '00 - Dataset/03 - Sampling Splits/04 - SMOTETomek/'

# III. Dataset

## Import

In [3]:
# Read the six CSV files of the chosen sampling split (features and labels
# for the train / validation / test partitions) in a single pass.
X_train, y_train, X_val, y_val, X_test, y_test = (
    pd.read_csv(sample + file_name)
    for file_name in (
        'X_train.csv',
        'y_train.csv',
        'X_val.csv',
        'y_val.csv',
        'X_test.csv',
        'y_test.csv',
    )
)

## Absorbance Only

In [4]:
# Keep only the absorbance columns, i.e. names that START with "fa" followed
# by at least one digit (e.g. 'fa280').
# DataFrame.filter(regex=...) matches with re.search, so the previous pattern
# r'fa[0-9]*' (zero or more digits, unanchored) kept every column whose name
# merely contained the substring "fa". Anchoring at the start and requiring a
# digit restricts the selection to the intended "fa<number>" columns.
absorbance_pattern = r'^fa[0-9]+'

X_train = X_train.filter(regex = absorbance_pattern)
X_val = X_val.filter(regex = absorbance_pattern)
X_test = X_test.filter(regex = absorbance_pattern)

## Scaling

In [5]:
# Learn the standardisation statistics from the training features only, then
# apply that same fitted scaler to all three splits. Each result is wrapped
# back into a DataFrame so the original column names are kept.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    pd.DataFrame(scaler.transform(frame), columns = frame.columns)
    for frame in (X_train, X_val, X_test)
)

# IV. Functions

In [6]:
def evaluate_pipeline(pipeline, tune_params, X_train = X_train, X_val = X_val, return_n = True):
    """Grid-search a feature-selection pipeline and print its macro recall.

    The pipeline is tuned with ``GridSearchCV`` (configured through the
    module-level ``cv_params``) against ``y_train[y]``. The best estimator's
    ``'f_selector'`` and ``'classifier'`` steps are then used to print the
    selected columns and the macro-averaged recall on the training and
    validation data.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Must contain a step named ``'f_selector'`` (providing ``get_support``
        and ``transform``) and a step named ``'classifier'`` (providing
        ``predict``).
    tune_params : dict
        Parameter grid forwarded to ``GridSearchCV(param_grid=...)``.
    X_train, X_val : pandas.DataFrame
        Feature frames. The defaults are bound when this ``def`` executes, so
        they are the module-level frames as they existed at that moment.
    return_n : bool, default True
        When True, return the number of selected features. When False, return
        ``None`` so a notebook cell ending in this call displays no value.

    Returns
    -------
    int or None
        Number of selected features, or ``None`` when ``return_n`` is False.
    """
    best_pipe = GridSearchCV(
        **cv_params,
        estimator = pipeline,
        param_grid = tune_params,
    ).fit(X_train, y_train[ y ]).best_estimator_

    f_selector = best_pipe[ 'f_selector' ]
    classifier = best_pipe[ 'classifier' ]

    # Boolean support mask -> names of the columns the selector kept.
    columns = X_train.columns[ f_selector.get_support() ]
    print('\nSelected number of features:')
    print(len(columns))
    print('\nThe selected features:')
    print(columns)

    # The classifier step was fitted on selector output, so each split is
    # reduced with the fitted selector before predicting. The label frames
    # are passed whole, exactly as they were loaded.
    for label, features, targets in (
        ('\nTraining Recall:', X_train, y_train),
        ('\nValidation Recall:', X_val, y_val),
    ):
        print(label, recall_score(
            y_true = targets,
            average = 'macro',
            y_pred = classifier.predict(f_selector.transform(features))
        ))

    # Previously `return_n` was ignored and the count was always returned.
    return len(columns) if return_n else None

# V. Feature Selection

## SelectFromModel

In [7]:
# SelectFromModel pipeline: an L1-penalised LinearSVC drives the feature
# selection, and a second LinearSVC classifies the retained columns.
sfm_pipe = Pipeline(steps = [
    (
        'f_selector',
        SelectFromModel(estimator = LinearSVC(penalty = 'l1', **base_params)),
    ),
    (
        'classifier',
        LinearSVC(**base_params),
    ),
])

In [8]:
# sfm_pipe.get_params()

In [9]:
# Tune only the final classifier; keep the number of features that
# SelectFromModel retained for later reuse.
sfm_n_features = evaluate_pipeline(
    sfm_pipe,
    {
        # 'f_selector__estimator__C': hyper_params[ 'C' ],
        'classifier__penalty': hyper_params[ 'penalty' ],
        'classifier__C': hyper_params[ 'C' ],
    },
)


Selected number of features:
24

The selected features:
Index(['fa280', 'fa363', 'fa408', 'fa727', 'fa771', 'fa943', 'fa1296',
       'fa1542', 'fa1781', 'fa1834', 'fa2310', 'fa2911', 'fa2996', 'fa3174',
       'fa3363', 'fa3775', 'fa4000', 'fa4237', 'fa4756', 'fa4896', 'fa5495',
       'fa5656', 'fa5993', 'fa7550'],
      dtype='object')

Training Recall: 0.9272727272727272

Validation Recall: 0.8611111111111112


## RFE

In [10]:
# Recursive feature elimination with cross-validation (configured by the
# shared cv_params), followed by a separate LinearSVC classifier.
rfe_pipe = Pipeline(steps = [
    (
        'f_selector',
        RFECV(estimator = LinearSVC(**base_params), **cv_params),
    ),
    (
        'classifier',
        LinearSVC(**base_params),
    ),
])

In [11]:
# rfe_pipe.get_params()

In [12]:
# Tune only the final classifier; keep the number of features RFECV retained.
rfe_n_features = evaluate_pipeline(
    rfe_pipe,
    {
        # 'f_selector__estimator__C': hyper_params[ 'C' ],
        # 'f_selector__estimator__penalty': hyper_params[ 'penalty' ],
        'classifier__penalty': hyper_params[ 'penalty' ],
        'classifier__C': hyper_params[ 'C' ],
    },
)

26 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\dududev\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\envs\dududev\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\ProgramData\Anaconda3\envs\dududev\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\ProgramData\Anaconda3\envs\dududev\lib\site-packages\joblib\memory.py", line 349, in 


Selected number of features:
62

The selected features:
Index(['fa226', 'fa257', 'fa280', 'fa297', 'fa363', 'fa385', 'fa408', 'fa432',
       'fa471', 'fa514', 'fa561', 'fa577', 'fa594', 'fa648', 'fa707', 'fa727',
       'fa749', 'fa771', 'fa793', 'fa840', 'fa865', 'fa917', 'fa943', 'fa971',
       'fa1029', 'fa1090', 'fa1122', 'fa1155', 'fa1189', 'fa1296', 'fa1334',
       'fa1373', 'fa1498', 'fa1542', 'fa1587', 'fa1681', 'fa1731', 'fa1781',
       'fa1834', 'fa1943', 'fa2058', 'fa2911', 'fa2996', 'fa3174', 'fa3267',
       'fa3775', 'fa3886', 'fa4000', 'fa4117', 'fa4237', 'fa4362', 'fa4489',
       'fa4621', 'fa4756', 'fa4896', 'fa5187', 'fa5339', 'fa5495', 'fa5656',
       'fa5993', 'fa6168', 'fa7550'],
      dtype='object')

Training Recall: 0.7398601398601399

Validation Recall: 0.8277777777777777


## SFS

In [13]:
# Sequential feature selection (configured by the shared cv_params),
# followed by a separate LinearSVC classifier.
sfs_pipe = Pipeline(steps = [
    (
        'f_selector',
        SequentialFeatureSelector(estimator = LinearSVC(**base_params), **cv_params),
    ),
    (
        'classifier',
        LinearSVC(**base_params),
    ),
])

In [14]:
# sfs_pipe.get_params()

In [15]:
# Sequential selection is pinned to the feature count that SelectFromModel
# chose (sfm_n_features); only the final classifier is tuned.
evaluate_pipeline(
    sfs_pipe,
    {
        # 'f_selector__estimator__C': hyper_params[ 'C' ],
        # 'f_selector__estimator__penalty': hyper_params[ 'penalty' ],
        'f_selector__n_features_to_select': [ sfm_n_features ],
        'classifier__penalty': hyper_params[ 'penalty' ],
        'classifier__C': hyper_params[ 'C' ],
    },
    return_n = False,
)


Selected number of features:
24

The selected features:
Index(['fa226', 'fa667', 'fa686', 'fa707', 'fa971', 'fa1000', 'fa1090',
       'fa1122', 'fa1155', 'fa1224', 'fa1633', 'fa1681', 'fa1731', 'fa1887',
       'fa1943', 'fa2378', 'fa2593', 'fa2669', 'fa2747', 'fa2828', 'fa2911',
       'fa2996', 'fa3084', 'fa3174'],
      dtype='object')

Training Recall: 0.8475524475524476

Validation Recall: 0.8222222222222222


24