# 1.  Libraries

In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.feature_selection import RFECV, SequentialFeatureSelector, SelectFromModel

# 2. Defaults

In [2]:
seed = 12 # 12 ears
y = 'OverallPoF' # target

# cv parameters
cv_params = {
    'cv': 3, # k-fold
    'n_jobs': -1, # use all cpu cores
    'scoring': 'recall_macro',
    'return_train_score': True
}

# model parameters
base_params = {
    'dual': False,
    'random_state': seed,
    'max_iter': 23220451 * 12,
    'class_weight': 'balanced'
}
hyper_params = {
    'penalty': [ 'l1', 'l2' ],
    'C': [ 0.01, 0.1, 1, 10, 100 ]
}

# sample choices
sample = '00 - Dataset/03 - Sampling Splits/00 - Original/'
# sample = '00 - Dataset/03 - Sampling Splits/01 - NM2/'
# sample = '00 - Dataset/03 - Sampling Splits/02 - SMOTE/'
# sample = '00 - Dataset/03 - Sampling Splits/03 - SMOTEENN/'
# sample = '00 - Dataset/03 - Sampling Splits/04 - SMOTETomek/'

# 3. Dataset

## Import

In [3]:
X_train = pd.read_csv(sample + 'X_train.csv').filter(regex = r'fa[0-9]*')
y_train = pd.read_csv(sample + 'y_train.csv')[ y ]
X_val = pd.read_csv(sample + 'X_val.csv').filter(regex = r'fa[0-9]*')
y_val = pd.read_csv(sample + 'y_val.csv')[ y ]
X_test = pd.read_csv(sample + 'X_test.csv').filter(regex = r'fa[0-9]*')
y_test = pd.read_csv(sample + 'y_test.csv')[ y ]

## Scaling

In [4]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns = X_val.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

# 4. Functions

In [5]:
def refit_strategy(cv_results):
    splits = set([ 
        k.split('_')[ 0 ] for k in cv_results.keys() if k.startswith('split') 
    ])
    n_folds = len(cv_results[ 'params' ])
    significance = [ ]
    
    for i in range(n_folds):
        train_scores = [ ]
        test_scores = [ ]
        
        for split in splits:
            train_scores.append(cv_results[ split + '_train_score' ][ i ])
            test_scores.append(cv_results[ split + '_test_score' ][ i ])
        
        _, pvalue = stats.ttest_ind(
            train_scores, 
            test_scores,
            equal_var = False
        )
        
        if (pvalue > 0.05):
            significance.append(False)
        else:
            significance.append(True)
        
    chosen_idx = list(cv_results[ 'rank_test_score' ]).index(1)
    for i in range(len(significance) - 1, -1, -1):
        if (significance[ i ]):
            continue
        chosen_idx = i
        break
        
    print(cv_results[ 'mean_train_score' ])
    print(cv_results[ 'std_train_score' ])
    print(cv_results[ 'mean_test_score' ])
    print(cv_results[ 'std_test_score' ])
    print(significance)
    return chosen_idx

def evaluate_pipeline(pipeline, tune_params, X_train = X_train, X_val = X_val, X_test = X_test):
    grid = GridSearchCV(
        **cv_params,
        estimator = pipeline,
        refit = refit_strategy,
        param_grid = tune_params,
    )
    grid.fit(X_train, y_train)
    # print(grid.best_score_)
    
    f_selector = grid.best_estimator_[ 'f_selector' ]
    classifier = grid.best_estimator_[ 'classifier' ]
    
    columns = X_train.columns[ f_selector.get_support() ]
    
    X_train = f_selector.transform(X_train)
    X_val = f_selector.transform(X_val)
    X_test = f_selector.transform(X_test)
    
    print(f'\nSelected number of features:')
    print(len(columns))
    
    print(f'\nThe selected features:')
    print(list(columns))
    
    print(f'\nOptimum parameters:')
    print(classifier.get_params())
    
    print(f'\nTraining AUC:', roc_auc_score(
        y_true = y_train, 
        y_score = classifier.decision_function(X_train)
    ))
    print(f'\nValidation AUC:', roc_auc_score(
        y_true = y_val, 
        y_score = classifier.decision_function(X_val)
    ))
    print(f'\nTesting AUC:', roc_auc_score(
        y_true = y_test, 
        y_score = classifier.decision_function(X_test)
    ))

# 5. Pipelines

## 5.1. L1

In [6]:
l1_pipe = Pipeline([
  (
      'f_selector', 
      SelectFromModel(
          estimator = LinearSVC(**base_params, penalty = 'l1')
      )
  ),
  (
      'classifier', 
      LinearSVC(**base_params)
  )
])

In [7]:
# l1_pipe.get_params()

In [8]:
l1_pipe_params = {
    'classifier__C': hyper_params[ 'C' ],
    'classifier__penalty': hyper_params[ 'penalty' ]
}

## 5.2. L2

In [9]:
l2_pipe = Pipeline([
  (
      'f_selector', 
      SelectFromModel(
          estimator = LinearSVC(**base_params),
          threshold = '1.5 * mean'
      )
  ),
  (
      'classifier', 
      LinearSVC(**base_params)
  )
])

In [10]:
# l2_pipe.get_params()

In [11]:
l2_pipe_params = {
    'classifier__C': hyper_params[ 'C' ],
    'classifier__penalty': hyper_params[ 'penalty' ]
}

# 6. Fitting the Pipelines

## 6.1. L1

In [12]:
evaluate_pipeline(
    pipeline = l1_pipe,
    tune_params = l1_pipe_params
)

[0.73941778 0.84558814 0.87209175 0.93657604 0.969215   0.97113071
 1.         0.99035463 1.         1.        ]
[0.00382343 0.02327263 0.03458981 0.01686702 0.00551538 0.00485712
 0.         0.00723963 0.         0.        ]
[0.7240071  0.81911935 0.8108137  0.8137836  0.80990763 0.78222214
 0.73495545 0.71114593 0.74640718 0.71114593]
[0.01708939 0.04755865 0.0671038  0.05446248 0.04197524 0.04507883
 0.08872216 0.10268102 0.08689437 0.10268102]
[False, False, False, False, True, True, False, False, False, False]

Selected number of features:
24

The selected features:
['fa280', 'fa363', 'fa408', 'fa727', 'fa771', 'fa943', 'fa1296', 'fa1542', 'fa1781', 'fa1834', 'fa2310', 'fa2911', 'fa2996', 'fa3174', 'fa3363', 'fa3775', 'fa4000', 'fa4237', 'fa4756', 'fa4896', 'fa5495', 'fa5656', 'fa5993', 'fa7550']

Optimum parameters:
{'C': 100, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 278645412, 'multi_class': 'o

## 6.2. L2

In [13]:
evaluate_pipeline(
    pipeline = l2_pipe,
    tune_params = l2_pipe_params
)

[0.7436597  0.83749761 0.84182355 0.9066687  0.96581128 0.97493986
 0.99806202 0.99420832 1.         1.        ]
[0.0138449  0.02688273 0.00965048 0.0428711  0.01651996 0.0120119
 0.00274072 0.00474718 0.         0.        ]
[0.72013113 0.77989404 0.79152195 0.80224378 0.76293038 0.77068232
 0.76310656 0.74317301 0.74308492 0.75092495]
[0.01701789 0.08658061 0.07812826 0.05449599 0.02528693 0.02751592
 0.05930889 0.04019737 0.0437018  0.05084785]
[False, False, False, False, True, True, True, True, True, True]

Selected number of features:
24

The selected features:
['fa280', 'fa408', 'fa432', 'fa514', 'fa707', 'fa727', 'fa771', 'fa793', 'fa1296', 'fa1334', 'fa1498', 'fa1542', 'fa2996', 'fa3775', 'fa3886', 'fa4000', 'fa4117', 'fa4237', 'fa4362', 'fa4756', 'fa4896', 'fa5495', 'fa5656', 'fa5993']

Optimum parameters:
{'C': 0.1, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 278645412, 'multi_class': 'ovr', 'p