In [7]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

# hyperparameter permutations
SEED = 42    # random state seed for consistent results
FOLDS = 5    # number of folds for K-fold cross validation

DUAL = [False, True] # WHEN n_samples > n_features DUAL=FALSE
LOSSES = ['squared_hinge','hinge']
PENALTIES = ['l1','l2']
SOLVERS = ['lbfgs'] # 'newton-cg','sag','saga',
error_coefs = np.logspace(-0.01, 2, 10)

KFOLD = KFold(n_splits=FOLDS, random_state=SEED)

#### Model selection using subset of original dataset

In [2]:
mini_X = np.loadtxt("./Datasets/mini_train_x.csv", delimiter=",")
mini_y = np.loadtxt("./Datasets/mini_train_y.csv", delimiter=",")

mini_X_train, mini_X_test, mini_y_train, mini_y_test = train_test_split(mini_X, mini_y, 
                                                                        test_size=0.3,
                                                                        random_state=SEED)

#### Helper methods
Methods to generate all possible permutations of HyperParameters 
for LinearSVMs and Logistic Regressor:

In [9]:
def get_svm_hp_permutations():
    clfs = []
    for d in DUAL:
        for l in LOSSES:
            for p in PENALTIES:
                # ignore unvalid combinations
                if not d and p == 'l2': continue
                if p == 'l1' and l == 'hinge': continue
                if d and p == 'l1' and l == 'squared_hinge': continue

                for c in error_coefs:
                    clfs.append(LinearSVC(C=c,dual=d,loss=l,penalty=p))
    
    return clfs

def get_logreg_hp_permutations():
    clfs = []
#     for d in DUAL:
    for p in PENALTIES:
        for s in SOLVERS:
            # ignore unvalid combinations
#             if not d and p == 'l2': continue
            if s != 'saga' and p == 'l1': continue

            for c in error_coefs:
                clfs.append(LogisticRegression(C=c,penalty=p,solver=s,n_jobs=8))
    
    return clfs

#### Base and dummy clf performances for reference

In [4]:
# dummy clf performance
random_clf = DummyClassifier(random_state=42)
result = cross_val_score(random_clf, mini_X_train, mini_y_train, cv=KFOLD, scoring='f1_micro')
print('Random clf performance: {:.4f}'.format(result.mean()))

# Base performance (default HP)
result = cross_val_score(LinearSVC(), mini_X_train, mini_y_train, cv=KFOLD, scoring='f1_micro')
print('LinearSVM base performance: {:.4f}'.format(result.mean()))

result = cross_val_score(LogisticRegression(), mini_X_train, mini_y_train, cv=KFOLD, scoring='f1_micro')
print('LogReg base performance: {:.4f}'.format(result.mean()))

Random clf performance: 0.0857
LinearSVM base performance: 0.1000
LogReg base performance: 0.1000


#### Perform Model Selection for LinearSM

In [5]:
scores = []
# create preprocessing pipeline
for specific_svm_permutation in get_svm_hp_permutations():
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('clf', specific_svm_permutation))
    model = Pipeline(estimators)
    result = cross_val_score(model, mini_X_train, mini_y_train, cv=KFOLD, scoring='f1_micro')

    scores.append((specific_svm_permutation, result.mean()))
    
best_svm, max_train = max(scores,key=itemgetter(1)) 
print('Best SVM: \n{}'.format(best_svm))
print('TRAIN SCORE: {:.4f}'.format(max_train))

Best SVM: 
LinearSVC(C=0.97723722095581067, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)
TRAIN SCORE: 0.1429


#### Perform Model Selection for LogisticRegression

In [10]:
scores = []
# create preprocessing pipeline
for specific_logreg_permutation in get_logreg_hp_permutations():
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('clf', specific_logreg_permutation))
    model = Pipeline(estimators)
    result = cross_val_score(model, mini_X_train, mini_y_train, cv=KFOLD, scoring='f1_micro')

    scores.append((specific_logreg_permutation, result.mean()))
    
best_svm, max_train = max(scores,key=itemgetter(1)) 

print('Best SVM: \n{}'.format(best_svm))
print('TRAIN SCORE: {:.4f}'.format(max_train))

Best SVM: 
LogisticRegression(C=0.97723722095581067, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=8, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
TRAIN SCORE: 0.1143
