In [2]:
# make the directory two levels above the working directory importable (needed for `common.utils`)
import sys
from pathlib import Path
# Resolve the directory two levels above the current working directory and
# register it on sys.path once, so re-running this cell does not add duplicates.
project_path = str(Path.cwd().parent.parent.resolve())
already_registered = project_path in sys.path
if not already_registered:
    sys.path.append(project_path)
    
# imports
from common.utils import get_datasets, X_TRAIN, Y_TRAIN, X_TEST, Y_TEST

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
# Load the evaluation splits. Each element is indexed further down with
# X_TRAIN / Y_TRAIN / X_TEST / Y_TEST.
# NOTE(review): n_splits=5 presumably yields five train/test folds — confirm in common.utils.
datasets = get_datasets(n_splits=5)

In [4]:
# Single seed for every stochastic base learner, so the accuracies printed by
# the ensemble cells can be reproduced after Restart Kernel -> Run All.
RANDOM_STATE = 42

# (name, estimator) pairs used as base learners by the ensemble cells.
# Names are ensemble keys; keep them unique and free of double underscores.
# Seeds are passed only where scikit-learn uses randomness:
#   * liblinear logistic regression (shuffles the data),
#   * random forests (per-split feature sub-sampling).
# lbfgs logistic regression and SVC without probability estimates are deterministic.
estimators = [
    ("Logistic deg 2 l1", Pipeline([
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("clf", LogisticRegression(
            C=1, max_iter=200, penalty="l1", solver="liblinear", random_state=RANDOM_STATE
        )),
    ])),
    ("Logistic deg 2 l2", Pipeline([
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("clf", LogisticRegression(C=0.1, max_iter=200, penalty="l2", solver="lbfgs")),
    ])),
    ("Logistic l1", LogisticRegression(
        C=1, max_iter=200, penalty="l1", solver="liblinear", random_state=RANDOM_STATE
    )),
    ("Logistic l2", LogisticRegression(C=0.1, max_iter=200, penalty="l2", solver="lbfgs")),
    ("RFC 1", RandomForestClassifier(
        bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=3,
        n_estimators=200, random_state=RANDOM_STATE
    )),
    ("RFC 2", RandomForestClassifier(
        bootstrap=False, max_depth=50, min_samples_leaf=1, min_samples_split=2,
        n_estimators=500, random_state=RANDOM_STATE
    )),
    ("RFC 3", RandomForestClassifier(
        bootstrap=False, max_depth=30, min_samples_leaf=1, min_samples_split=2,
        n_estimators=500, random_state=RANDOM_STATE
    )),
    ("SVC 1", SVC(C=1, gamma="scale")),
    ("SVC 2", SVC(C=10, gamma="scale")),
    ("SVC 3", SVC(C=10, gamma="auto")),
]

In [5]:
# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base learners listed in `estimators`.
votingClassifier = VotingClassifier(estimators=estimators, voting="hard")

In [6]:
# Refit the voting ensemble on every split and report its accuracy on that
# split's test portion.
for split in datasets:
    votingClassifier.fit(split[X_TRAIN], split[Y_TRAIN])
    split_predictions = votingClassifier.predict(split[X_TEST])
    split_accuracy = accuracy_score(split[Y_TEST], split_predictions)
    print(f"Accuracy score: {split_accuracy}")

Accuracy score: 0.7672316384180791
Accuracy score: 0.7943502824858757
Accuracy score: 0.7887005649717514
Accuracy score: 0.7853107344632768
Accuracy score: 0.7794117647058824


In [8]:
# Meta-learner that combines the base learners' predictions.
# It is stochastic (subsample < 1 draws a random row subset per boosting stage,
# max_features="sqrt" draws random feature subsets), so its seed is pinned to
# make the meta-learner's contribution to the scores repeatable.
final_estimator = GradientBoostingClassifier(
    learning_rate=0.01,
    subsample=0.8,
    min_samples_leaf=2,
    max_features="sqrt",
    random_state=42,
)

# Built once outside the loop: StackingClassifier.fit() clones the base
# learners and the final estimator, so each refit starts from unfitted copies.
# NOTE: `reg` is a classifier despite its name; the name is kept so any other
# cell that refers to it keeps working.
reg = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)

# Refit the stack on every split and report its accuracy on that split's test portion.
for dataset in datasets:
    reg.fit(dataset[X_TRAIN], dataset[Y_TRAIN])
    y_pred = reg.predict(dataset[X_TEST])
    print(f"Accuracy: {accuracy_score(dataset[Y_TEST], y_pred)}")

Accuracy: 0.7615819209039548
Accuracy: 0.7898305084745763
Accuracy: 0.7819209039548023
Accuracy: 0.7875706214689265
Accuracy: 0.7794117647058824
