In [12]:
# add the project root to sys.path so project-local packages can be imported
import sys
from pathlib import Path
# Two directory levels above the current working directory, as an absolute string.
project_path = str(Path.cwd().parent.parent.resolve())
# Append only once so re-running the cell does not pile up duplicate entries.
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

# imports
from common.utils import get_data, get_preprocessor

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split,  cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [13]:
# Load the dataset and separate the "Target" column from the feature columns.
data = get_data()
y = data["Target"]
X = data.drop(columns="Target")

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=6,
)

# Column groups handed to the preprocessor, derived from the feature dtypes.
numerical_column_names = list(X.select_dtypes(include="number").columns)
categorical_column_names = list(X.select_dtypes(include="object").columns)

# Seeded SMOTE over-sampler used as the resampling step of every pipeline.
smt = SMOTE(random_state=16)

def get_pipeline(classifier):
    """Wrap ``classifier`` in a preprocessing -> SMOTE -> classifier pipeline.

    Args:
        classifier: The estimator placed as the final "classifier" step.

    Returns:
        An ``ImbPipeline`` with the steps "preprocessing", "smt" and
        "classifier", in that order. The "smt" step is the module-level
        ``smt`` object, so every pipeline built here references the same
        sampler instance.
    """
    preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)
    steps = [
        ("preprocessing", preprocessor),
        ("smt", smt),
        ("classifier", classifier),
    ]
    return ImbPipeline(steps)

In [14]:
# Base learners, each wrapped by get_pipeline (preprocessing -> SMOTE -> model).
# The random forest and SVC's probability estimates are stochastic, so both are
# seeded to make the scores below repeatable after a kernel restart.
rfc = get_pipeline(RandomForestClassifier(n_estimators=300, random_state=16))
lr = get_pipeline(LogisticRegression(max_iter=1000))
svc = get_pipeline(SVC(probability=True, random_state=16))

# (name, estimator) pairs consumed by the voting and stacking ensembles.
estimators = [("rfc", rfc), ("lr", lr), ("svc", svc)]

In [15]:
# Hard voting: the ensemble predicts the class label chosen by the majority
# of the base pipelines.
votingClassifier = VotingClassifier(voting="hard", estimators=estimators)

In [16]:
# Report mean and standard deviation of 3-fold cross-validated accuracy for each
# base pipeline and for the hard-voting ensemble.
# NOTE(review): scoring uses the full X, y, which includes the rows held out as
# X_test above — confirm this is intended, or switch to X_train / y_train.
for classifier, label in [
    (rfc, "Random Forest Classifier"),
    (lr, "Logistic Regression"),
    (svc, "SVC"),
    (votingClassifier, "Ensemble"),
]:
    scores = cross_val_score(classifier, X, y, scoring="accuracy", cv=3)
    print(f"Accuracy: {scores.mean():0.4f}   (+/- {scores.std():0.4f})   [{label}]")

Accuracy: 0.7710   (+/- 0.0022)   [Random Forst Classifier]
Accuracy: 0.7570   (+/- 0.0045)   [Logistic Regression]
Accuracy: 0.7615   (+/- 0.0032)   [SVC]
Accuracy: 0.7715   (+/- 0.0069)   [Ensemble]


In [17]:
# Meta-learner for the stacking ensemble. Row subsampling (subsample=0.8) and
# per-split feature subsampling (max_features="sqrt") make fitting stochastic,
# so the seed is fixed to keep the reported accuracy repeatable.
final_estimator = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    subsample=0.8,
    min_samples_leaf=2,
    max_features="sqrt",
    random_state=16,
)

# Stack the base pipelines under the gradient-boosting meta-learner and fit on
# the training split (fit returns the fitted estimator itself).
reg = StackingClassifier(estimators=estimators, final_estimator=final_estimator).fit(
    X_train, y_train
)

# Score the fitted ensemble on the held-out split.
y_pred = reg.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.7684
