# Pipelines and models

In [9]:
# add the parent directory to the import path (for the `common` package imported below)
import sys
from pathlib import Path
sys.path.append(str(Path("..")))

# imports
from common.utils import *

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Load the dataset, then separate the "Target" label column from the predictors.
data = get_data()

y = data["Target"]
X = data.drop(columns="Target")

# split() hands back six pieces: train / validation / test for X, then for y.
X_train, X_val, X_test, y_train, y_val, y_test = split(X, y)

In [11]:
# Column groups are chosen by dtype on the full feature frame X.
# NOTE(review): a column whose dtype is in neither list (e.g. bool, int32,
# datetime) is routed to no transformer; with ColumnTransformer's default
# remainder it would be dropped without warning -- confirm against the data.
numeric_features = (
    X.select_dtypes(include=["int64", "float64"])
    .columns
    .tolist()
)
categorical_features = (
    X.select_dtypes(include=["object", "category"])
    .columns
    .tolist()
)

# Numeric columns: fill gaps with the median, then standardise.
# (alternative imputation considered: strategy="mean")
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
# (alternative imputation considered: strategy="constant")
categoric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categoric_pipeline, categorical_features),
])

def train_and_predict(model, should_print=False):
    """Fit ``model`` behind the shared preprocessor and score it on every split.

    The model is chained after the module-level ``preprocessor``, fitted on
    ``X_train`` / ``y_train``, and then used to predict the train, validation
    and test splits. Accuracy is computed for each of the three.

    Note: the pipeline holds references to its steps rather than copies, so
    both ``model`` and the module-level ``preprocessor`` are fitted as a side
    effect of calling this function.

    Args:
        model: Unfitted scikit-learn classifier used as the final step.
        should_print: When true, print a short per-split accuracy report
            headed by the model's repr.

    Returns:
        dict with the keys ``"Model"`` (the classifier's class name),
        ``"Train accuracy"``, ``"Validation accuracy"`` and
        ``"Test accuracy"``.
    """
    fitted = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("classifier", model),
    ]).fit(X_train, y_train)

    # Same order as the report and the returned columns: train, validation, test.
    splits = (
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )
    scores = {
        f"{split_name} accuracy": accuracy_score(targets, fitted.predict(features))
        for split_name, features, targets in splits
    }

    if should_print:
        print(f"===== {model} =====")
        for label, value in scores.items():
            print(f"\t{label}: {value}")

    return {"Model": model.__class__.__name__, **scores}

In [12]:
# Accumulates the metrics dict returned by each train_and_predict call below.
# Re-run this cell before re-running the model cells; otherwise each re-run
# appends a duplicate row.
results = []

In [13]:
# Linear baseline: logistic regression with the solver capped at 200 iterations.
result = train_and_predict(
    model=LogisticRegression(max_iter=200),
    should_print=True,
)
results.append(result)

===== LogisticRegression(max_iter=200) =====
	Train accuracy: 0.8133074935400517
	Validation accuracy: 0.7984934086629002
	Test accuracy: 0.7565872020075283


In [14]:
# Decision tree with no depth or leaf-size limits.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, tied splits can be resolved differently and
# the scores can change from one run to the next.
result = train_and_predict(
    DecisionTreeClassifier(random_state=42),
    should_print=True,
)
results.append(result)

===== DecisionTreeClassifier() =====
	Train accuracy: 1.0
	Validation accuracy: 0.71939736346516
	Test accuracy: 0.6599749058971142


In [15]:
# Support vector classifier with an RBF kernel.
result = train_and_predict(
    model=SVC(kernel="rbf"),
    should_print=True,
)
results.append(result)

===== SVC() =====
	Train accuracy: 0.8394702842377261
	Validation accuracy: 0.8060263653483992
	Test accuracy: 0.7478042659974906


In [16]:
# Summary table: one row per metrics dict appended to `results`, in run order.
pd.DataFrame(results)

Unnamed: 0,Model,Train accuracy,Validation accuracy,Test accuracy
0,LogisticRegression,0.813307,0.798493,0.756587
1,DecisionTreeClassifier,1.0,0.719397,0.659975
2,SVC,0.83947,0.806026,0.747804
