# Pipelines and models

In [1]:
# set up working catalog
import sys
from pathlib import Path
sys.path.append(str(Path("..")))

# imports
from common.read_data import read_data

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = read_data()

X = data.drop('Target', axis=1)
y = data["Target"]

X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.3, random_state=30)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.6, random_state=30)

In [3]:
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),          # strategy="mean"
    ("scaler", StandardScaler()), 
])

categoric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),         # strategy="constant"
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categoric_pipeline, categorical_features)
    ]
)

In [4]:
def train(model):
    pipeline = Pipeline([
        ("preprocessing", preprocessor),
        ("classifier", model)
    ])
    
    pipeline.fit(X_train, y_train)
    
    y_pred_train = pipeline.predict(X_test)
    y_pred_val = pipeline.predict(X_test)
    y_pred_test = pipeline.predict(X_test)
    
    accuracy_train = accuracy_score(y_test, y_pred_train)
    accuracy_validation = accuracy_score(y_test, y_pred_val)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    
    print(f"===== {model} =====")
    print(f"\tTrain accuracy: {accuracy_train}")
    print(f"\tValidation accuracy: {accuracy_validation}")
    print(f"\tTest accuracy: {accuracy_test}")
    
    return {
        "Model": model.__class__.__name__,
        "Train accuracy": accuracy_train,
        "Validation accuracy": accuracy_validation,
        "Test accuracy": accuracy_test
    }

In [5]:
results = []

In [6]:
result = train(LogisticRegression(max_iter=200))
results.append(result)

===== LogisticRegression(max_iter=200) =====
	Train accuracy: 0.7565872020075283
	Validation accuracy: 0.7565872020075283
	Test accuracy: 0.7565872020075283


In [7]:
result = train(DecisionTreeClassifier())
results.append(result)

===== DecisionTreeClassifier() =====
	Train accuracy: 0.6323713927227101
	Validation accuracy: 0.6323713927227101
	Test accuracy: 0.6323713927227101


In [8]:
result = train(SVC(kernel='rbf'))
results.append(result)

===== SVC() =====
	Train accuracy: 0.7478042659974906
	Validation accuracy: 0.7478042659974906
	Test accuracy: 0.7478042659974906


In [9]:
pd.DataFrame(results)

Unnamed: 0,Model,Train accuracy,Validation accuracy,Test accuracy
0,LogisticRegression,0.756587,0.756587,0.756587
1,DecisionTreeClassifier,0.632371,0.632371,0.632371
2,SVC,0.747804,0.747804,0.747804
