# Pipelines and models

In [1]:
# Make the parent of the current working directory importable, so that
# project-level packages can be imported from this notebook.
import sys
from pathlib import Path

# cwd() is a classmethod, so it is called on the class rather than an instance.
project_path = str(Path.cwd().parent.resolve())
if project_path not in sys.path:
    sys.path.append(project_path)

# imports
from common.utils import *

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset, then separate the label column from the feature columns.
# get_data() and split() are not defined in this notebook; presumably they
# come from the `common.utils` star import above.
data = get_data()

y = data["Target"]
X = data.drop(columns="Target")

# split() hands back six pieces, unpacked here as train / validation / test
# features followed by the matching labels.
X_train, X_val, X_test, y_train, y_val, y_test = split(X, y)

# Last expression of the cell: preview the first rows of the full frame.
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,single,2nd phase - general contingent,6th choice,Animation and Multimedia Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 3rd Cycle,Other - 11th Year,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,single,International student (bachelor),2nd choice,Tourism,Daytime,Secondary education,160.0,Portuguese,Secondary Education - 12th Year or Eq.,Higher Ed - Degree,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,single,1st phase - general contingent,6th choice,Communication Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 1st Cycle (4th/5th),...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,single,2nd phase - general contingent,3rd choice,Journalism and Communication,Daytime,Secondary education,122.0,Portuguese,Basic Ed 2nd Cycle (6th–8th),Basic Ed 1st Cycle (4th/5th),...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,married,Over 23 years old,2nd choice,Social Service (evening attendance),Evening,Secondary education,100.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 2nd Cycle (6th–8th),...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [3]:
# Column routing is driven purely by dtype.
# NOTE(review): a column whose dtype matches neither selection below is not
# listed in any transformer; with ColumnTransformer's default `remainder` it
# would be left out of the model input -- confirm no such columns exist.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.to_list()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.to_list()

# Numeric columns: fill gaps with the median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # alternative: strategy="mean"
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categoric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # alternative: strategy="constant"
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categoric_pipeline, categorical_features),
    ]
)

def train_and_predict(model, should_print=False):
    """Fit `model` behind the preprocessing step and score it on every split.

    The estimator is wrapped in a two-step pipeline (preprocessing, then
    classifier), fitted on the training split, and evaluated with accuracy on
    the train, validation and test splits.

    Parameters
    ----------
    model : estimator
        Classifier used as the final pipeline step. The instance passed in is
        the one that gets fitted.
    should_print : bool, default False
        When True, print the model repr followed by the three accuracies.

    Returns
    -------
    dict
        Keys "Model" (class name of `model`), "Train accuracy",
        "Validation accuracy" and "Test accuracy".

    Notes
    -----
    Reads the notebook-level names `preprocessor`, `X_train`, `y_train`,
    `X_val`, `y_val`, `X_test` and `y_test`.

    NOTE(review): the test split is scored for every candidate model; if these
    numbers are used to choose between models, the test accuracy is no longer
    an unbiased estimate.
    """
    # Local import so this cell does not depend on editing the import cell.
    from sklearn.base import clone

    pipeline = Pipeline([
        # A pipeline fits its steps in place. Cloning gives every call its own
        # unfitted copy, so the notebook-level `preprocessor` stays a pristine
        # template instead of carrying fitted state from one model to the next.
        ("preprocessing", clone(preprocessor)),
        ("classifier", model),
    ])
    pipeline.fit(X_train, y_train)

    # Insertion order drives both the print order and the returned key order.
    splits = {
        "Train": (X_train, y_train),
        "Validation": (X_val, y_val),
        "Test": (X_test, y_test),
    }
    accuracies = {
        f"{split_name} accuracy": accuracy_score(targets, pipeline.predict(features))
        for split_name, (features, targets) in splits.items()
    }

    if should_print:
        print(f"===== {model} =====")
        for label, value in accuracies.items():
            print(f"\t{label}: {value}")

    return {"Model": model.__class__.__name__, **accuracies}

In [4]:
# Collects one metrics dict per evaluated model (appended by the cells below).
# Re-run this cell before re-running any model cell; otherwise the repeated
# append leaves duplicate rows in the list.
results = []

In [5]:
# Logistic regression, with the solver allowed up to 200 iterations.
result = train_and_predict(LogisticRegression(max_iter=200), should_print=True)
results.append(result)

===== LogisticRegression(max_iter=200) =====
	Train accuracy: 0.8133074935400517
	Validation accuracy: 0.7984934086629002
	Test accuracy: 0.7565872020075283


In [6]:
# Decision tree with every hyperparameter left at its default.
# NOTE(review): no random_state is passed, so the fitted tree (and the scores
# printed below) may not be exactly reproducible between runs -- consider
# fixing a seed.
result = train_and_predict(DecisionTreeClassifier(), should_print=True)
results.append(result)

===== DecisionTreeClassifier() =====
	Train accuracy: 1.0
	Validation accuracy: 0.71939736346516
	Test accuracy: 0.6587202007528231


In [7]:
# Support vector classifier with an RBF kernel.
# The recorded output prints this model as `SVC()`, which suggests 'rbf' is
# already the default kernel -- the argument is kept for explicitness.
result = train_and_predict(SVC(kernel='rbf'), should_print=True)
results.append(result)

===== SVC() =====
	Train accuracy: 0.8394702842377261
	Validation accuracy: 0.8060263653483992
	Test accuracy: 0.7478042659974906


In [8]:
# Comparison table: one row per appended result, one column per dict key.
pd.DataFrame(results)

Unnamed: 0,Model,Train accuracy,Validation accuracy,Test accuracy
0,LogisticRegression,0.813307,0.798493,0.756587
1,DecisionTreeClassifier,1.0,0.719397,0.65872
2,SVC,0.83947,0.806026,0.747804
