In [42]:
import numpy as np
import pandas as pd


In [43]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and therefore the validation score and the saved
# model) is identical on every Restart & Run All.
RANDOM_STATE = 42

# Load the labelled training data.
train = pd.read_csv('../data/train.csv')

# Hold out 20% of the rows for validation, stratified on 'AgeCat' so both
# splits keep the same 'AgeCat' proportions.
train_data, val_data = train_test_split(
    train,
    test_size=0.2,
    stratify=train['AgeCat'],
    random_state=RANDOM_STATE,
)

# 'AgeCat' is dropped right after the split, so it is not used as a model
# feature. Re-assign instead of `inplace=True` so the cell is idempotent and
# never mutates a frame derived from `train`.
train_data = train_data.drop(columns=['AgeCat'])
val_data = val_data.drop(columns=['AgeCat'])

# Separate the features from the 'Outcome' target for each split.
X_train = train_data.drop(columns=['Outcome'])
y_train = train_data['Outcome']

X_val = val_data.drop(columns=['Outcome'])
y_val = val_data['Outcome']

# Column lists consumed by the preprocessing ColumnTransformer: numeric
# columns are imputed + scaled, object (string) columns are imputed + encoded.
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(include='object').columns


In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then map each category to an integer code.
# handle_unknown='use_encoded_value' makes categories that were not present
# in the training split encode as -1 instead of raising at score/predict time
# (the OrdinalEncoder default is handle_unknown='error').
cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1))
])

# Route each column group through its own sub-pipeline. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder='drop').
pre_processor = ColumnTransformer([
    ('num_pre', num_pipeline, num_features),
    ('cat_pre', cat_pipeline, cat_features)
])

pre_processor


In [45]:
from sklearn.linear_model import LogisticRegression

# End-to-end estimator: the preprocessing ColumnTransformer feeds a
# logistic-regression classifier, so raw feature frames can be passed
# straight to fit / score / predict.
model_pipeline = Pipeline(
    steps=[
        ('pre_processing', pre_processor),
        ('model', LogisticRegression()),
    ]
)

# Bare last expression: render the pipeline diagram as the cell output.
model_pipeline

In [49]:

# Fit preprocessing + classifier on the training split. Pipeline.fit returns
# the pipeline itself, so `log_reg_with_piplien` is the same (now fitted)
# object as `model_pipeline`.
model_pipeline.fit(X_train, y_train)
log_reg_with_piplien = model_pipeline

# Mean accuracy on the held-out validation split.
# NOTE(review): the recorded output for this cell is 1.0 -- a perfect
# validation score is unusual; confirm no feature leaks 'Outcome'.
val_accuracy = log_reg_with_piplien.score(X_val, y_val)
print(val_accuracy)


1.0


In [52]:
import joblib
from pathlib import Path

# Where the fitted pipeline (preprocessing + model) is persisted.
MODEL_PATH = '../models/log_reg_with_pipeline.pkl'

# Create the target directory if it is missing (e.g. on a fresh clone), so
# the dump below does not fail with FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# joblib.dump returns the list of files written; leaving it as the last
# expression shows that list as the cell output.
joblib.dump(log_reg_with_piplien, MODEL_PATH)

['../models/log_reg_with_pipeline.pkl']