In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the Social Network Ads dataset from the shared Data directory
# (path is relative to this notebook's location).
ads_csv_path = "../Data/Social_Network_Ads.csv"
ads = pd.read_csv(ads_csv_path)

# Preview the first rows to check the columns that were read.
ads.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [2]:
# Target: the "Purchased" column. Features: every remaining column except
# the "User ID" identifier.
y = ads["Purchased"]
X = ads.drop(columns=["User ID", "Purchased"])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

# Route columns to a preprocessing branch by dtype.
# Numeric columns are selected positively (the "number" dtype family) rather
# than as "anything that is not object/category": the complement-based test
# would send string-extension, datetime or boolean columns to the
# mean-imputer / scaler branch, where they cannot be processed.
# Every column that is not numeric is treated as categorical.
# Column order within each list follows the order in X.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own list of columns.
data_prep = ColumnTransformer([
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by an L1-regularised logistic regression.
# The "saga" solver shuffles the data while optimising, so the estimator is
# seeded (same seed as the train/test split) to make the fitted coefficients
# and the reported score reproducible across kernel restarts.
pipeline = Pipeline([
    ("prep", data_prep),
    ("lr", LogisticRegression(C=.422, penalty="l1", solver="saga", random_state=2023)),
])

# Fit on the training split, then report mean accuracy on the held-out split
# (the last expression is the cell's displayed output).
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.875

In [3]:
# Show the pipeline object as this cell's output to inspect its structure.
pipeline

In [4]:
# Print the output feature names of every fitted pipeline step that can
# report them. `get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2, so guarding on it silently skips every step; the supported
# method is `get_feature_names_out`. Steps without it (e.g. the final
# classifier) are skipped.
for name, step in pipeline.named_steps.items():
    if hasattr(step, "get_feature_names_out"):
        print(f"{name}: {list(step.get_feature_names_out())}")

In [5]:
import joblib

# Serialise the pipeline object to "ads_model_pipeline2.pkl" with joblib.
# The call is currently disabled, so running this cell writes nothing.
# NOTE(review): the output recorded under this cell looks like the return
# value of an earlier run in which the dump was active — confirm whether the
# saved file is still current before relying on it.
#joblib.dump(pipeline, "ads_model_pipeline2.pkl")

['ads_model_pipeline2.pkl']