In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
from seaborn import load_dataset 

In [3]:
data = load_dataset('tips') 

In [4]:
data 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [59]:
data['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [36]:
data.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [61]:
x = data.drop('smoker',axis=1)
y = data['smoker'].map({'Yes':1,'No':0}) 

In [62]:
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    1
241    1
242    0
243    0
Name: smoker, Length: 244, dtype: category
Categories (2, int64): [1, 0]

In [63]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) 

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder 

In [65]:
num_cols = ['total_bill','tip','size']
cat_cols = ['sex','time','day']


In [66]:
num_pipe = Pipeline(steps=[('SimpleImputer_num',SimpleImputer()),('StandardScaler',StandardScaler())])
cat_pipe = Pipeline(steps=[('SimpleImputer_cat',SimpleImputer(strategy='most_frequent')),('OneHotEncoder',OneHotEncoder())]) 


In [67]:
preprocessor_pipe = ColumnTransformer(transformers=[('num_pipe',num_pipe,num_cols),('cat_cols',cat_pipe,cat_cols)]) 

In [68]:
model_pipe = Pipeline([('preprocessor_pipe',preprocessor_pipe),('model_pipe',LogisticRegression())])

In [69]:
model_pipe

In [70]:
model_pipe.fit(x_train, y_train)

In [71]:
model_pipe.predict(x_train)

array([0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
      dtype=int64)

In [72]:
import pickle

In [73]:
pickle.dump(model_pipe, open('model.pkl', 'wb'))