In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import set_config
set_config(display="diagram")

In [27]:
import warnings
warnings.filterwarnings('ignore')

In [28]:
import pickle

-----------------------------------

In [29]:
data = sns.load_dataset("tips")

In [30]:
y = data['tip']
X = data.drop(columns=['tip'])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=60)

In [32]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('minmax_scaler', MinMaxScaler())
])

In [33]:
cat_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

In [34]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_pipeline, ["size","total_bill"]),
    ('cat_transformer', cat_transformer, ["sex","smoker","day","time"])
])

In [35]:
pipeline_workflow = make_pipeline(preprocessor, LinearRegression())

In [36]:
pipeline_workflow.fit(X_train, y_train)

In [37]:
pipeline_workflow.score(X_test, y_test)

0.3484015613334338

In [38]:
cross_val_score(pipeline_workflow, X_train, y_train, cv=5, scoring='r2').mean()

0.3660585021016723

In [39]:
pipeline_workflow.fit(X, y)

In [40]:
pickle.dump(pipeline_workflow, open('pipeline.pkl', 'wb'))

In [41]:
pickled_model = pickle.load(open('pipeline.pkl', 'rb'))

In [48]:
dtf = pd.DataFrame({"size" : [5], "total_bill" : [None], "sex" : ['Male'], "smoker" : ['Yes'], "day" : ['Sun'], "time" : ['Lunch']})

In [49]:
round(pickled_model.predict(X)[0],2)

2.74