# Linear Regression and Random Forest Classifier

In [2]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score

In [3]:
df = pd.read_csv("restaurants_with_metrics.csv")

In [4]:
df.head()

Unnamed: 0,ID,Name,Food Type,Details,Smoking Area,Price Range,City,State,Image,Waiter Count,People,Total,Tip,Payment Method,Classification
0,1,Ocean's Catch,Mediterranean,"Fresh seafood dishes with a coastal vibe, insp...",Yes,$$$,Houston,TX,pexels-photo-2042591.jpeg,3,6.119587,110.367585,13.808969,Debit Card,Good
1,2,Spice Symphony,American,An orchestra of flavors from around the world ...,Yes,$$$,San Diego,TX,pexels-photo-29172133.jpeg,6,5.727653,109.592477,14.036327,Cash,Good
2,3,La Bella Cucina,Chinese,"Authentic Italian cuisine served in a cozy, fa...",No,$$$$,Los Angeles,TX,pexels-photo-29132866.jpeg,10,5.311244,111.868814,13.98022,Cash,Good
3,4,The Burger Bar,Mexican,Classic and gourmet burgers with a variety of ...,No,$,Dallas,AZ,pexels-photo-7317354.jpeg,4,5.32548,108.776504,13.046276,Cash,Bad
4,5,Veggie Garden,Cafe,"A plant-based paradise featuring vibrant, fres...",Yes,$$$,Los Angeles,TX,pexels-photo-28577205.jpeg,3,5.645668,109.974089,13.949825,Credit Card,Good


## Data for Linear Regression

- Removing columns from X that TipOn won't have access to when evaluating new clients.
- We want to predict the Tip amount.

In [5]:
X = df.drop(columns=["ID", "Name", "Details", "Image", "Waiter Count", "People", "Total", "Tip", "Payment Method", "Classification"])
y = df["Tip"]

## Defining our Pipeline for Linear Regression

In [6]:
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns
categorical_nominal_columns = X.select_dtypes(include=["object"]).columns.drop("Price Range")
categorical_ordinal_columns = ["Price Range"]

In [7]:
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_nominal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), # change strategy to constant and fill_value to "missing" if you want to use missing as a category
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

categorical_ordinal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(categories=[["$", "$$", "$$$", "$$$$"]]))
])

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_columns),
    ("cat_nom", categorical_nominal_pipeline, categorical_nominal_columns),
    ("cat_ord", categorical_ordinal_pipeline, categorical_ordinal_columns)
])

In [9]:
lr_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LinearRegression())
])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
lr_model.fit(X_train, y_train)

In [12]:
y_pred = lr_model.predict(X_test)

In [13]:
print(f"MSE: {mean_squared_error(y_test, y_pred)}")

MSE: 0.28974070059465806


In [14]:
joblib.dump(lr_model, "lr_model.joblib")

['lr_model.joblib']

## Data for Random Forest Classifier

- Removing columns from X that TipOn won't have access to when evaluating new clients.
- We want to predict the Classification column (good or bad).

In [15]:
df.head()

Unnamed: 0,ID,Name,Food Type,Details,Smoking Area,Price Range,City,State,Image,Waiter Count,People,Total,Tip,Payment Method,Classification
0,1,Ocean's Catch,Mediterranean,"Fresh seafood dishes with a coastal vibe, insp...",Yes,$$$,Houston,TX,pexels-photo-2042591.jpeg,3,6.119587,110.367585,13.808969,Debit Card,Good
1,2,Spice Symphony,American,An orchestra of flavors from around the world ...,Yes,$$$,San Diego,TX,pexels-photo-29172133.jpeg,6,5.727653,109.592477,14.036327,Cash,Good
2,3,La Bella Cucina,Chinese,"Authentic Italian cuisine served in a cozy, fa...",No,$$$$,Los Angeles,TX,pexels-photo-29132866.jpeg,10,5.311244,111.868814,13.98022,Cash,Good
3,4,The Burger Bar,Mexican,Classic and gourmet burgers with a variety of ...,No,$,Dallas,AZ,pexels-photo-7317354.jpeg,4,5.32548,108.776504,13.046276,Cash,Bad
4,5,Veggie Garden,Cafe,"A plant-based paradise featuring vibrant, fres...",Yes,$$$,Los Angeles,TX,pexels-photo-28577205.jpeg,3,5.645668,109.974089,13.949825,Credit Card,Good


In [16]:
X = df.drop(columns=["ID", "Name", "Details", "Image", "Waiter Count", "People", "Total", "Tip", "Payment Method", "Classification"])
y = df["Classification"]

## Defining our Pipeline for Random Forest Classifier

In [17]:
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns
categorical_nominal_columns = X.select_dtypes(include=["object"]).columns.drop("Price Range")
categorical_ordinal_columns = ["Price Range"]

In [18]:
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_nominal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")), # change strategy to constant and fill_value to "missing" if you want to use missing as a category
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

categorical_ordinal_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(categories=[["$", "$$", "$$$", "$$$$"]]))
])

In [19]:
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_pipeline, numerical_columns),
    ("cat_nom", categorical_nominal_pipeline, categorical_nominal_columns),
    ("cat_ord", categorical_ordinal_pipeline, categorical_ordinal_columns)
])

In [20]:
rf_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [22]:
rf_model.fit(X_train, y_train)

In [23]:
y_pred = rf_model.predict(X_test)

In [24]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.45
              precision    recall  f1-score   support

         Bad       0.56      0.42      0.48        12
        Good       0.36      0.50      0.42         8

    accuracy                           0.45        20
   macro avg       0.46      0.46      0.45        20
weighted avg       0.48      0.45      0.45        20



In [25]:
importances = rf_model.named_steps['classifier'].feature_importances_
categorical_feature_names = rf_model.named_steps["preprocessor"]["cat_nom"].named_steps["encoder"].get_feature_names_out(categorical_nominal_columns)
feature_names = np.concatenate([numerical_columns, categorical_feature_names, categorical_ordinal_columns])

fig = px.bar(x=importances, y=feature_names, orientation="h", title="Feature Importances in Random Forest Model", labels={"x": "Importance", "y": "Feature"}, height=1000)
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()

In [26]:
joblib.dump(rf_model, "rf_model.joblib")

['rf_model.joblib']