In [1]:
import warnings

import joblib
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the restaurant dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("restaurants.csv")

In [3]:
# Peek at the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,id,restaurant,food type,price range,rating,clients per month,location,recommended
0,1,Taco Town,Indian,$30-$40,2.5,2147,"San Jose, CA",Yes
1,2,Pizza Paradise,Indian,$40-$50,2.2,218,"New York, NY",No
2,3,Taco Town,Italian,$10-$20,2.4,9566,"Los Angeles, CA",Yes
3,4,Burger Barn,American,$30-$40,3.1,9144,"Los Angeles, CA",Yes
4,5,Sushi Central,Vegan,$40-$50,3.4,5876,"New York, NY",No


In [4]:
# Check the dtype and non-null count of every column before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 200 non-null    int64  
 1   restaurant         200 non-null    object 
 2   food type          200 non-null    object 
 3   price range        200 non-null    object 
 4   rating             200 non-null    float64
 5   clients per month  200 non-null    int64  
 6   location           200 non-null    object 
 7   recommended        200 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 12.6+ KB


In [5]:
# Target: the "recommended" label.
y = df["recommended"]
# Features: every column except the row id, the restaurant name and the target itself.
X = df.drop(["id", "restaurant", "recommended"], axis="columns")

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# One-hot encode the categorical columns.
# The training split defines the feature set. Encoding each split independently
# only yields matching columns when every category happens to appear in both,
# so the test split is explicitly aligned to the training columns: categories
# missing from the test rows are added as all-False columns, and categories that
# appear only in the test rows (unknown to the model) are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=False)

In [8]:
# The model requires identical feature columns, in the same order, in both splits.
# Index.equals handles differing lengths gracefully, whereas the element-wise `==`
# below raises an opaque ValueError when the two indexes have different sizes,
# so assert first to fail with a readable message.
assert X_train.columns.equals(X_test.columns), "train/test feature columns differ"
(X_train.columns == X_test.columns).all()

np.True_

In [9]:
# Fit a multinomial Naive Bayes classifier on the one-hot encoded training split.
# NOTE(review): MultinomialNB is usually applied to count-like features; here it also
# receives "rating" and "clients per month" on very different scales — confirm this
# model choice is intended.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [10]:
# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

In [11]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.67      0.61      0.64        23
         Yes       0.53      0.59      0.56        17

    accuracy                           0.60        40
   macro avg       0.60      0.60      0.60        40
weighted avg       0.61      0.60      0.60        40



In [12]:
# Persist the fitted classifier to disk (joblib is imported in the top import cell).
# Only the estimator is saved: the one-hot encoding step is not part of it, so any
# consumer must rebuild the columns listed in model.feature_names_in_ before predicting.
joblib.dump(model, "modelo.joblib")

['modelo.joblib']

In [13]:
# Feature names, in order, that the model recorded during fit; new inputs must match them.
model.feature_names_in_

array(['rating', 'clients per month', 'food type_American',
       'food type_BBQ', 'food type_Indian', 'food type_Italian',
       'food type_Japanese', 'food type_Mexican', 'food type_Seafood',
       'food type_Vegan', 'price range_$10-$20', 'price range_$20-$30',
       'price range_$30-$40', 'price range_$40-$50',
       'location_Chicago, IL', 'location_Dallas, TX',
       'location_Houston, TX', 'location_Los Angeles, CA',
       'location_New York, NY', 'location_Philadelphia, PA',
       'location_Phoenix, AZ', 'location_San Antonio, TX',
       'location_San Diego, CA', 'location_San Jose, CA'], dtype=object)

In [35]:
# A single hand-written restaurant to score with the trained model.
# NOTE(review): "$0-$100" and "location_Mexico" are not among the categories listed in
# model.feature_names_in_, so their dummy columns will not match any trained feature.
new_restaurant = {
    "food type": "BBQ",
    "price range": "$0-$100",
    "rating": 4.2,
    "clients per month": 636542,
    "location": "location_Mexico",
}

# Build a one-row frame and one-hot encode its categorical fields.
restaurant_df = pd.get_dummies(pd.DataFrame([new_restaurant]))
restaurant_df.head()

Unnamed: 0,rating,clients per month,food type_BBQ,price range_$0-$100,location_location_Mexico
0,4.2,636542,True,True,True


In [36]:
# Align the encoded row with the feature columns the model was trained on.
expected_features = list(model.feature_names_in_)

# Categories never seen during fit produce dummy columns the model has no weight for.
# They are left out when the frame is indexed by the trained feature names, so report
# them here instead of letting the information disappear silently.
unseen = [col for col in restaurant_df.columns if col not in expected_features]
if unseen:
    warnings.warn(f"Ignoring features not seen during training: {unseen}")

# Trained features absent from this row are categories it does not belong to.
missing = [x for x in expected_features if x not in restaurant_df.columns]
restaurant_df[missing] = False

In [None]:
# Index by the fitted feature names so the columns arrive in training order and any
# extra dummy columns (categories not seen during fit) are left out of the input.
model.predict(restaurant_df[model.feature_names_in_])

array(['Yes'], dtype='<U3')