In [3]:
# Let's first load and explore the dataset to understand its structure.
import pandas as pd

# Load the dataset
file_path = '/content/Dataset .csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head(), data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

(   Restaurant ID         Restaurant Name  Country Code              City  \
 0        6317637        Le Petit Souffle           162       Makati City   
 1        6304287        Izakaya Kikufuji           162       Makati City   
 2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
 3        6318506                    Ooma           162  Mandaluyong City   
 4        6314302             Sambo Kojin           162  Mandaluyong City   
 
                                              Address  \
 0  Third Floor, Century City Mall, Kalayaan Avenu...   
 1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
 2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
 3  Third Floor, Mega Fashion Hall, SM Megamall, O...   
 4  Third Floor, Mega Atrium, SM Megamall, Ortigas...   
 
                                      Locality  \
 0   Century City Mall, Poblacion, Makati City   
 1  Little Tokyo, Legaspi Village, Makati City   
 2  Edsa Shangri-La, Ortigas, Mandaluyong 

In [4]:
# Step 1: Handle missing values
# Since the target column (Cuisines) has a few missing values, we'll drop the rows with missing cuisine data
data_cleaned = data.dropna(subset=['Cuisines'])

# Step 2: Encode categorical variables
# For categorical columns, we'll apply label encoding for binary columns and one-hot encoding for others

# Convert binary categorical variables to binary encoding (Yes/No to 1/0)
binary_columns = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
data_cleaned[binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})

# One-hot encode categorical columns (e.g., City, Currency)
data_encoded = pd.get_dummies(data_cleaned, columns=['City', 'Currency'])

# Step 3: Split data into features (X) and target (y)
X = data_encoded.drop(columns=['Restaurant ID', 'Restaurant Name', 'Address', 'Locality', 'Locality Verbose', 'Cuisines'])
y = data_encoded['Cuisines']

# Perform train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape


  data_cleaned[binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})


((7633, 165), (1909, 165), (7633,), (1909,))

In [8]:
# Checking for any non-numeric columns in the feature set (X_train)
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
non_numeric_columns


Index(['Rating color', 'Rating text'], dtype='object')

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np

# Step 1: Load the dataset
file_path = '/content/Dataset .csv'  # Adjust the file path if needed
data = pd.read_csv(file_path)

# Step 2: Clean the data
# Drop rows with missing values in the 'Cuisines' column (target variable)
data_cleaned = data.dropna(subset=['Cuisines'])

# Convert binary categorical columns to 0/1
binary_columns = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
data_cleaned.loc[:, binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})

# Step 3: Encode categorical variables
label_encoder = LabelEncoder()

# One-hot encode columns like City, Currency
data_encoded = pd.get_dummies(data_cleaned, columns=['City', 'Currency'])

# Apply label encoding to target variable (Cuisines)
y = label_encoder.fit_transform(data_encoded['Cuisines'])

# Step 4: Select important features for training
important_features = ['Average Cost for two', 'Price range', 'Aggregate rating', 'Votes', 'Has Table booking', 'Has Online delivery', 'Is delivering now']
X = data_encoded[important_features]

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Scale the features using MinMaxScaler to ensure non-negative values
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train)

# Step 8: Make predictions on the test set
y_pred = nb_model.predict(X_test_scaled)

# Step 9: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted')

# Get unique labels in the test set
unique_labels = np.unique(y_test)

# Adjust target_names to match the unique labels in y_test
classification_rep = classification_report(y_test, y_pred, labels=unique_labels, target_names=label_encoder.inverse_transform(unique_labels))

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Classification Report:\n", classification_rep)


  data_cleaned.loc[:, binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})


Accuracy: 0.09900471451021477
Precision: 0.012821237914464888
Recall: 0.09900471451021477
Classification Report:
                                                                                           precision    recall  f1-score   support

                                                               Afghani, Mughlai, Chinese       0.00      0.00      0.00         1
                                                                                American       0.00      0.00      0.00         3
                                                                    American, BBQ, Steak       0.00      0.00      0.00         3
                                                                      American, Bar Food       0.00      0.00      0.00         1
                                                                     American, Breakfast       0.00      0.00      0.00         1
                                                              American, Breakfast, Greek       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import numpy as np

# Step 1: Load the dataset
file_path = '/content/Dataset .csv'  # Adjust the file path if needed
data = pd.read_csv(file_path)

# Step 2: Clean the data
# Drop rows with missing values in the 'Cuisines' column (target variable)
data_cleaned = data.dropna(subset=['Cuisines'])

# Convert binary categorical columns to 0/1
binary_columns = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
data_cleaned.loc[:, binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})

# Step 3: Encode categorical variables
label_encoder = LabelEncoder()

# One-hot encode columns like City, Currency
data_encoded = pd.get_dummies(data_cleaned, columns=['City', 'Currency'])

# Apply label encoding to target variable (Cuisines)
y = label_encoder.fit_transform(data_encoded['Cuisines'])

# Step 4: Select important features for training
important_features = ['Average Cost for two', 'Price range', 'Aggregate rating', 'Votes', 'Has Table booking', 'Has Online delivery', 'Is delivering now']
X = data_encoded[important_features]

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Scale the features using MinMaxScaler to ensure non-negative values
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train)

# Step 8: Make predictions on the test set
y_pred = nb_model.predict(X_test_scaled)

# Step 9: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

# Get unique labels in the test set
unique_labels = np.unique(y_test)

# Adjust target_names to match the unique labels in y_test
classification_rep = classification_report(y_test, y_pred, labels=unique_labels, target_names=label_encoder.inverse_transform(unique_labels), zero_division=0)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Classification Report:\n", classification_rep)


  data_cleaned.loc[:, binary_columns] = data_cleaned[binary_columns].replace({'Yes': 1, 'No': 0})


Accuracy: 0.09900471451021477
Precision: 0.012821237914464888
Recall: 0.09900471451021477
Classification Report:
                                                                                           precision    recall  f1-score   support

                                                               Afghani, Mughlai, Chinese       0.00      0.00      0.00         1
                                                                                American       0.00      0.00      0.00         3
                                                                    American, BBQ, Steak       0.00      0.00      0.00         3
                                                                      American, Bar Food       0.00      0.00      0.00         1
                                                                     American, Breakfast       0.00      0.00      0.00         1
                                                              American, Breakfast, Greek       0.00      

In [7]:
import pickle

In [8]:
pickle.dump(nb_model, open('Cuisine_Classification.pkl', 'wb'))