# Develop a machine learning model to classify restaurants based on their cuisines.

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [35]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the libraries used in
# later cells, which can mask problems during model fitting and evaluation.
warnings.filterwarnings("ignore")

In [36]:
# Load the restaurant data into df; note the file name contains a space
# before the ".csv" extension.
df=pd.read_csv("Dataset .csv")

In [6]:
# Preview the first three rows of the loaded frame
df.head(3)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270


# Dropping the unnecessary columns

In [7]:
# Remove, by name, every column that the first approach does not use.
unused_columns = [
    'Restaurant ID', 'Country Code', 'City', 'Address', 'Locality',
    'Locality Verbose', 'Longitude', 'Latitude', 'Currency',
    'Has Table booking', 'Has Online delivery', 'Is delivering now',
    'Switch to order menu', 'Price range', 'Aggregate rating',
    'Rating color', 'Rating text', 'Votes',
]
df.drop(columns=unused_columns, inplace=True)

In [8]:
# Count the missing entries in each remaining column
df.isna().sum()

Restaurant Name         0
Cuisines                9
Average Cost for two    0
dtype: int64

In [9]:
# Discard every row that has at least one missing value
df = df.dropna(axis=0, how="any")

# Converting Categorical data to Numerical

In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace the text in both columns with integer codes. The same encoder
# object is re-fitted for each column, so after the loop it only holds the
# mapping of the last column ('Cuisines').
label_encoder = LabelEncoder()
for column in ('Restaurant Name', 'Cuisines'):
    df[column] = label_encoder.fit_transform(df[column])

In [12]:
# Display the frame after the label-encoding step
df

Unnamed: 0,Restaurant Name,Cuisines,Average Cost for two
0,3742,920,1100
1,3167,1111,1200
2,2892,1671,4000
3,4700,1126,1500
4,5515,1122,1500
...,...,...,...
9546,4436,1813,80
9547,1310,1824,105
9548,3063,1110,170
9549,512,1657,120



# Creating Independent and dependent variable

In [15]:
# Features: encoded restaurant name and average cost for two.
# Target: encoded cuisine label.
feature_columns = ['Restaurant Name', 'Average Cost for two']
X = df[feature_columns]
y = df['Cuisines']

# Train test split

In [16]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=40,
)

In [19]:
from sklearn.preprocessing import StandardScaler
# Create the scaler; it is fitted on the training features in a later cell
scaler=StandardScaler()

In [20]:
# Learn the scaling statistics from the training rows only
scaler.fit(X_train)

# Apply that one fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Place the raw training features side by side with their scaled copies.
# NOTE(review): the result holds each feature twice (raw and scaled) -
# confirm this duplication is intended.
X_train_preprocessed = np.hstack([X_train, X_train_scaled])

In [24]:
# Place the raw test features side by side with their scaled copies.
# NOTE(review): the result holds each feature twice (raw and scaled) -
# confirm this duplication is intended.
X_test_preprocessed = np.hstack([X_test, X_test_scaled])

In [26]:
# Build a Gini-criterion decision tree with a fixed seed and train it;
# fit() returns the fitted estimator, so construction and training are chained.
model_DT = DecisionTreeClassifier(criterion="gini", random_state=10).fit(
    X_train_preprocessed, y_train
)

# Predict labels for the held-out rows
y_pred = model_DT.predict(X_test_preprocessed)

# Show the (actual, predicted) label pairs
print([(actual, predicted) for actual, predicted in zip(y_test, y_pred)])

[(191, 1514), (305, 307), (81, 1031), (1667, 168), (1766, 186), (1212, 1212), (549, 497), (331, 331), (1306, 1448), (986, 774), (1306, 1306), (201, 549), (1813, 1288), (1306, 1306), (1329, 1329), (497, 497), (1306, 1636), (497, 1590), (1520, 186), (1749, 1306), (1306, 1306), (191, 1329), (1306, 745), (1306, 186), (249, 249), (1663, 171), (55, 54), (333, 837), (422, 331), (331, 828), (703, 1315), (828, 828), (1524, 471), (1329, 1514), (1514, 1348), (1795, 686), (982, 982), (1222, 1520), (1822, 258), (77, 818), (497, 629), (1592, 865), (201, 177), (1650, 1650), (497, 497), (1722, 177), (1402, 1060), (497, 497), (837, 1699), (549, 1275), (31, 1106), (497, 1306), (1599, 1599), (1451, 1406), (546, 1514), (1306, 1559), (177, 186), (1334, 1334), (1354, 684), (1306, 186), (1381, 1306), (650, 1759), (331, 331), (719, 1031), (1306, 1329), (1011, 258), (1765, 1275), (165, 1275), (1771, 841), (1812, 497), (1387, 1380), (1716, 518), (426, 1514), (1306, 1262), (497, 497), (1597, 1306), (1384, 1795),

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute the confusion matrix, the per-class report and the accuracy first ...
cfm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

# ... then print them in that order
print(cfm)
print("Classification report: ")
print(report)
print("Accuracy of the model: ", acc)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        16
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         0
          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         4
          22       0.00      0.00      0.00  

# Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

# Create the model object. The `multi_class` argument is deprecated in recent
# scikit-learn releases and is therefore omitted: with the default solver the
# estimator already fits a multinomial model when there are more than two
# classes, so the fitted model is the same.
classifier = LogisticRegression()

# Train on the training split
classifier.fit(X_train_preprocessed, y_train)

# Predict labels for the held-out rows and show them
y_pred = classifier.predict(X_test_preprocessed)
print(y_pred)

[1329 1306 1306 ... 1306 1329 1329]


In [29]:

 
# Evaluate the predictions currently held in y_pred against the test labels
cfm = confusion_matrix(y_test, y_pred)
print(cfm)

print("Classification report: ")

print(classification_report(y_test, y_pred))

# The predictions are stored in `y_pred` (lower case); the previous `Y_pred`
# was never defined and raised a NameError before the accuracy was printed.
acc = accuracy_score(y_test, y_pred)
print("Accuracy of the model: ", acc)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification report: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           6       0.00      0.00      0.00        16
           8       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         4
          29       0.00      0.00      0.00         9
          31       0.00      0.00      0.00         1
          33       0.00      0.00      0.00         1
          35       0.00      0.00      0.00         2
          39       0.00      0.00      0.00  

# Conclusion

Comparing the Decision Tree and Logistic Regression models trained above, the Decision Tree performs better than Logistic Regression.

# Second Approach to the same problem

In [11]:
# List the column names currently present in df
df.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

# Encoding only the top-N most frequent cuisines

Since many cuisine combinations appear only once, we keep only the most frequent ones as separate labels and group all remaining ones under 'Other'.

In [37]:
# Start again from the raw file: earlier cells dropped most columns from df
# and replaced 'Cuisines' with integer codes, while this approach needs the
# original columns and the cuisine strings. Reloading here keeps the notebook
# runnable from top to bottom.
df = pd.read_csv("Dataset .csv")

# Relative frequency of every cuisine string, most frequent first
frequency_encoding = df['Cuisines'].value_counts(normalize=True)

# Define the number of top frequent cuisines to encode
top_n = 500

# Get the top n frequent cuisines
top_n_cuisines = frequency_encoding.head(top_n).index.tolist()

# Keep the cuisine string when it is one of the top n; every other value
# (including missing ones) becomes 'Other'. isin/where is vectorised and
# avoids a Python-level list lookup per row.
df['Cuisine_Encoded'] = df['Cuisines'].where(df['Cuisines'].isin(top_n_cuisines), 'Other')

In [38]:
# Count how many rows carry each encoded cuisine label
df["Cuisine_Encoded"].value_counts()

Cuisine_Encoded
Other                                                          1381
North Indian                                                    936
North Indian, Chinese                                           511
Fast Food                                                       354
Chinese                                                         354
                                                               ... 
Cafe, Mexican                                                     2
North Indian, Mughlai, South Indian                               2
Bakery, Street Food                                               2
Chinese, Japanese                                                 2
Continental, American, Italian, Seafood, North Indian, Cafe       2
Name: count, Length: 501, dtype: int64

In [39]:
# Extract features and target variable
X = df.drop(columns=['Cuisines','Restaurant ID', 'Country Code','City','Address','Locality','Locality Verbose','Has Table booking','Has Online delivery',
                     'Is delivering now','Switch to order menu','Rating text','Rating color','Currency'])
y = df['Cuisine_Encoded']

# Perform one-hot encoding for categorical features
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [40]:
# Random forest of 100 trees with a fixed seed; fit() returns the fitted
# estimator, so construction and training are chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Labels predicted for the held-out rows
y_pred = rf_classifier.predict(X_test)

# Share of held-out rows whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9010989010989011


In [41]:
# Display the predictions currently held in y_pred
y_pred

array(['North Indian', 'Other', 'North Indian, Mughlai', ...,
       'Chinese, Fast Food', 'Continental', 'Other'], dtype=object)

In [42]:
# Show the (actual, predicted) label pairs for the held-out rows
print([pair for pair in zip(y_test, y_pred)])

[('North Indian', 'North Indian'), ('Other', 'Other'), ('North Indian, Mughlai', 'North Indian, Mughlai'), ('Fast Food', 'Fast Food'), ('North Indian, Mughlai', 'North Indian, Mughlai'), ('Beverages, Fast Food', 'Beverages, Fast Food'), ('Bakery, Fast Food', 'Bakery, Fast Food'), ('Street Food, Mithai', 'Street Food, Mithai'), ('North Indian, Chinese, Mughlai', 'North Indian, Chinese, Mughlai'), ('Asian', 'Asian'), ('Chinese, Thai, Asian', 'Other'), ('North Indian', 'North Indian'), ('Fast Food, North Indian', 'Fast Food, North Indian'), ('Desserts', 'Desserts'), ('North Indian', 'North Indian'), ('North Indian, Chinese', 'North Indian, Chinese'), ('Cafe, Italian, Desserts, Fast Food, Chinese, Tea', 'Cafe, Italian, Desserts, Fast Food, Chinese, Tea'), ('North Indian, Chinese, Continental, Fast Food', 'North Indian'), ('North Indian, Chinese', 'North Indian, Chinese'), ('Chinese, North Indian', 'Chinese, North Indian'), ('Other', 'Other'), ('Chinese, Asian, Thai', 'Chinese, Asian, Thai'

In [40]:
# Decision tree on the same split. A fixed random_state makes the fitted tree
# - and therefore the reported accuracy - reproducible across runs, matching
# the seeded models used elsewhere in this notebook.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9507595599790466


# Conclusion

By using the top-most-frequent encoding, random forest is performing better.