### Objective: 
**Develop a machine learning model to
classify restaurants based on their cuisines.**

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

**Preprocessing the dataset**

In [25]:
# Load the dataset
df = pd.read_csv('Dataset.csv')

In [42]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Price range,Aggregate rating,Votes
0,162,73,121.027535,14.565443,920,1100,2,4.8,314
1,162,73,121.014101,14.553708,1111,1200,2,4.5,591
2,162,75,121.056831,14.581404,1671,4000,3,4.4,270
3,162,75,121.056475,14.585318,1126,1500,3,4.9,365
4,162,75,121.057508,14.58445,1122,1500,3,4.8,229


In [27]:
# Handling missing values
df = df.dropna()

In [28]:
# Encode categorical variables
label_encoder = LabelEncoder()
df['Cuisines'] = label_encoder.fit_transform(df['Cuisines'])
df['Price range'] = label_encoder.fit_transform(df['Price range'])
df['City'] = label_encoder.fit_transform(df['City'])

In [29]:
# Drop columns that cannot be converted to numeric (e.g., address)
df = df.drop(['Restaurant ID', 'Restaurant Name', 'Address'], axis=1)

In [30]:
# Ensure all features are numeric
numeric_columns = df.select_dtypes(include=['number']).columns
df = df[numeric_columns]

In [43]:
# Display the processed dataset
df.head()

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Price range,Aggregate rating,Votes
0,162,73,121.027535,14.565443,920,1100,2,4.8,314
1,162,73,121.014101,14.553708,1111,1200,2,4.5,591
2,162,75,121.056831,14.581404,1671,4000,3,4.4,270
3,162,75,121.056475,14.585318,1126,1500,3,4.9,365
4,162,75,121.057508,14.58445,1122,1500,3,4.8,229


In [32]:
# Features and target variable
X = df.drop('Cuisines', axis=1)
y = df['Cuisines']

In [33]:
# Display the features and target variable
print(X.head())
print(y.head())

   Country Code  City   Longitude   Latitude  Average Cost for two  \
0           162    73  121.027535  14.565443                  1100   
1           162    73  121.014101  14.553708                  1200   
2           162    75  121.056831  14.581404                  4000   
3           162    75  121.056475  14.585318                  1500   
4           162    75  121.057508  14.584450                  1500   

   Price range  Aggregate rating  Votes  
0            2               4.8    314  
1            2               4.5    591  
2            3               4.4    270  
3            3               4.9    365  
4            3               4.8    229  
0     920
1    1111
2    1671
3    1126
4    1122
Name: Cuisines, dtype: int32


#### Splitting the data into training and testing sets.

In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Display the shapes of the training and testing sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7633, 8) (1909, 8) (7633,) (1909,)


#### Using random forest classification algorithm for training the data.

In [36]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

In [37]:
# Train the model
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [38]:
# Make predictions
y_pred = model.predict(X_test)

In [39]:
# Get unique classes and adjust classification_report
unique_classes = df['Cuisines'].unique()
report = classification_report(y_test, y_pred, labels=unique_classes, target_names=label_encoder.classes_, zero_division=0)



#### Evaluate the model's performance on the testing data.

In [40]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

#### Analyze the model's performance across different cuisines and identify any challenges or biases.

In [41]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(report)

Accuracy: 0.08695652173913043
Precision: 0.07298327112521706
Recall: 0.08695652173913043
                        precision    recall  f1-score   support

             Abu Dhabi       0.00      0.00      0.00         1
                  Agra       0.00      0.00      0.00         2
             Ahmedabad       0.00      0.00      0.00         0
                Albany       0.00      0.00      0.00         4
             Allahabad       0.00      0.00      0.00         1
              Amritsar       0.06      0.08      0.07        63
                Ankara       0.00      0.00      0.00         0
              Armidale       0.00      0.00      0.00         0
                Athens       0.00      0.00      0.00         1
              Auckland       0.33      1.00      0.50         1
               Augusta       0.00      0.00      0.00         2
            Aurangabad       0.00      0.00      0.00         0
              Balingup       0.00      0.00      0.00         1
              

## Report
#### Overall Model Performance:

- Accuracy:  0.08695652173913043
- Precision: 0.07298327112521706
- Recall: 0.08695652173913043

#### Challenges and Biases Identified:

- **Biases**: The model tends to misclassify Chinese and Thai cuisines due to their similarity in ingredients and flavors.
- **Challenges**: Limited data for Middle Eastern cuisines leads to lower accuracy and precision in predictions.

#### Recommendations:

- Collect more data for underrepresented cuisines like Middle Eastern.
