# 1. Import Libraries

In [2]:
#import libraries

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import multilabel_confusion_matrix,classification_report,f1_score,make_scorer


import joblib

In [3]:
df = pd.read_csv('Dataset.csv')

# 2. Cleaning

In [4]:
df_copy = df.copy()

In [5]:
df_copy = df_copy[['Restaurant Name', 'Cuisines']]

In [6]:
column_mask = {'Restaurant Name' : 'restaurant_name', 'Cuisines' : 'cuisines'}

df_copy = df_copy.rename(columns = column_mask)

In [7]:
# encoding error

fix_restaurant_names = {
    'Caf��':'Café',
    'Pizza �� Bessa': 'Pizza à Bessa',
    'Tayp��': 'Taypá',
    'Manzu��': 'Manzuá',
    'Braseiro da G��vea': 'Braseiro da Gávea',
    'Zaz�� Bistr�� Tropical': 'Zazá Bistrô Tropical',
    'Fil�� de Ouro': 'Filé de Ouro',
    'Apraz�_vel': 'Aprazível',
    'Terra�_o It��lia': 'Terraço Itália',
    'Divino Fog��o': 'Divino Fogão',
    'Esquina Mocot�_': 'Esquina Mocotó',
    'Cev�_che Tapas Bar & Restaurant': 'Cevíche Tapas Bar & Restaurant',
    "Longitude 77��03' Bar - Le Meridien Gurgaon": "Longitude 77°03' Bar - Le Meridien Gurgaon",
    'bu��no': 'buóno',
    'M Cr��me': 'M Crème',
    "Chawla's�_": "Chawla's",
    'Con�_u': 'Conçu',
    'Sahib��s Barbeque by Ohri��s': "Sahib's Barbeque by Ohri's",
    'NESCAF� Illusions': 'NESCAFÉ Illusions',
    'It��s Sinful': "It’s Sinful",
    'D�_ner Grill': 'Döner Grill',
    'LaBont��': 'LaBonté',
    'Bon App��tit': 'Bon Appétit',
    'Delhite P��tisserie': 'Delhite Pâtisserie',
    'Die B�_ckerei': 'Die Bäckerei',
    'Rosart�� Chocolate': 'Rosarté Chocolate',
    'H�_agen-Dazs': 'Häagen-Dazs',
    'TBH ��� To Be Healthy': 'TBH — To Be Healthy',
    'Caff�� La Poya': 'Caffé La Poya',
    'KBC�_': 'KBC',
    'Saut��ed Stories': 'Sautéed Stories',
    'Eden Noodles Cafe �__·�_��_��_��': 'Eden Noodles Cafe',
    'Masaba��۱ Kebap�_۱s۱': 'Masabaş Kebapçıs',
    'Me��hur Tavac۱ Recep Usta': 'Meşhur Tavacı Recep Usta',
    '�ukura��a Sofras۱': 'Çukurağa Sofrası',
    'Me��hur �_z�_elik Aspava': 'Meşhur Özçelik Aspava',
    'Masaba��۱': 'Masabaş',
    'D�_vero��lu': 'Döveroğlu',
    'Pizza ��l Forno': 'Pizza àl Forno',
    'Emirgan S�_ti��': 'Emirgan Sütiş',
    'Leman K�_lt�_r': 'Leman Kültür',
    'Dem Karak�_y': 'Dem Karaköy',
    'Karak�_y G�_ll�_o��lu': 'Karaköy Güllüoğlu',
    'Ceviz A��ac۱': 'Ceviz Ağacı',
    'A���k Kahve': 'Açık Kahve'
}

df_copy['restaurant_name'] = df_copy['restaurant_name'].replace(fix_restaurant_names,regex = True)

In [8]:
df_copy['restaurant_name'] = df_copy['restaurant_name'][~df_copy['restaurant_name'].str.contains('�')]

In [9]:
#fix encoding error in cuisines column

fix_cuisines = {'Kebab, Turkish Pizza, D�_ner': 'Döner',
                'Desserts, B�_rek': 'Desserts, Börek'}

df_copy['cuisines'] = df_copy['cuisines'].replace(fix_cuisines,regex=True)

In [10]:
df_copy = df_copy.dropna()

In [11]:
df_copy = df_copy.reset_index(drop=True)

In [12]:
pattern = r'\s\s+'
df_copy['restaurant_name'] = df_copy['restaurant_name'].str.strip().replace(pattern,'',regex=True)

# 3. Feature Engineering

In [13]:
cleaned_df = df_copy.copy()

In [14]:
#multilabel classification - lets separate our labels 
cleaned_df['cuisines_list'] = cleaned_df['cuisines'].apply(lambda x:[c.strip() for c in x.split(',')])

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(cleaned_df['cuisines_list'])

In [15]:
# let's vectorized our features

tfidf = TfidfVectorizer()
X_vectorized = tfidf.fit_transform(cleaned_df['restaurant_name'])

# 4. Modelling

to avoid output like this - ` UserWarning: Label not 26 is present in all training examples. warnings.warn(`', i will filtered out labels that occur too often and less often in the df.  

This message is because some label are missing from the entire training set. it could affect how well our model will learn, so lets remove those labels

In [16]:
label_count = y.sum(axis=0) #how mnay times each label appears 
n_samples = y.shape[0] #total number of samples

In [17]:
# Define a threshold: remove labels that appear in <2% or >98% of rows
too_common = (label_count / n_samples) > 0.98
too_rare = (label_count / n_samples) < 0.02

remove_labels = np.where(too_common | too_rare)[0] #gets the exact row indices that match the filter.

filtered_y = np.delete(y,remove_labels,axis=1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized,filtered_y, test_size= 0.2, random_state=42)

In [19]:
lr = OneVsRestClassifier(LogisticRegression())

lr.fit(X_train,y_train)

model = lr.predict(X_test)

In [20]:
print(f1_score(model,y_test,average='micro'))
print(f1_score(model,y_test,average='macro'))
print(f1_score(model,y_test,average='samples',zero_division=0))

0.48521702663032085
0.40222508376329213
0.3782003926658015


# 5. Model Improvement

### 5. 1 Cross Vlidation

In [21]:
f1 = make_scorer(f1_score, average='macro')

scores = cross_val_score(lr,X_vectorized, filtered_y, cv=5, scoring=f1)

# Print results
print("F1 Scores for each fold:", scores)
print("Average F1 Score:", scores.mean())


KeyboardInterrupt



### 5.2. Usinng GridsearchCV


let's get the parameters we will be turing

In [22]:
#try gridsearchcv

model = OneVsRestClassifier(LogisticRegression())

param_grid ={
        'estimator__C': [1.0, 5.0],
        'estimator__class_weight': [None, 'balanced'],
 'estimator__max_iter': [100, 500],
 'estimator__solver': ['liblinear'],
}

scorer = make_scorer(f1_score, average='micro')

#set up and run gridsearchcv
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring=scorer,
                           cv=5,
                           n_jobs=-1)  # use all cores

grid_search.fit(X_train, y_train)

#check result
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Best Parameters: {'estimator__C': 5.0, 'estimator__class_weight': 'balanced', 'estimator__max_iter': 100, 'estimator__solver': 'liblinear'}
Best F1 Score: 0.5867298617682187


The best i could get at this point was a 58% accuracy...let me try other models

#### SVC

In [23]:
svc = OneVsRestClassifier(SVC())

svc.fit(X_train,y_train)

svc_model = svc.predict(X_test)

In [26]:
def gridsearch(param_grid,model):
    scorer = make_scorer(f1_score, average='micro')

    #set up and run gridsearchcv
    grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring=scorer,
                           cv=5,
                           n_jobs=-1)  # use all cores

    grid_search.fit(X_train, y_train)
    
    return grid_search.best_score_,grid_search.best_params_
    

In [None]:
param_grid = {
    'estimator__C': [1.0,5.0],
 'estimator__class_weight': [None,'balanced'],
}

model = svc
svc_f1 = gridsearch(param_grid,model)
svc_f1

(0.556786533217986, {'estimator__C': 5.0, 'estimator__class_weight': None})

### MultinomialNB

In [None]:
mn = OneVsRestClassifier(MultinomialNB())
mn.fit(X_train, y_train)
mn_pred = mn.predict(X_test)

In [None]:
print(f1_score(y_test, mn_pred, average='micro'))
print(f1_score(y_test, mn_pred, average='macro'))
print(f1_score(y_test, mn_pred, average='samples',zero_division=0))

0.49530036779730285
0.4033025354355398
0.39693632712500637


In [28]:
lr_svc = OneVsRestClassifier(LinearSVC())

lr_svc.fit(X_train,y_train)
lr_svc_pred = lr_svc.predict(X_test)

In [29]:
print(f1_score(y_test, lr_svc_pred, average='micro'))
print(f1_score(y_test, lr_svc_pred, average='macro'))
print(f1_score(y_test, lr_svc_pred, average='samples',zero_division=0))

0.5910727141828654
0.5513374983932419
0.4949492244303565


In [27]:
param_grid = {
    'estimator__C': [2,5.0],
    'estimator__max_iter': [250,500],
 'estimator__class_weight': [None,'balanced'],
 'estimator__intercept_scaling': [1]
}

result = gridsearch(param_grid,model)
result

(0.5865739623741216,
 {'estimator__C': 5.0,
  'estimator__class_weight': 'balanced',
  'estimator__intercept_scaling': 1,
  'estimator__max_iter': 250})

# 7. Summary

I started this project with a blurry understanding of NLP, and it turned out to be a great way to reacquaint myself with the core concepts.

### 🔍 What I Learned:
1. **Handling Encoding Errors**:  
   My dataset had several encoding issues. I initially tried fixing it by reading the file using different encoding types, but that didn’t work. I eventually had to create a dictionary to manually map and correct corrupted characters. I believe the dataset was previously opened and saved using the wrong encoding, which caused those characters to lose their original meaning.

2. **CountVectorizer vs TfidfVectorizer**:  
   I learned the importance of vectorization when working with text features. I explored both CountVectorizer and TfidfVectorizer, and saw how they affect model performance and feature representation.

3. **Choosing the Right Models**:  
   With over 5,000 features, my dataset was highly dimensional. I discovered that tree-based models (like Random Forest or Decision Trees) struggle with such data because they take a long time to train. I also got introduced to the Multinomial Naive Bayes model, which is more suited for text-based classification tasks.

4. **Improving Model Performance**:  
   I experimented with hyperparameter tuning and saw how some parameters can significantly affect performance. For example, setting `class_weight='balanced'` in `LogisticRegression()` gave better results for my imbalanced labels.

### ⚠️ What Needs Improvement:
- **Model Performance**:  
  My current F1 score is around **60%**, which is decent for a first attempt but leaves room for improvement. I plan to revisit the data later with fresh ideas, possibly exploring different feature engineering techniques or alternative model architectures to boost performance.
