In [95]:
# imports
import os
from pathlib import Path

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

print("Libraries imported!")

Libraries imported!


In [96]:
# data loading
# The data directory can be overridden with the TASTE_KARACHI_DATA_DIR environment
# variable so the notebook is not tied to a single machine. The original location
# is kept as the default, so behaviour is unchanged when the variable is unset.
DATA_DIR = Path(os.environ.get(
    'TASTE_KARACHI_DATA_DIR',
    '/Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/data',
))
df = pd.read_csv(DATA_DIR / 'restaurants.csv')
print("Data loaded!")

Data loaded!


In [97]:
# Dimensions of the freshly loaded frame as (rows, columns)
df.shape

(834, 52)

In [98]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,name,area,rating,user_rating_count,price_level,phone,international_phone,website,google_maps_link,address,...,parking_free_street,parking_paid_street,accepts_credit_cards,accepts_debit_cards,accepts_cash_only,accepts_nfc,wheelchair_accessible,place_id,extracted_date,extracted_time
0,Wholesome Seafood,Keamari,5.0,258,,0311 1199912,+92 311 1199912,,https://maps.google.com/?cid=75646902353004851...,"Office # 3, 2nd Floor, Building A5/A, Karachi ...",...,,,,,,,,,2025-10-27,10:52:43
1,Napoli Pizza,Bahadurabad,4.9,191,,0304 8222949,+92 304 8222949,https://napolipizzapk.com/,https://maps.google.com/?cid=55913826230992501...,"Shop No 2, Indigo Business Center, 104 Alamgir...",...,True,True,False,True,False,True,False,,2025-10-27,10:52:44
2,Cafe Marina,DHA,4.9,185,,0328 2069011,+92 328 2069011,,https://maps.google.com/?cid=92076325532893502...,"Building 10 c, street 34, DHA Phase 5 Tauheed ...",...,True,,,,,,,,2025-10-27,10:52:45
3,The Burgery's,Gulshan-e-Iqbal,4.9,107,,,,,https://maps.google.com/?cid=12178323240068295...,"W3FR+FMX, Block 5 Gulshan-e-Iqbal, Karachi, Pa...",...,,,,,,,,,2025-10-27,10:52:45
4,Al-Madina Shinwari Hotel,Kharadar,4.9,74,,0315 2555527,+92 315 2555527,,https://maps.google.com/?cid=12750069394653371...,"Al-Madina Shinwari Hotel, Tower, Machhi Miani ...",...,True,,,,False,True,True,,2025-10-27,10:52:46


In [99]:
# A row without a target value cannot be used, so discard any such rows
TARGET = 'rating'
df = df.dropna(axis='index', how='any', subset=[TARGET])
df.shape

(834, 52)

In [100]:
# List every distinct area label, in the order value_counts() returns them
area_counts_index = df['area'].value_counts().index
list(area_counts_index)

['Central Karachi',
 'Clifton',
 'Defence',
 'Nazimabad',
 'DHA',
 'Saddar',
 'Gulshan-e-Iqbal',
 'Federal B Area',
 'Malir',
 'Korangi',
 'Bahadurabad',
 'Johar',
 'North Karachi',
 'Scheme 33',
 'Orangi Town',
 'PECHS',
 'Tipu Sultan',
 'Liaquatabad',
 'Burns Road',
 'Landhi',
 'Sindhi Muslim',
 'Model Colony',
 'Gulshan',
 'Tariq Road',
 'Keamari',
 'Surjani Town',
 'Cantonment',
 'Shahrah-e-Faisal',
 'Civil Lines',
 'Kharadar',
 'Sharfabad',
 'MACHS',
 'Phase',
 'Garden East',
 'Cantt',
 'Zamzama',
 'Soldier Bazaar',
 'Khayaban',
 '1 4 Naval Colony',
 'Block 1 Gulistan-e-Johar',
 '5 بلاک Block 5 کلفٹن، کراچی، کراچی',
 'Karachi Administration Employees Housing Society Block 5 KAECHS',
 'Mehmoodabad Karachi',
 'opposite Muhammdi house',
 'Central Jacob Lines Block 1 Khudadad Colony',
 'Sector 15-A/1 Sector 15 A 1 Buffer Zone',
 'Seari Quarters',
 'A27/1 khudadad colony society office single',
 'Jamshed Quarters Karachi',
 'Block 13 Gulistan-e-Johar',
 'Rohail Khand Society',
 'Pak Co

In [101]:
# Lookup table for normalising the raw `area` strings: each key is a value as it
# appears in the data, each value is the canonical label it is collapsed to.
# Labels without an entry here are left as they are by the statement that applies
# this mapping. Several entries map a label to itself (e.g. 'DHA': 'DHA'); they
# have no effect on the result and only document the canonical names.
area_mapping = {
    # --- DHA / Defence ---
    'Defence': 'DHA',
    'DHA': 'DHA',
    'Bukhari Commercial': 'DHA',
    '57-C 10th Badar Commercial Street، ڈی ایچ اے فیز 5 Badar Commercial Area ڈیفنس ۵ ڈیفنس ہاؤسنگ اتھارٹی، کراچی': 'DHA',

    # --- Gulshan-e-Iqbal ---
    'Gulshan': 'Gulshan-e-Iqbal',
    'Gulshan-e-Iqbal': 'Gulshan-e-Iqbal',

    # --- Gulistan-e-Johar ---
    'Johar': 'Gulistan-e-Johar',
    'Block 1 Gulistan-e-Johar': 'Gulistan-e-Johar',
    'Block 13 Gulistan-e-Johar': 'Gulistan-e-Johar',
    'Block 1 Block 3 Gulistan-e-Johar': 'Gulistan-e-Johar',
    'Block 16 Gulistan-e-Johar': 'Gulistan-e-Johar',

    # --- Clifton ---
    'Clifton': 'Clifton',
    '5 بلاک Block 5 کلفٹن، کراچی، کراچی': 'Clifton',
    'Block 4 کلفٹن، کراچی، کراچی': 'Clifton',
    'Delhi Colony': 'Clifton', # Delhi Colony is adjacent to Clifton

    # --- Cantonment / Cantt ---
    'Cantonment': 'Cantonment',
    'Cantt': 'Cantonment',
    'Shapes Compound، 139 McNeil Rd': 'Cantonment',

    # --- Jamshed Quarters ---
    # NOTE: only the first three entries below map to 'Jamshed Quarters'; the
    # Jacob Lines / Khudadad Colony entries in this group get their own labels.
    'Jamshed Quarters Karachi': 'Jamshed Quarters',
    'Jamshed Quarters': 'Jamshed Quarters',
    'Jamshed Quarters Amil Colony': 'Jamshed Quarters',
    'Central Jacob Lines Block 1 Khudadad Colony': 'Central Jacob Lines', # Contains two, defaulting to first
    'Central Jacob Lines': 'Central Jacob Lines',
    'A27/1 khudadad colony society office single': 'Khudadad Colony',

    # --- North Nazimabad / Nazimabad ---
    'North Nazimabad': 'North Nazimabad',
    'Nazimabad': 'Nazimabad', # Kept separate as they are distinct areas
    'Block E Hyderi Karachi': 'North Nazimabad', # Hyderi is in North Nazimabad
    'Sector 15-A/1 Sector 15 A 1 Buffer Zone': 'Buffer Zone',

    # --- KDA Scheme 1 ---
    'KDA Scheme #1 KDA Scheme 1': 'KDA Scheme 1',

    # --- Kharadar ---
    'Kharadar': 'Kharadar',
    'کھارادر Ghulam Hussain Kasim Quarters': 'Kharadar',

    # --- Specific Addresses simplified to Area ---
    'Block 8 Frere Town': 'Frere Town',
    'Block 14 Block 15 Gulberg Town': 'Gulberg Town',
    'Karachi Administration Employees Housing Society Block 5 KAECHS': 'KAECHS',
    'Mehmoodabad Karachi': 'Mehmoodabad',
    'Empress Market Karachi': 'Saddar', # Inside Saddar
    'Near Aurangzeb park Urdu bazaar': 'Urdu Bazaar',
    'Delhi Mercantile Society Block 3': 'Delhi Mercantile Society',
    'Pak Colony Wilayatabad': 'Pak Colony',
    '1 4 Naval Colony': 'Naval Colony',
    'Chakiwara Rd': 'Chakiwara',
    'I.I Chundrigar Rd': 'I.I. Chundrigar Road',

    # --- Ambiguous / Generic / Not an Area ---
    # These strings do not identify a neighbourhood, so they share the 'Other' bucket.
    'opposite Muhammdi house': 'Other',
    'Phase': 'Other', # Too generic without a number
    'Khayaban': 'Other', # Too generic without a name
    'Block A Roadoad، Karachi': 'Other',
    'Block 7': 'Other', # Too generic without a parent area
    'Block 12 BRACH 1، Karachi': 'Other',
}

# Canonicalise each area; labels with no entry in area_mapping pass through unchanged
df['area'] = df['area'].map(lambda raw_area: area_mapping.get(raw_area, raw_area))

In [102]:
# Remove the row limit on rendered output (None = unlimited) so the full
# per-area tally is printed; the option stays in effect for later cells too.
pd.options.display.max_rows = None
area_counts = df['area'].value_counts()
print(area_counts)

area
DHA                         112
Central Karachi              85
Clifton                      74
Nazimabad                    51
Gulshan-e-Iqbal              46
Saddar                       38
Federal B Area               30
Malir                        29
Gulistan-e-Johar             28
Korangi                      28
Bahadurabad                  27
North Karachi                22
Scheme 33                    22
PECHS                        18
Orangi Town                  18
Liaquatabad                  17
Tipu Sultan                  17
Burns Road                   16
Landhi                       16
Sindhi Muslim                14
Model Colony                 13
Tariq Road                   12
Cantonment                   11
Keamari                      10
Surjani Town                  9
Other                         7
Shahrah-e-Faisal              6
Kharadar                      5
Civil Lines                   5
Jamshed Quarters              3
Sharfabad                     3
Nor

In [103]:
# Exclude any row whose area is 'Lahore'
is_lahore = df['area'] == 'Lahore'
df = df[~is_lahore]
df.shape

(833, 52)

In [104]:
# Display the name column in full
df.loc[:, 'name']

0                                      Wholesome Seafood
1                                           Napoli Pizza
2                                            Cafe Marina
3                                          The Burgery's
4                               Al-Madina Shinwari Hotel
5                                        Mr. Beef Burgrz
6                                               DUMPLISH
7                                      Thai Chin Karachi
8         Real Spice - Tariq Road ریئل اسپائس - طارق روڈ
9                                The Big Pizza - FB Area
10                                 The Big Pizza - Johar
11                        Caesar's Pizza North Nazimabad
12                                  Beef Smash - Gulshan
13                                            Beef Smash
14                         The Big Pizza - North Karachi
15                              CNS - Chinese Restaurant
16                                Pizza Bake orangi Town
17                             

In [105]:
# Names to remove from the dataset.
# NOTE(review): these look like hotels, food streets, parks and similar venues
# rather than individual restaurants -- confirm the selection criteria.
names_to_drop = [
    'Avari Towers Karachi',
    'Seafarer\'s Mess Pakistan Marine Academy Karachi',
    'Mangria Hotel',
    'Kharadar food street',
    'Port Grand - Karachi',
    'Burns Road Food Street',
    'Hussainabad Food Street',
    'The Basement Gaming place',
    'TROPIK',
    'THE STOVE CLUB',
    'Karachi Halwa Lea Market',
    'Boat Basin',
    'Beach View Park',
    'The Vegas Studio',
    'Creek Walk Food Street',
    'CSD Food Court',
    'Bohra Food Street',
    'Karachi Highway Nights',
    'DHA Food Street'
]

# Rows whose name is on the list, plus their index labels (kept for inspection).
drop_mask = df['name'].isin(names_to_drop)
indices_to_drop = df.index[drop_mask]

# Filter and rebind instead of DataFrame.drop(..., inplace=True): at this point df
# is itself a boolean slice of an earlier frame, and mutating a slice in place is
# what SettingWithCopyWarning guards against. .copy() gives df its own data.
df = df.loc[~drop_mask].copy()
df.shape

(814, 52)

In [106]:
# Columns removed as irrelevant before any further cleaning
irrelevant_columns = [
    'user_rating_count',
    'phone',
    'international_phone',
    'website',
    'address',
    'short_address',
    'all_types',
    'business_status',
    'place_id',
    'extracted_date',
    'extracted_time',
]
df = df.drop(irrelevant_columns, axis='columns')
df.shape

(814, 41)

In [107]:
# Tally the nulls in every column and print the result under a heading
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                 376
google_maps_link              0
latitude                      0
longitude                     0
open_now                     22
hours                        22
primary_type                  1
category                      1
dine_in                      36
takeout                      24
delivery                    267
curbside_pickup             577
reservable                  301
serves_breakfast            472
serves_brunch               531
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
serves_vegetarian_food      617
outdoor_seating             247
live_music                  213
good_for_children           159
menu_for_children           341
good_for_groups              78
good_for_watching_sports    491
allows_dogs                 814
restroom 

In [108]:
# Drop columns that are mostly empty, i.e. with more than `threshold` missing values
threshold = 600

# Recount the nulls from the current frame instead of reusing `missing_values`
# from another cell: the result then never depends on which cell ran last, and
# re-running this cell is harmless (a second run finds nothing left to drop
# rather than failing on columns that are already gone).
null_counts = df.isnull().sum()
cols_to_drop = null_counts[null_counts > threshold].index.tolist()
print("Columns to drop:", cols_to_drop)

df = df.drop(columns=cols_to_drop)
print("\nNew dataframe shape:", df.shape)

Columns to drop: ['serves_vegetarian_food', 'allows_dogs', 'parking_paid_lot', 'parking_valet', 'parking_paid_street']

New dataframe shape: (814, 36)


In [109]:
# Recount the nulls per column after the sparse columns were removed
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                 376
google_maps_link              0
latitude                      0
longitude                     0
open_now                     22
hours                        22
primary_type                  1
category                      1
dine_in                      36
takeout                      24
delivery                    267
curbside_pickup             577
reservable                  301
serves_breakfast            472
serves_brunch               531
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
outdoor_seating             247
live_music                  213
good_for_children           159
menu_for_children           341
good_for_groups              78
good_for_watching_sports    491
restroom                    270
parking_free_lot            244
parking_f

In [110]:
# drop open_now, curbside_pickup, serves_brunch, menu_for_children,
# accepts_credit_cards and accepts_nfc
extra_columns_to_drop = [
    'open_now',
    'curbside_pickup',
    'serves_brunch',
    'menu_for_children',
    'accepts_credit_cards',
    'accepts_nfc',
]
df = df.drop(columns=extra_columns_to_drop)
df.shape

(814, 30)

In [111]:
# Names of the places whose dine_in flag is missing
df.loc[df['dine_in'].isna(), 'name']

0                         Wholesome Seafood
32        Auntie Munaver's Dessert & Savory
40     Mards Food's The Barbecue Specialist
41                 Jameel Sweets جمیل سوئٹس
71                         NEW DISCO BAKERY
89                                    Ktoré
95        Helados Ice Cream اِیلادو آئسکریم
135                     Tea In Binoria Town
139         Delizia Bakery Khayaban-e-Roomi
147                        Shabbir Icecream
197        Dilpasand Sweets, Bakers & Nimco
198                  Ambala Sweets & Bakers
207                   United King Sharfabad
221                          Sialkot Bakers
241                       Delhi Rabri House
310                       Bin Qasim kitchen
340          United Bakery۔ یونائیٹیڈ بیکری
353          Karachi ice-cream Hussain abad
384                    Panwaari Bahadurabad
407                       New Crispo Bakery
419                        Jeddah Ice Cream
437                          Karachi Sweets
439                          Bas

In [112]:
# Treat a missing dine_in flag as False. The explicit cast makes the resulting
# dtype bool regardless of whether fillna() downcasts the object column on its
# own, matching the `.fillna(False).astype(bool)` pattern used for the other flags.
df['dine_in'] = df['dine_in'].fillna(False).astype(bool)

In [113]:
# Null count per column after filling dine_in
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                 376
google_maps_link              0
latitude                      0
longitude                     0
hours                        22
primary_type                  1
category                      1
dine_in                       0
takeout                      24
delivery                    267
reservable                  301
serves_breakfast            472
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
outdoor_seating             247
live_music                  213
good_for_children           159
good_for_groups              78
good_for_watching_sports    491
restroom                    270
parking_free_lot            244
parking_free_street          94
accepts_debit_cards         418
accepts_cash_only            97
wheelchair_accessible       478
dtype: in

In [114]:
# Pull out and print the rows that have no 'hours' value
missing_hours = df.loc[df['hours'].isna()]
print("Rows with missing 'hours':", missing_hours, sep="\n")

Rows with missing 'hours':
                                                  name             area  \
3                                        The Burgery's  Gulshan-e-Iqbal   
39                             Beit Al Halab - Defence              DHA   
88                                          K Town Co.              DHA   
142      Dehli Darbar Kabab House دہلی دربار کباب ہاؤس  Gulshan-e-Iqbal   
176   House 202 area 2c landhi 3v1/2 July Nagri Haleem           Landhi   
191  Bholo Restaurant & BBQ بھولو ریستوران و بی بی کیو          Korangi   
203           Raees Ahmed Arain Standard Biryani House           Landhi   
274          New Delhi Islam Restaurant & Nihari House   Federal B Area   
441                           Madina Biryani and Pulao      Orangi Town   
477  New Qalandari Biryani & Pakwan Centre نیو قَلن...     Model Colony   
486                             Dua Restaurant Clifton          Clifton   
510                                Mr Cone food center     Surjani Town  

In [115]:
# Impute missing 'hours' with the most frequent schedule string (the mode, not a median)
mode_hours = df['hours'].mode().iloc[0]
print("Mode of 'hours':", mode_hours)
df['hours'] = df['hours'].fillna(value=mode_hours)

Mode of 'hours': Monday: Open 24 hours | Tuesday: Open 24 hours | Wednesday: Open 24 hours | Thursday: Open 24 hours | Friday: Open 24 hours | Saturday: Open 24 hours | Sunday: Open 24 hours


In [116]:
# Null count per column after imputing hours
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                 376
google_maps_link              0
latitude                      0
longitude                     0
hours                         0
primary_type                  1
category                      1
dine_in                       0
takeout                      24
delivery                    267
reservable                  301
serves_breakfast            472
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
outdoor_seating             247
live_music                  213
good_for_children           159
good_for_groups              78
good_for_watching_sports    491
restroom                    270
parking_free_lot            244
parking_free_street          94
accepts_debit_cards         418
accepts_cash_only            97
wheelchair_accessible       478
dtype: in

In [117]:
# Pull out and print the rows that have no 'takeout' value
missing_takeout = df.loc[df['takeout'].isna()]
print("Rows with missing 'takeout':", missing_takeout, sep="\n")

Rows with missing 'takeout':
                                                  name              area  \
0                                    Wholesome Seafood           Keamari   
6                                             DUMPLISH     North Karachi   
62                               Chef's Table Pakistan               DHA   
89                                               Ktoré           Clifton   
102                                     Cafe Hill View   Central Karachi   
117                             Koyla Chai۔ کوئلہ چائے           Clifton   
152                  The Region banquet and restaurant           Clifton   
187                           Agha Chinese & Fast Food   Central Karachi   
273  Kundan kabab and nihari house - best kabab in ...   Gulshan-e-Iqbal   
310                                  Bin Qasim kitchen        Tariq Road   
312                          Super TESTY Pakwan Centre      Surjani Town   
419                                   Jeddah Ice Cream     

In [118]:
# Treat a missing takeout flag as False; cast explicitly so the column ends up
# bool without relying on fillna() downcasting the object column implicitly.
df['takeout'] = df['takeout'].fillna(False).astype(bool)

In [119]:
# Pull out and print the rows that have no 'delivery' value
missing_delivery = df.loc[df['delivery'].isna()]
print("Rows with missing 'delivery':", missing_delivery, sep="\n")

Rows with missing 'delivery':
                                                  name                 area  \
3                                        The Burgery's      Gulshan-e-Iqbal   
19             Broadway Pizza - Dolmen Mall Tariq Road           Tariq Road   
21                                  LEVEL2 by Espresso          Tipu Sultan   
24                                              Mizaaj        Sindhi Muslim   
30                             Broadway Pizza - Garden          Garden East   
43                                          Vox Coffee         KDA Scheme 1   
44                 California Pizza - M.A. Jinnah Road           Gazdarabad   
45                                     동분식 DONG BUNSIK                  DHA   
50                                     Quick Bite Cafe       Federal B Area   
51                        Abdul Wahid Gola Kabab House        North Karachi   
52                            Bagh e Jinnah Restaurant         Model Colony   
55                    

In [120]:
# Treat a missing delivery flag as True (unlike the other flags, which default
# to False); cast explicitly so the column ends up bool without relying on
# fillna() downcasting the object column implicitly.
df['delivery'] = df['delivery'].fillna(True).astype(bool)

In [121]:
# Treat a missing reservable flag as False; cast explicitly so the column ends
# up bool without relying on fillna() downcasting the object column implicitly.
df['reservable'] = df['reservable'].fillna(False).astype(bool)

In [122]:
# Per-column count of missing values
df.isna().sum()

name                          0
area                          0
rating                        0
price_level                 376
google_maps_link              0
latitude                      0
longitude                     0
hours                         0
primary_type                  1
category                      1
dine_in                       0
takeout                       0
delivery                      0
reservable                    0
serves_breakfast            472
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
outdoor_seating             247
live_music                  213
good_for_children           159
good_for_groups              78
good_for_watching_sports    491
restroom                    270
parking_free_lot            244
parking_free_street          94
accepts_debit_cards         418
accepts_cash_only            97
wheelchair_accessible       478
dtype: int64

In [123]:
# Names of the places whose serves_breakfast flag is missing, as a plain list
df.loc[df['serves_breakfast'].isna(), 'name'].tolist()

['Wholesome Seafood',
 'Cafe Marina',
 "The Burgery's",
 'Mr. Beef Burgrz',
 'DUMPLISH',
 'Real Spice - Tariq Road ریئل اسپائس - طارق روڈ',
 'The Big Pizza - FB Area',
 'The Big Pizza - Johar',
 "Caesar's Pizza North Nazimabad",
 'Beef Smash - Gulshan',
 'Beef Smash',
 'The Big Pizza - North Karachi',
 'CNS - Chinese Restaurant',
 'Pizza Bake orangi Town',
 'Beit Al Halab - BTK',
 'Pizza Bistro- Garden',
 'Broadway Pizza - Dolmen Mall Tariq Road',
 'Pizza Bistro - Gulshan-e-Iqbal',
 'LEVEL2 by Espresso',
 'wedrink bahadurabad',
 'Mizaaj',
 'The Big Pizza',
 'The Carnivore Karachi',
 'Broadway Pizza - I.I Chundrigar',
 'Arabian Grill Night',
 "Auntie Munaver's Dessert & Savory",
 'Beef Smash - Johar',
 'Boss Cafe',
 'Sakura Japanese Restaurant',
 'Pizza Creations',
 'dbrewd Coffee & More',
 'Beit Al Halab - Defence',
 "Mards Food's The Barbecue Specialist",
 'Jameel Sweets جمیل سوئٹس',
 'Lala Biryani and Catering',
 'Vox Coffee',
 'California Pizza - M.A. Jinnah Road',
 '동분식 DONG BUNSIK

In [124]:
# Treat a missing serves_breakfast flag as False; cast explicitly so the column
# ends up bool without relying on fillna() downcasting the object column implicitly.
df['serves_breakfast'] = df['serves_breakfast'].fillna(False).astype(bool)

In [125]:
# Impute missing price_level with the most common level (the mode)
mode_price_level = df['price_level'].mode().iloc[0]
print("Mode of 'price_level':", mode_price_level)
df['price_level'] = df['price_level'].fillna(value=mode_price_level)

Mode of 'price_level': PRICE_LEVEL_MODERATE


In [126]:
# Null count per column after imputing price_level
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                   0
google_maps_link              0
latitude                      0
longitude                     0
hours                         0
primary_type                  1
category                      1
dine_in                       0
takeout                       0
delivery                      0
reservable                    0
serves_breakfast              0
serves_lunch                199
serves_dinner               103
serves_coffee               160
serves_dessert              237
outdoor_seating             247
live_music                  213
good_for_children           159
good_for_groups              78
good_for_watching_sports    491
restroom                    270
parking_free_lot            244
parking_free_street          94
accepts_debit_cards         418
accepts_cash_only            97
wheelchair_accessible       478
dtype: in

In [127]:
# Flag columns in which a missing value is treated as False.
false_when_missing = [
    'serves_lunch',
    'serves_dinner',
    'serves_coffee',
    'serves_dessert',
    'outdoor_seating',
    'live_music',
    'good_for_children',
    'good_for_groups',
    'good_for_watching_sports',
    'restroom',
    'parking_free_lot',
    'parking_free_street',
]

# One loop replaces twelve copy-pasted statements: fill the nulls with False and
# store each column as a real bool column.
for flag_column in false_when_missing:
    df[flag_column] = df[flag_column].fillna(False).astype(bool)

In [128]:
# Null count per column after filling the flag columns
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
name                          0
area                          0
rating                        0
price_level                   0
google_maps_link              0
latitude                      0
longitude                     0
hours                         0
primary_type                  1
category                      1
dine_in                       0
takeout                       0
delivery                      0
reservable                    0
serves_breakfast              0
serves_lunch                  0
serves_dinner                 0
serves_coffee                 0
serves_dessert                0
outdoor_seating               0
live_music                    0
good_for_children             0
good_for_groups               0
good_for_watching_sports      0
restroom                      0
parking_free_lot              0
parking_free_street           0
accepts_debit_cards         418
accepts_cash_only            97
wheelchair_accessible       478
dtype: in

In [129]:
# A missing wheelchair_accessible flag is taken as False; the column is stored as bool
wheelchair_filled = df['wheelchair_accessible'].fillna(False)
df['wheelchair_accessible'] = wheelchair_filled.astype(bool)

In [130]:
# A missing accepts_cash_only flag is taken as False; the column is stored as bool
cash_only_filled = df['accepts_cash_only'].fillna(False)
df['accepts_cash_only'] = cash_only_filled.astype(bool)

In [131]:
# Infer missing accepts_debit_cards from accepts_cash_only (this is NOT a plain
# fill with False): a cash-only place gets False, every other place gets True.
# A single masked assignment replaces the row-by-row iterrows() loop.
debit_unknown = df['accepts_debit_cards'].isnull()
is_cash_only = df['accepts_cash_only'] == True
df.loc[debit_unknown, 'accepts_debit_cards'] = ~is_cash_only[debit_unknown]

In [132]:
# Null count per column after the payment flags were filled
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
name                        0
area                        0
rating                      0
price_level                 0
google_maps_link            0
latitude                    0
longitude                   0
hours                       0
primary_type                1
category                    1
dine_in                     0
takeout                     0
delivery                    0
reservable                  0
serves_breakfast            0
serves_lunch                0
serves_dinner               0
serves_coffee               0
serves_dessert              0
outdoor_seating             0
live_music                  0
good_for_children           0
good_for_groups             0
good_for_watching_sports    0
restroom                    0
parking_free_lot            0
parking_free_street         0
accepts_debit_cards         0
accepts_cash_only           0
wheelchair_accessible       0
dtype: int64


In [133]:
# Remove the primary_type column
df = df.drop('primary_type', axis='columns')
df.shape

(814, 29)

In [134]:
# Frequency of each raw category label, before any labels are merged
df['category'].value_counts()

category
Restaurant                   514
Fast Food Restaurant          44
Barbecue Restaurant           28
Chinese Restaurant            25
Pizza Restaurant              24
Cafe                          24
Bakery                        23
Indian Restaurant             16
Ice Cream Shop                13
Coffee Shop                   13
Buffet Restaurant             10
Juice Shop                     7
Italian Restaurant             7
Seafood Restaurant             6
Korean Restaurant              5
Breakfast Restaurant           5
Dessert Shop                   5
Tea House                      5
Thai Restaurant                4
Turkish Restaurant             3
Fine Dining Restaurant         3
American Restaurant            3
Asian Restaurant               3
Middle Eastern Restaurant      3
Food Store                     2
Japanese Restaurant            2
Afghan restaurant              2
Mediterranean Restaurant       2
Food Court                     1
Wholesaler                     1
M

In [135]:
# Consolidation table: near-duplicate category labels -> one canonical label
category_mapping = {
    **dict.fromkeys(['Coffee Shop', 'Tea House'], 'Cafe'),
    **dict.fromkeys(['Dessert Restaurant', 'Ice Cream Shop', 'Chocolate Shop'], 'Dessert Shop'),
    'Sushi Restaurant': 'Japanese Restaurant',
    'Food': 'Restaurant',
    'Afghan restaurant': 'Afghan Restaurant',
}

# Relabel the listed categories; any label (or null) without an entry in the
# table is passed through as it is.
df['category'] = df['category'].map(lambda label: category_mapping.get(label, label))

In [136]:
# Category frequencies again, now that category_mapping has been applied
df['category'].value_counts()

category
Restaurant                   515
Fast Food Restaurant          44
Cafe                          42
Barbecue Restaurant           28
Chinese Restaurant            25
Pizza Restaurant              24
Bakery                        23
Dessert Shop                  20
Indian Restaurant             16
Buffet Restaurant             10
Italian Restaurant             7
Juice Shop                     7
Seafood Restaurant             6
Korean Restaurant              5
Breakfast Restaurant           5
Thai Restaurant                4
Asian Restaurant               3
Japanese Restaurant            3
Middle Eastern Restaurant      3
American Restaurant            3
Turkish Restaurant             3
Fine Dining Restaurant         3
Food Store                     2
Mediterranean Restaurant       2
Afghan Restaurant              2
Event Venue                    1
Wholesaler                     1
Mexican Restaurant             1
Market                         1
Food Court                     1
S

In [137]:
# Categories whose rows are removed from the dataset entirely
categories_to_drop = ['Event Venue', 'Wholesaler', 'Market', 'Food Court']

# Keep every row whose category is not on the list; .copy() detaches the result
# from the previous frame before df is rebound to it.
keep_rows = ~df['category'].isin(categories_to_drop)
df = df.loc[keep_rows].copy()
df.shape

(810, 29)

In [138]:
# Discard the rows that have no category at all
has_category = df['category'].notna()
df = df[has_category]
df.shape

(809, 29)

In [139]:
# Final null count per column
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
name                        0
area                        0
rating                      0
price_level                 0
google_maps_link            0
latitude                    0
longitude                   0
hours                       0
category                    0
dine_in                     0
takeout                     0
delivery                    0
reservable                  0
serves_breakfast            0
serves_lunch                0
serves_dinner               0
serves_coffee               0
serves_dessert              0
outdoor_seating             0
live_music                  0
good_for_children           0
good_for_groups             0
good_for_watching_sports    0
restroom                    0
parking_free_lot            0
parking_free_street         0
accepts_debit_cards         0
accepts_cash_only           0
wheelchair_accessible       0
dtype: int64


In [140]:
# Write only the name and Google Maps link columns to names_only.csv
# (no index column in the file).
df.to_csv(
    '/Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/data/names_only.csv',
    columns=['name', 'google_maps_link'],
    index=False,
)

In [141]:
# Remove the name and Google Maps link columns from the working frame.
df = df.drop(['name', 'google_maps_link'], axis='columns')
df.shape

(809, 27)

In [142]:
# Show the dtype of every remaining column.
df.dtypes

area                         object
rating                      float64
price_level                  object
latitude                    float64
longitude                   float64
hours                        object
category                     object
dine_in                        bool
takeout                        bool
delivery                       bool
reservable                     bool
serves_breakfast               bool
serves_lunch                   bool
serves_dinner                  bool
serves_coffee                  bool
serves_dessert                 bool
outdoor_seating                bool
live_music                     bool
good_for_children              bool
good_for_groups                bool
good_for_watching_sports       bool
restroom                       bool
parking_free_lot               bool
parking_free_street            bool
accepts_debit_cards          object
accepts_cash_only              bool
wheelchair_accessible          bool
dtype: object

In [143]:
# Cast the object-typed accepts_debit_cards column to bool; all other columns
# keep their current dtype.
df = df.astype({'accepts_debit_cards': bool})

In [144]:
# Re-check the column dtypes after casting accepts_debit_cards to bool.
df.dtypes

area                         object
rating                      float64
price_level                  object
latitude                    float64
longitude                   float64
hours                        object
category                     object
dine_in                        bool
takeout                        bool
delivery                       bool
reservable                     bool
serves_breakfast               bool
serves_lunch                   bool
serves_dinner                  bool
serves_coffee                  bool
serves_dessert                 bool
outdoor_seating                bool
live_music                     bool
good_for_children              bool
good_for_groups                bool
good_for_watching_sports       bool
restroom                       bool
parking_free_lot               bool
parking_free_street            bool
accepts_debit_cards            bool
accepts_cash_only              bool
wheelchair_accessible          bool
dtype: object

In [145]:
# Frequency of each distinct opening-hours string in the 'hours' column
# (value_counts lists the most common value first).
df['hours'].value_counts()

hours
Monday: Open 24 hours | Tuesday: Open 24 hours | Wednesday: Open 24 hours | Thursday: Open 24 hours | Friday: Open 24 hours | Saturday: Open 24 hours | Sunday: Open 24 hours                                                                                                                                                                                   80
Monday: 12:00 PM – 12:00 AM | Tuesday: 12:00 PM – 12:00 AM | Wednesday: 12:00 PM – 12:00 AM | Thursday: 12:00 PM – 12:00 AM | Friday: 12:00 PM – 12:00 AM | Saturday: 12:00 PM – 12:00 AM | Sunday: 12:00 PM – 12:00 AM                                                                                                                                         22
Monday: 12:00 PM – 1:00 AM | Tuesday: 12:00 PM – 1:00 AM | Wednesday: 12:00 PM – 1:00 AM | Thursday: 12:00 PM – 1:00 AM | Friday: 12:00 PM – 1:00 AM | Saturday: 12:00 PM – 1:00 AM | Sunday: 12:00 PM – 1:00 AM                                                                            

In [146]:
# --- Feature Engineering from 'hours' ---
# 'hours' holds one string per restaurant made of "Day: <hours>" entries
# separated by "|". Three boolean flags are derived from it; a missing 'hours'
# value yields False for every flag.

# True only when EVERY listed day reads "Open 24 hours" (case-insensitive).
# A plain substring test would also flag a place that is open around the clock
# on a single day, which is not what a 24/7 feature should mean. The number of
# listed days is the number of "|" separators plus one.
df['is_open_24_7'] = (
    df['hours'].str.count(r'(?i)open 24 hours')
    == df['hours'].str.count(r'\|') + 1
)

# True when any day has a closing time between 1:00 AM and 5:59 AM.
# The pattern anchors on the en dash that precedes the closing time, uses \s*
# to tolerate any (or no) whitespace around it, and [1-5] for the hour.
df['open_after_midnight'] = df['hours'].str.contains(
    r'–\s*[1-5]:\d{2}\s*AM', case=False, na=False
)

# True when the word "Closed" (case-insensitive, literal match) appears for
# at least one day.
df['is_closed_any_day'] = df['hours'].str.contains(
    "Closed", case=False, regex=False, na=False
)

In [147]:
# The raw 'hours' text is no longer needed once the flags are derived from it.
df = df.drop('hours', axis='columns')
df.dtypes

area                         object
rating                      float64
price_level                  object
latitude                    float64
longitude                   float64
category                     object
dine_in                        bool
takeout                        bool
delivery                       bool
reservable                     bool
serves_breakfast               bool
serves_lunch                   bool
serves_dinner                  bool
serves_coffee                  bool
serves_dessert                 bool
outdoor_seating                bool
live_music                     bool
good_for_children              bool
good_for_groups                bool
good_for_watching_sports       bool
restroom                       bool
parking_free_lot               bool
parking_free_street            bool
accepts_debit_cards            bool
accepts_cash_only              bool
wheelchair_accessible          bool
is_open_24_7                   bool
open_after_midnight         

In [148]:
# Move the 'rating' column to the last position: take it out of the frame,
# then insert it back after the final remaining column.
rating = df.pop('rating')
df.insert(len(df.columns), 'rating', rating)
df.head()

Unnamed: 0,area,price_level,latitude,longitude,category,dine_in,takeout,delivery,reservable,serves_breakfast,...,restroom,parking_free_lot,parking_free_street,accepts_debit_cards,accepts_cash_only,wheelchair_accessible,is_open_24_7,open_after_midnight,is_closed_any_day,rating
1,Bahadurabad,PRICE_LEVEL_MODERATE,24.88079,67.069359,Pizza Restaurant,True,True,True,True,False,...,True,True,True,True,False,False,False,True,False,4.9
2,DHA,PRICE_LEVEL_MODERATE,24.807703,67.038819,Restaurant,True,True,True,True,False,...,True,False,True,True,False,False,False,False,False,4.9
3,Gulshan-e-Iqbal,PRICE_LEVEL_MODERATE,24.923728,67.091745,Fast Food Restaurant,True,True,True,False,False,...,False,True,False,True,False,False,True,False,False,4.9
4,Kharadar,PRICE_LEVEL_MODERATE,24.849613,66.995674,Barbecue Restaurant,True,True,True,True,True,...,True,True,True,True,False,True,False,True,False,4.9
5,Malir,PRICE_LEVEL_MODERATE,24.902723,67.183998,Fast Food Restaurant,True,True,True,False,False,...,False,False,False,True,False,False,False,True,False,4.9


In [149]:
# Recount missing entries per column and print the report.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
area                        0
price_level                 0
latitude                    0
longitude                   0
category                    0
dine_in                     0
takeout                     0
delivery                    0
reservable                  0
serves_breakfast            0
serves_lunch                0
serves_dinner               0
serves_coffee               0
serves_dessert              0
outdoor_seating             0
live_music                  0
good_for_children           0
good_for_groups             0
good_for_watching_sports    0
restroom                    0
parking_free_lot            0
parking_free_street         0
accepts_debit_cards         0
accepts_cash_only           0
wheelchair_accessible       0
is_open_24_7                0
open_after_midnight         0
is_closed_any_day           0
rating                      0
dtype: int64


In [150]:
# Split off 10% of the rows as a holdout test set (fixed seed for a repeatable
# split), then write each partition to its own CSV file without the index.
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.1)

for split_frame, csv_path in (
    (train_val_df, '/Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/data/train_set.csv'),
    (test_df, '/Users/salikali/VSCodeProjects/Uni/MLOps/Project/Taste-Karachi/data/holdout_test_set.csv'),
):
    split_frame.to_csv(csv_path, index=False)

print(
    "Train set and Holdout Test set saved to CSV!",
    "Holdout Test set saved to CSV!",
    sep="\n",
)

Train set and Holdout Test set saved to CSV!
Holdout Test set saved to CSV!
