In [1]:
# importing Libraries
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('/content/swiggy.csv')

In [3]:
data.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [4]:

# checking no_of_rows and columns
data.shape


(148541, 11)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [6]:
# checking duplicates rows in DataFrame
data.duplicated().sum()

np.int64(0)

In [7]:
data.isnull().sum()

Unnamed: 0,0
id,0
name,86
city,0
rating,86
rating_count,86
cost,131
cuisine,99
lic_no,229
link,0
address,86


In [8]:
data.fillna(0,inplace=True)

In [9]:
data.isnull().sum()

Unnamed: 0,0
id,0
name,0
city,0
rating,0
rating_count,0
cost,0
cuisine,0
lic_no,0
link,0
address,0


In [10]:
data.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [11]:
Swiggy_data = data.head(25000).copy()

In [12]:
Swiggy_data.shape

(25000, 11)

In [13]:
Swiggy_data.isnull().sum()

Unnamed: 0,0
id,0
name,0
city,0
rating,0
rating_count,0
cost,0
cuisine,0
lic_no,0
link,0
address,0


In [14]:
# checking all the unique values of cusines column
Swiggy_data['cuisine'].unique()


array(['Beverages,Pizzas', 'Sweets,Bakery', 'Beverages', ...,
       'Sweets,Italian', 'Indian,Italian-American', 'Pastas,North Indian'],
      dtype=object)

In [15]:
Swiggy_data['cuisine'].isnull().sum()

np.int64(0)

In [16]:
Swiggy_data['city'].unique()

array(['Abohar', 'Adilabad', 'Adityapur', 'Adoni', 'Agartala', 'Agra',
       'Vastrapur,Ahmedabad', 'GOTA,Ahmedabad',
       'Paldi & Ambawadi,Ahmedabad', 'Ghatlodia,Ahmedabad',
       'Bopal,Ahmedabad', 'Gandhinagar,Ahmedabad', 'LalDarwaja,Ahmedabad',
       'Naranpura,Ahmedabad', 'Navrangpura,Ahmedabad',
       'Science City,Ahmedabad', 'Maninagar,Ahmedabad',
       'Chandkheda,Ahmedabad', 'Ahmednagar', 'Aizawl', 'Ajmer', 'Akola',
       'Alappuzha', 'Aligarh', 'Alipurduar', 'Allahabad', 'Alwar',
       'Ambala', 'Ambikapur', 'Ambur', 'Amravati', 'Amreli', 'Amritsar',
       'Anand', 'Anantapur', 'Ankleshwar', 'Arakkonam', 'Arambagh',
       'Arrah', 'Aruppukottai', 'Asansol', 'Aurangabad',
       'Aurangabad_bihar', 'Azamgarh', 'Baddi', 'Bagalkot', 'Bagdogra',
       'Bahadurgarh', 'Bahraich', 'Balaghat', 'Balangir', 'Balasore',
       'Ballari', 'Balrampur', 'Balurghat', 'Banda',
       'Yeshwanthpur,Bangalore', 'Geddalahalli,Bangalore',
       'Koramangala,Bangalore', 'JP Nagar,B

In [17]:
def split_city(value):
    parts = value.split(',')
    if len(parts) == 1:
        return pd.Series([value.strip(), 'General'])
    else:
        return pd.Series([parts[-1].strip(), ','.join(parts[:-1]).strip()])

In [18]:
Swiggy_data.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [19]:
Swiggy_data[['city_names','location']] = Swiggy_data['city'].apply(split_city)

In [20]:
columns_to_be_dropped = ['city','lic_no','link', 'address','menu' ]

In [21]:
Swiggy_data.drop(columns=columns_to_be_dropped,inplace=True)

In [22]:
Swiggy_data.head(20)

Unnamed: 0,id,name,rating,rating_count,cost,cuisine,city_names,location
0,567335,AB FOODS POINT,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",Abohar,General
1,531342,Janta Sweet House,4.4,50+ ratings,₹ 200,"Sweets,Bakery",Abohar,General
2,158203,theka coffee desi,3.8,100+ ratings,₹ 100,Beverages,Abohar,General
3,187912,Singh Hut,3.7,20+ ratings,₹ 250,"Fast Food,Indian",Abohar,General
4,543530,GRILL MASTERS,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",Abohar,General
5,158204,Sam Uncle,3.6,20+ ratings,₹ 200,Continental,Abohar,General
6,156588,shere punjab veg,4.0,100+ ratings,₹ 150,North Indian,Abohar,General
7,244866,Shri Balaji Vaishno Dhaba,--,Too Few Ratings,₹ 100,North Indian,Abohar,General
8,156602,Hinglaj Kachori Bhandhar,4.2,20+ ratings,₹ 100,"Snacks,Chaat",Abohar,General
9,158193,yummy hub,--,Too Few Ratings,₹ 200,Indian,Abohar,General


In [23]:
# Replace '--' with NaN
Swiggy_data['rating'] = Swiggy_data['rating'].replace('--', None)

# Convert to float
Swiggy_data['rating'] = Swiggy_data['rating'].astype(float)

In [24]:
# Replace 'Too Few Ratings' with 0
Swiggy_data['rating_count'] = Swiggy_data['rating_count'].replace('Too Few Ratings', '0')

In [25]:
# # Remove '+ ratings' and convert to int
Swiggy_data['rating_count'] = Swiggy_data['rating_count'].str.extract('(\d+)')  # extract numbers only

  Swiggy_data['rating_count'] = Swiggy_data['rating_count'].str.extract('(\d+)')  # extract numbers only


In [26]:
#Convert to integer safely using 'Int64' (Nullable integer type)
Swiggy_data['rating_count'] = Swiggy_data['rating_count'].astype('Int64')

In [27]:
# remove  ruppe symbol and spaces , convert to number
Swiggy_data['cost'] = Swiggy_data['cost'].str.replace('₹', '').str.strip()
Swiggy_data['cost'] = Swiggy_data['cost'].astype('Int64')

In [28]:
cuisines = Swiggy_data['cuisine'].dropna().astype(str).str.split(',').sum()

In [29]:
# strip spaces and get unique set
unique_cuisines = set([c.strip() for c in cuisines])

In [30]:
Swiggy_data.tail(20)

Unnamed: 0,id,name,rating,rating_count,cost,cuisine,city_names,location
24980,562415,Anand Sweet & Bakers,,0,200,"Bakery,Desserts",Bela-pratapgarh,General
24981,551958,Paratha corner,,0,200,North Indian,Bela-pratapgarh,General
24982,550575,The Second Wife,3.5,20,200,"North Indian,Chinese",Bela-pratapgarh,General
24983,550588,Bombay Sandwich And Pizza Shop,,0,200,"Fast Food,Pizzas",Bela-pratapgarh,General
24984,557728,Brothers Family Restaurant,,0,200,"North Indian,Chinese",Bela-pratapgarh,General
24985,569766,Pizza Lovers,,0,200,"Pizzas,Chinese",Bela-pratapgarh,General
24986,567742,Biryani Shawarma.com,,0,200,"Biryani,Lebanese",Bela-pratapgarh,General
24987,311486,Rolls Fusion,3.9,20,100,"Beverages,Continental",Belgaum,General
24988,296025,Camp Purohit Sweets - Camp,4.2,100,100,Sweets,Belgaum,General
24989,328703,Kareem s Kitchen,4.0,500,250,"Biryani,Indian",Belgaum,General


In [31]:
Swiggy_data.isnull().sum()

Unnamed: 0,0
id,0
name,0
rating,13950
rating_count,0
cost,12
cuisine,0
city_names,0
location,0


In [32]:
Swiggy_data = Swiggy_data.dropna(subset=['rating'])


In [33]:
Swiggy_data['cost'] = Swiggy_data['cost'].fillna(Swiggy_data['cost'].median())


In [34]:
Swiggy_data.isnull().sum()

Unnamed: 0,0
id,0
name,0
rating,0
rating_count,0
cost,0
cuisine,0
city_names,0
location,0


In [35]:
Swiggy_data.head()


Unnamed: 0,id,name,rating,rating_count,cost,cuisine,city_names,location
1,531342,Janta Sweet House,4.4,50,200,"Sweets,Bakery",Abohar,General
2,158203,theka coffee desi,3.8,100,100,Beverages,Abohar,General
3,187912,Singh Hut,3.7,20,250,"Fast Food,Indian",Abohar,General
5,158204,Sam Uncle,3.6,20,200,Continental,Abohar,General
6,156588,shere punjab veg,4.0,100,150,North Indian,Abohar,General


In [36]:
#Save the cleaned file
Swiggy_data.to_csv("Swiggy_cleaned_data.csv", index=False)

In [37]:
Swiggy_cleaned_data = pd.read_csv("/content/Swiggy_cleaned_data.csv")

In [38]:
Swiggy_cleaned_data.head()

Unnamed: 0,id,name,rating,rating_count,cost,cuisine,city_names,location
0,531342,Janta Sweet House,4.4,50,200,"Sweets,Bakery",Abohar,General
1,158203,theka coffee desi,3.8,100,100,Beverages,Abohar,General
2,187912,Singh Hut,3.7,20,250,"Fast Food,Indian",Abohar,General
3,158204,Sam Uncle,3.6,20,200,Continental,Abohar,General
4,156588,shere punjab veg,4.0,100,150,North Indian,Abohar,General


In [39]:
# Select categorical columns
categorical_cols = ['name', 'city_names', 'location', 'cuisine']

In [40]:
## Initialize encoder (note: use sparse_output=False for newer scikit-learn)
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(Swiggy_cleaned_data[categorical_cols])

In [41]:
# Convert to DataFrame
Swiggy_encoded = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))

In [42]:
# Combine with numerical features
numerical_data = Swiggy_cleaned_data[['rating', 'rating_count', 'cost']].reset_index(drop=True)
final_encoded = pd.concat([numerical_data, Swiggy_encoded], axis=1)

In [43]:
#save to CSv
final_encoded.to_csv("encoded_data.csv", index=False)

In [44]:
# Save encoder and encoded data
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

final_encoded.to_csv("encoded_data.csv", index=False)