In [1]:
import pandas as pd
import numpy as np
# Load the raw restaurant listing; the CSV is expected in the working directory.
df = pd.read_csv(r"swiggy.csv")



In [2]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122700000000.0,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117200000000.0,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121700000000.0,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119700000000.0,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122200000000.0,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [3]:
# Count missing values per column.
df.isnull().sum()

id                0
name             86
city              0
rating           86
rating_count     86
cost            131
cuisine          99
lic_no          229
link              0
address          86
menu              0
dtype: int64

In [4]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [5]:
# Collect rows that exactly repeat an earlier row and report how many exist.
duplicate_rows = df.loc[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicate_rows)}")


Number of duplicate rows: 0


In [6]:
# List the column labels before deciding which ones to drop.
df.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [7]:
# Keep only the columns used downstream; identifiers, links, addresses and
# menu paths are removed.
df = df.drop(columns=['id', 'link', 'address', 'menu', 'lic_no'])


In [8]:
# Display the frame after the column drop (pandas truncates the middle rows).
df

Unnamed: 0,name,city,rating,rating_count,cost,cuisine
0,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas"
1,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery"
2,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages
3,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian"
4,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food"
...,...,...,...,...,...,...
148536,The Food Delight,Yavatmal,--,Too Few Ratings,₹ 200,"Fast Food,Snacks"
148537,MAITRI FOODS & BEVERAGES,Yavatmal,--,Too Few Ratings,₹ 300,Pizzas
148538,Cafe Bella Ciao,Yavatmal,--,Too Few Ratings,₹ 300,"Fast Food,Snacks"
148539,GRILL ZILLA,Yavatmal,--,Too Few Ratings,₹ 250,Continental


In [9]:
# Discard, in place, every row that has a missing value in any remaining column.
df.dropna(axis=0, how='any', inplace=True)

In [10]:
# 5. Clean 'cost' column: strip the rupee sign and commas, then cast to float
df['cost'] = (
    df['cost']
    .replace(to_replace='[₹,]', value='', regex=True)
    .astype(float)
)

# 6. Clean 'rating_count' column
def convert_rating_count(value, rng=None):
    """Turn a rating-count label into a number.

    Bucketed labels such as ``'50+ ratings'`` carry only a lower bound, so a
    value is drawn uniformly from the bucket's half-open range ``[low, high)``.
    ``'Too Few Ratings'`` draws from ``[0, 10)``. Any other string is reduced
    to its digits and parsed as an integer.

    Parameters
    ----------
    value : str, number or missing
        The raw label. Missing values yield ``np.nan``. Non-string values are
        returned unchanged, so re-applying the function to an already
        converted column is harmless.
    rng : optional
        Object exposing ``integers(low, high)`` (for example
        ``np.random.default_rng(seed)``). When omitted, NumPy's global
        generator is used via ``np.random.randint``.

    Returns
    -------
    int or float
        The drawn or parsed count, or ``np.nan`` when nothing can be parsed.
    """
    if pd.isnull(value):
        return np.nan
    if not isinstance(value, str):
        # Already converted (or never a label): nothing to parse.
        return value

    draw = np.random.randint if rng is None else rng.integers

    label = value.strip()
    if label == 'Too Few Ratings':
        return draw(0, 10)

    # (substring to look for, inclusive low, exclusive high); checked in order.
    buckets = (
        ('20+', 20, 50),
        ('50+', 50, 100),
        ('100+', 100, 500),
        ('500+', 500, 1000),
        ('1K+', 1000, 5000),
        ('5K+', 5000, 10000),
        ('10K+', 10000, 15000),
    )
    for token, low, high in buckets:
        if token in label:
            return draw(low, high)

    # Fallback: keep whatever digits the label contains.
    try:
        return int(''.join(filter(str.isdigit, label)))
    except ValueError:
        # No digits at all, or digit-like characters int() cannot parse.
        return np.nan

# Seed NumPy's global generator first: bucketed labels such as '50+ ratings'
# are replaced by random draws, and without a fixed seed the cleaned data
# would differ on every run of the notebook.
np.random.seed(42)
df['rating_count'] = df['rating_count'].apply(convert_rating_count)


In [11]:
# 7. Clean 'rating' column: remove 'K' if present, convert to float
def clean_rating(val):
    """Parse a rating value into a float.

    Strings are stripped of surrounding whitespace, have every ``'K'``
    removed, and are passed to ``float``. Strings that still cannot be parsed
    (for example the ``'--'`` placeholder) become ``np.nan``. Non-string
    inputs are returned unchanged.
    """
    if not isinstance(val, str):
        return val
    try:
        return float(val.strip().replace('K', ''))
    except ValueError:
        # Only parse failures are expected here; anything else should surface.
        return np.nan

# Parse every rating; unparseable entries come back as NaN.
df['rating'] = df['rating'].map(clean_rating)

# Rows left without a usable rating are removed in place.
df.dropna(subset=['rating'], inplace=True)


In [12]:
# 8. Split 'city' column into 'city' and 'main_city'
def split_city(value):
    """Split a comma-separated city string into ``[city, main_city]``.

    The value is converted to ``str`` and split on commas; each piece is
    stripped of surrounding whitespace. The first piece is ``city``. The
    second piece is ``main_city``; when there is no comma, ``main_city``
    repeats ``city``. Pieces after the second are ignored.

    Returns a two-element ``pd.Series`` so that ``Series.apply`` expands the
    result into two columns.
    """
    pieces = [piece.strip() for piece in str(value).split(',')]
    first = pieces[0]
    second = pieces[1] if len(pieces) >= 2 else first
    return pd.Series([first, second])

# Overwrite 'city' with the first part of the string and add 'main_city'
# from the second part (both columns are aligned on the row index).
_city_parts = df['city'].apply(split_city)
df['city'] = _city_parts[0]
df['main_city'] = _city_parts[1]


# Save the cleaned data for the modelling steps
df.to_csv("cleaned_swiggy_data.csv", index=False)

In [13]:
# Row and column count after cleaning.
df.shape

(61421, 7)

In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import pickle

# Work on a copy restricted to the modelling columns so `df` itself is untouched.
df_model = df.loc[:, ['name', 'city', 'cuisine', 'rating', 'rating_count', 'cost']].copy()

# --- Label encoding: restaurant name -> integer code ---
le = LabelEncoder()
df_model['name_encoded'] = le.fit_transform(df['name'])
with open("name_label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# --- One-hot encoding: city & cuisine -> dense indicator columns ---
categorical_cols = ['city', 'cuisine']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = encoder.fit_transform(df_model[categorical_cols])
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

# Wrap the indicator matrix in a frame with readable column names.
encoded_df = pd.DataFrame(
    data=encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# The numeric columns get a fresh 0..n-1 index so they line up row-for-row
# with encoded_df when the two are placed side by side.
numerical_df = df_model[['name_encoded', 'rating', 'rating_count', 'cost']].reset_index(drop=True)
final_encoded_df = pd.concat(objs=[encoded_df, numerical_df], axis='columns')

final_encoded_df.to_csv("encoded_data.csv", index=False)
print("✅ Encoded dataset saved as 'encoded_data.csv'")

✅ Encoded dataset saved as 'encoded_data.csv'


In [15]:
# Size of the one-hot feature frame (rows, indicator columns).
encoded_df.shape

(61421, 2415)

In [16]:
import pickle

# Bundle the encoders that were already fitted in the encoding cell.
# Instantiating fresh LabelEncoder()/OneHotEncoder() objects here would rebind
# `le` to an unfitted encoder and overwrite encoder.pkl with objects that
# cannot transform anything.
ohe = encoder  # one-hot encoder fitted on ['city', 'cuisine']

# Save both encoders into one pickle file
with open("encoder.pkl", "wb") as f:
    pickle.dump({
        "label_encoder": le,
        "one_hot_encoder": ohe
    }, f)


In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Reload the cleaned data written earlier; its rows are in the same order as
# the frame the one-hot matrix was built from.
cleaned_df = pd.read_csv("cleaned_swiggy_data.csv")

# 1. Standardise the one-hot city/cuisine matrix.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(encoded_array)

# 2. Fit KMeans once. With a fixed random_state a second identical fit would
#    only repeat the same (expensive) computation.
kmeans = KMeans(n_clusters=10, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# 3. Persist the fitted model.
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

# 4. Attach each row's cluster label (positional: row i <-> clusters[i]).
cleaned_df['cluster'] = clusters

# 5. Save the cleaned data with cluster labels for mapping results.
cleaned_df.to_csv("cleaned_with_clusters.csv", index=False)



In [18]:
# Display the clustered frame with all of its columns.
# DataFrame.drop returns a new frame rather than modifying in place, so a bare
# `cleaned_df.drop(...)` statement has no effect; to actually remove a column,
# assign the result back, e.g. cleaned_df = cleaned_df.drop(columns=['city']).
cleaned_df

Unnamed: 0,name,city,rating,rating_count,cost,cuisine,main_city,cluster
0,Janta Sweet House,Abohar,4.4,50,200.0,"Sweets,Bakery",Abohar,1
1,theka coffee desi,Abohar,3.8,476,100.0,Beverages,Abohar,1
2,Singh Hut,Abohar,3.7,24,250.0,"Fast Food,Indian",Abohar,3
3,Sam Uncle,Abohar,3.6,23,200.0,Continental,Abohar,1
4,shere punjab veg,Abohar,4.0,187,150.0,North Indian,Abohar,1
...,...,...,...,...,...,...,...,...
61416,Jain Varities & Icecream Corner,Yavatmal,3.7,78,150.0,"Snacks,Fast Food",Yavatmal,1
61417,Ranade Bandhu,Yavatmal,4.7,42,100.0,"Sweets,Fast Food",Yavatmal,1
61418,Satkar Dinning Hall,Yavatmal,3.6,315,200.0,"Maharashtrian,North Indian",Yavatmal,1
61419,Suraj Hotel,Yavatmal,3.0,30,200.0,"North Indian,Fast Food",Yavatmal,1


In [19]:
import pickle

# Read back the encoder bundle (a dict with 'label_encoder' and
# 'one_hot_encoder') and the standalone KMeans model.
with open("encoder.pkl", "rb") as f:
    encoders = pickle.load(f)

with open("kmeans.pkl", "rb") as f:
    kmeans = pickle.load(f)

# Merge everything into a single mapping, encoders first, then the model.
combined = {key: encoders[key] for key in ("label_encoder", "one_hot_encoder")}
combined["kmeans"] = kmeans

# NOTE(review): models.pkl is written again by later cells with different
# keys, so whichever cell runs last decides its final contents.
with open("models.pkl", "wb") as f:
    pickle.dump(combined, f)


In [20]:
import pickle

# Write models.pkl as a dict holding the KMeans model, the one-hot encoder
# fitted on city/cuisine, and the one-hot feature frame.
with open("models.pkl", "wb") as f:
    pickle.dump(dict(kmeans=kmeans, encoder=encoder, encoded_data=encoded_df), f)


In [87]:
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encoder for the cuisine column only.
ohe = OneHotEncoder()
ohe.fit(df[['cuisine']])  # Fit before saving

# `le` may have been rebound to a fresh, unfitted LabelEncoder by an earlier
# cell. A fitted LabelEncoder exposes `classes_`; when it is missing, refit on
# the restaurant names instead of pickling an encoder that cannot transform.
if not hasattr(le, "classes_"):
    le = LabelEncoder().fit(df['name'])

with open("encoder.pkl", "wb") as f:
    pickle.dump({"label_encoder": le, "one_hot_encoder": ohe}, f)


In [21]:
import pickle

# Bundle the clustering model, the scaler and the model's stored labels.
models = dict(
    kmeans=kmeans,
    scaler=scaler,
    clusters=kmeans.labels_,  # This is just a NumPy array
)

# NOTE(review): this replaces whatever earlier cells wrote to models.pkl.
with open("models.pkl", "wb") as f:
    pickle.dump(models, f)
