In [1]:
import pandas as pd
import numpy as np

# Load the raw Swiggy data
df = pd.read_csv("swiggy.csv")

# Remove any columns with "license" in the column name (case insensitive)
df = df.loc[:, ~df.columns.str.contains('license', case=False)]

# Remove rows where any cell contains "license" (case insensitive).
# DataFrame.apply works column-by-column here, so each call scans one column;
# regex=False because "license" is a plain substring, not a pattern.
has_license = df.apply(
    lambda col: col.astype(str).str.contains("license", case=False, regex=False)
).any(axis=1)
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered view (which triggers SettingWithCopyWarning).
df = df[~has_license].copy()

# Clean ratings: non-numeric placeholders such as '--' are coerced to NaN
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Clean rating count (e.g., '100+ ratings' → 100, '1K+ ratings' → 1000).
# Raw-string pattern avoids the invalid-escape warning for "\d"; the optional
# second group captures a thousands suffix so "1K+" is not truncated to 1.
count_parts = df['rating_count'].astype(str).str.extract(r'(\d+(?:\.\d+)?)([kK]\b)?')
count_scale = np.where(count_parts[1].notna(), 1000.0, 1.0)
# Values with no digits at all produce NaN here and are dropped further down.
df['rating_count'] = pd.to_numeric(count_parts[0], errors='coerce') * count_scale

# Clean cost (remove ₹, thousands separators and whitespace).
# Anything still unparsable becomes NaN instead of aborting the whole run,
# matching how ratings are handled; such rows are dropped below.
cost_text = df['cost'].astype(str).str.replace(r'[₹,\s]', '', regex=True)
df['cost'] = pd.to_numeric(cost_text, errors='coerce')

# Drop rows with essential missing values
df = df.dropna(subset=['rating', 'rating_count', 'cost', 'cuisine', 'city'])

# Save the cleaned data
df.to_csv("cleaned_data.csv", index=False)

print("✅ Cleaned data saved as cleaned_data.csv (license data removed).")


  df['rating_count'] = df['rating_count'].astype(str).str.extract('(\d+)').astype(float)


✅ Cleaned data saved as cleaned_data.csv (license data removed).


In [1]:
import pandas as pd

# Read back the CSV written by the cleaning step.
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")


In [2]:
# Columns to discard; any of these that are absent from the frame are skipped.
drop_cols = ['restaurant_id', 'license', 'url', 'address', 'menu_path']
df = df.drop(columns=drop_cols, errors='ignore')


In [3]:
# Column groups: text-valued columns vs. numeric columns.
categorical_cols = [
    'name',
    'city',
    'cuisine',
]
numerical_cols = [
    'rating',
    'rating_count',
    'cost',
]


In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle

# Reload the cleaned dataset and discard the columns that are not used as
# features (listed columns that are absent from the file are skipped).
drop_cols = ['restaurant_id', 'license', 'url', 'address', 'menu_path', 'name']
df = pd.read_csv("cleaned_data.csv").drop(columns=drop_cols, errors='ignore')

# Break each comma-separated cuisine string into one row per cuisine.
# explode() repeats the original index label on every row it produces.
df = df.assign(cuisine=df['cuisine'].str.split(',')).explode('cuisine')
df['cuisine'] = df['cuisine'].str.strip()  # trim whitespace left around each token

# Fit a one-hot encoder on the two categorical features.
categorical_features = ['city', 'cuisine']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_array = encoder.fit_transform(df[categorical_features])

# Persist the fitted encoder to disk.
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

# Wrap the encoded matrix in a DataFrame that shares df's index, so the
# column-wise concat below lines up row for row.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df.index,
)

# Put the numeric columns next to the one-hot columns, then remove rows that
# are exact duplicates to reduce the size of the output.
numerical_cols = ['rating', 'rating_count', 'cost']
final_encoded_df = pd.concat(
    [df[numerical_cols], encoded_df],
    axis=1,
).drop_duplicates()

# Write the encoded feature table.
final_encoded_df.to_csv("encoded_data.csv", index=False)
