In [1]:
import pandas as pd
import numpy as np

# Load the raw Swiggy data
df = pd.read_csv("swiggy.csv")

# Remove any columns with "license" in the column name (case insensitive)
df = df.loc[:, ~df.columns.str.contains('license', case=False)]

# Remove rows where any cell contains "license" (case insensitive)
df = df[~df.apply(lambda row: row.astype(str).str.contains("license", case=False)).any(axis=1)]

# Clean ratings
df['rating'] = df['rating'].replace('--', np.nan)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Clean rating count (e.g., '100+ ratings' → 100)
df['rating_count'] = df['rating_count'].astype(str).str.extract('(\d+)').astype(float)

# Clean cost (remove ₹ and whitespace)
df['cost'] = df['cost'].replace('[₹, ]', '', regex=True).astype(float)

# Drop rows with essential missing values
df.dropna(subset=['rating', 'rating_count', 'cost', 'cuisine', 'city'], inplace=True)

# Save the cleaned data
df.to_csv("cleaned_data.csv", index=False)

print("✅ Cleaned data saved as cleaned_data.csv (license data removed).")


  df['rating_count'] = df['rating_count'].astype(str).str.extract('(\d+)').astype(float)


✅ Cleaned data saved as cleaned_data.csv (license data removed).


In [1]:
import pandas as pd

df = pd.read_csv("cleaned_data.csv")


In [2]:
drop_cols = ['restaurant_id', 'license', 'url', 'address', 'menu_path']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])


In [3]:
categorical_cols = ['name', 'city', 'cuisine']
numerical_cols = ['rating', 'rating_count', 'cost']


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle

# Load the cleaned data
df = pd.read_csv("cleaned_data.csv")

# Optional: reset index to align properly
df.reset_index(drop=True, inplace=True)

# Simplify cuisine: only use first cuisine per row
df['primary_cuisine'] = df['cuisine'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else x)

# Categorical columns
categorical_cols = ['city', 'primary_cuisine']
numerical_cols = ['rating', 'rating_count', 'cost']

# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(df[categorical_cols])

# Create DataFrame from encoded array
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(categorical_cols))

# Join with numerical features
encoded_final_df = pd.concat([df[numerical_cols], encoded_cat_df], axis=1)

# Save encoded data and encoder
encoded_final_df.to_csv("encoded_data.csv", index=False)
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

print("✅ Encoding complete. 'encoded_data.csv' and 'encoder.pkl' saved.")


✅ Encoding complete. 'encoded_data.csv' and 'encoder.pkl' saved.
