In [None]:
import pandas as pd
import numpy as np
import pickle
import scipy.sparse
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
# from category_encoders import HashingEncoder # type: ignore
from sklearn.preprocessing import OneHotEncoder



In [17]:
# Load dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run where this exact file exists. Consider a path relative to the notebook.
df_swiggy = pd.read_csv(r"C:\Users\v-dhramaraj\Desktop\Python\Projects\Assignment4_RestRecomendation\swiggy.csv")

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line to the output.
df_swiggy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB
None


In [18]:
# Discard rows that are exact duplicates across every column
df_swiggy = df_swiggy.drop_duplicates()

# Strip the rupee sign and comma separators from cost, then parse what is left as a
# number; any entry that still cannot be parsed becomes NaN
df_swiggy['cost'] = pd.to_numeric(
    df_swiggy['cost'].replace(to_replace='[₹,]', value='', regex=True),
    errors='coerce',
)

# Function to clean rating_count values
def clean_rating_count(value):
    """Convert a raw ``rating_count`` entry into an integer count.

    The value is first passed through ``str()`` and stripped, so non-string
    inputs (ints, NaN) are accepted.

    Recognised forms:
      * text containing "Too Few Ratings"        -> 0
      * "<n>", "<n>+", "<n>+ ratings"            -> n
      * "<n>K", "<n>K+ ratings", "1.5K" (K or k) -> n * 1000

    Anything else (e.g. "nan", "--", an empty string) returns 0.

    Parameters
    ----------
    value : any
        Raw cell value from the ``rating_count`` column.

    Returns
    -------
    int
        The parsed count, or 0 when the value is not in a recognised form.
    """
    text = str(value).strip()  # Ensure it's a string and remove surrounding spaces

    if 'Too Few Ratings' in text:
        return 0

    # Remove the decoration around the number *before* looking for the K suffix.
    # Checking for 'K' first left "+ ratings" attached, so "1K+ ratings" failed the
    # digit check and was silently turned into 0.
    core = text.replace('ratings', '').replace('+', '').strip()

    # A trailing K/k marks thousands; slicing with [-1:] is safe on an empty string.
    in_thousands = core[-1:] in ('K', 'k')
    if in_thousands:
        core = core[:-1].strip()
        # Allow a single decimal point, e.g. "1.5K"
        if core.replace('.', '', 1).isdigit():
            return int(float(core) * 1000)
        return 0

    # Plain whole number
    return int(core) if core.isdigit() else 0

# Normalise rating_count to integers via clean_rating_count
df_swiggy['rating_count'] = df_swiggy['rating_count'].apply(clean_rating_count)

# Convert rating to numeric; unparseable entries (such as '--') become NaN
df_swiggy['rating'] = pd.to_numeric(df_swiggy['rating'], errors='coerce')

# Fill missing numeric values with the column median (computed over the rows
# present at this point, i.e. before the dropna below)
df_swiggy['cost'] = df_swiggy['cost'].fillna(df_swiggy['cost'].median())
df_swiggy['rating'] = df_swiggy['rating'].fillna(df_swiggy['rating'].median())

# Drop rows with missing categorical values.
# Reassign instead of inplace=True so the lineage stays explicit.
df_swiggy = df_swiggy.dropna(subset=['name', 'city', 'cuisine'])

# Remove unnecessary columns. errors='ignore' lets the cell be re-run after the
# columns are already gone; a plain drop raises KeyError in that case.
df_swiggy = df_swiggy.drop(columns=['id', 'link', 'menu'], errors='ignore')



In [19]:
# Save cleaned dataset
df_swiggy.to_csv("cleaned_data.csv", index=False)
print("Data Cleaning Completed! Saved as cleaned_data.csv")

Data Cleaning Completed! Saved as cleaned_data.csv


In [None]:
# # Frequency Encoding for categorical variables
# """
# Frequency Encoding is a technique used to convert categorical variables into numerical values based on 
# how often each category appears in the dataset. 
# Instead of creating a new column for every unique category (like One-Hot Encoding), it replaces each category with its relative frequency, making it more memory-efficient
# Example Before Encoding
# | city | cuisine | 
# | Delhi | Chinese | 
# | Mumbai | Italian | 
# | Delhi | Indian | 
# After Encoding (each value is the category's share of the three example rows)
# | city | city_freq | cuisine | cuisine_freq | 
# | Delhi | 0.67 | Chinese | 0.33 | 
# | Mumbai | 0.33 | Italian | 0.33 | 
# | Delhi | 0.67 | Indian | 0.33 | 

# Why Use Frequency Encoding?
# -- Memory Efficient: Works well for high-cardinality categorical features (like city names).
# -- Captures Importance: More frequent categories get higher values, which can improve machine learning performance.
# -- Avoids Overfitting: Compared to One-Hot Encoding, it prevents excessive feature creation.

# """
# # Count Occurrences of Each Category
# # .map(df_swiggy['city'].value_counts(normalize=True)) replaces each city name in df_swiggy['city'] with its frequency value, does the same for cuisine.
# df_swiggy['city_freq'] = df_swiggy['city'].map(df_swiggy['city'].value_counts(normalize=True)) # calculates how often each city appears in the dataset
# df_swiggy['cuisine_freq'] = df_swiggy['cuisine'].map(df_swiggy['cuisine'].value_counts(normalize=True))

In [None]:
# One-Hot Encoding (dense output: get_dummies builds ordinary columns and the
# encoder below is created with sparse_output=False)
import os  # only needed for the file-existence check at the end of this cell

# One-Hot Encoding converts a categorical column into one binary column per
# unique category, where 1 marks the presence of the category and 0 its absence.
# For example, a city column holding ["Delhi", "Mumbai", "Chennai"] becomes:
#
#   city    | city_Delhi | city_Mumbai | city_Chennai
#   Delhi   |     1      |      0      |      0
#   Mumbai  |     0      |      1      |      0
#   Chennai |     0      |      0      |      1

# One-Hot Encoding for 'city' and 'cuisine'.
# The indicator columns only ever hold 0 or 1, so uint8 stores them in one byte
# per cell instead of a full-width integer; the values written to the CSV are
# unchanged.
df_encoded = pd.get_dummies(df_swiggy, columns=['city', 'cuisine'], dtype=np.uint8)

# Save encoded dataset
df_encoded.to_csv("encoded_data.csv", index=False)

# Fit a separate scikit-learn encoder on the same two columns and pickle it.
# handle_unknown='ignore' makes categories unseen during fit encode as all zeros
# instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(df_swiggy[['city', 'cuisine']])

with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

# Verify if the file was created
print(os.path.exists("encoder.pkl"))  # Should print True
print("Encoding Completed! Saved as encoded_data.csv & encoder.pkl")


True


In [None]:
# Step 3: Clustering & Recommendation System
# Load the cleaned dataset for clustering. This is cleaned_data.csv, not the
# one-hot file; the variable is still called df_encoded because later code
# refers to it by that name.
df_encoded = pd.read_csv("cleaned_data.csv")

# Keep only the numeric columns as clustering features.
# 'name' is dropped along with the other text columns: left in, to_numeric would
# coerce almost every name to NaN, the median fill would then overwrite those
# NaNs, and the result would be a meaningless extra feature.
# NOTE: df_cluster.columns defines the feature order expected from callers of
# recommend_restaurants.
df_cluster = df_encoded.drop(columns=['name', 'city', 'cuisine', 'lic_no', 'address'])
df_cluster = df_cluster.apply(pd.to_numeric, errors='coerce')  # Coerce non-numeric values to NaN
df_cluster = df_cluster.replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df_cluster = df_cluster.fillna(df_cluster.median())  # Fill NaN with median values

# StandardScaler rescales each feature to zero mean and unit variance so that no
# single column dominates the distance computation.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cluster)

# Apply K-Means clustering: 10 clusters over the scaled numeric features.
# City and cuisine are NOT part of the feature set; they are dropped above.
# The labels are stored in df_encoded['cluster'].
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
df_encoded['cluster'] = kmeans.fit_predict(df_scaled)

# Save clustered dataset
df_encoded.to_csv("clustered_data.csv", index=False)
print("Clustering Completed! Saved as clustered_data.csv")

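"""
- Purpose: Finds the restaurants most similar to the user's input.
- Why Cosine Similarity? It measures how closely a restaurant's scaled feature vector points in the same direction as the user's preference vector, so a higher score means a closer match.
- How It Helps: Restaurants are ranked by similarity score and the top matches are returned. Note that recommend_restaurants below compares the input against every restaurant; it does not filter by the cluster labels assigned above.
"""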

# Function to recommend similar restaurants
def recommend_restaurants(input_data, top_n=5):
    """Return the ``top_n`` restaurants most similar to ``input_data``.

    Relies on the notebook-level objects ``df_cluster`` (feature columns),
    ``scaler`` (fitted StandardScaler), ``df_scaled`` (scaled feature matrix)
    and ``df_encoded`` (restaurant rows, positionally aligned with ``df_scaled``).

    Parameters
    ----------
    input_data : sequence or mapping
        One set of feature values, laid out so that
        ``pd.DataFrame([input_data], columns=df_cluster.columns)`` yields a
        single row.
    top_n : int, default 5
        Number of restaurants to return. Values of 0 or below return an empty
        frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``city``, ``rating``, ``cost``, ``cuisine``, ordered
        from most to least similar.
    """
    # Convert input_data to a one-row frame with the training feature columns
    input_df = pd.DataFrame([input_data], columns=df_cluster.columns)

    # Standardize user input with the scaler fitted on the dataset
    input_vector = scaler.transform(input_df)

    # Cosine similarity of the single input row against every restaurant
    scores = cosine_similarity(input_vector, df_scaled)[0]

    # argsort is ascending, so reverse it to put the best match first and then
    # take a prefix. Slicing a prefix ([:0] is empty) also avoids the [-0:]
    # pitfall, where top_n=0 would select every row.
    best_first = scores.argsort()[::-1][:max(top_n, 0)]
    recommendations = df_encoded.iloc[best_first]

    return recommendations[['name', 'city', 'rating', 'cost', 'cuisine']]

print("Recommendation system initialized.")

Clustering Completed! Saved as clustered_data.csv
Recommendation system initialized.
