In [None]:
#Data Cleaning Script for Swiggy Dataset
# This script cleans the Swiggy dataset by removing duplicates, handling missing values, and formatting columns.
# It also converts certain columns to appropriate data types for further analysis.
# Import necessary libraries
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# Load data
df = pd.read_csv("swiggy.csv")

# Drop duplicates
df_cleaned = df.drop_duplicates()

# Drop rows with missing critical info
df_cleaned = df_cleaned.dropna(subset=['name', 'city', 'rating', 'rating_count', 'cost', 'cuisine'])

# Replace '--' in rating and convert to float
df_cleaned['rating'] = df_cleaned['rating'].replace('--', np.nan).astype(float)

# Clean cost column (remove ₹ and convert to float)
df_cleaned['cost'] = df_cleaned['cost'].str.replace('₹', '').str.replace(',', '').astype(float)

# Clean rating_count (remove "+", "ratings", etc.)
df_cleaned['rating_count'] = df_cleaned['rating_count'].apply(lambda x: re.sub(r'[^\d]', '', str(x)))
df_cleaned['rating_count'] = pd.to_numeric(df_cleaned['rating_count'], errors='coerce').fillna(0).astype(int)

# Final clean up: drop rows with any remaining nulls in essential columns
df_cleaned = df_cleaned.dropna(subset=['rating', 'cost'])

# Save cleaned data
df_cleaned.to_csv("cleaned_data.csv", index=False)


In [None]:
#One-Hot Encoding for Categorical Features
# Load your cleaned data
df = pd.read_csv("cleaned_data.csv")

# Select columns to encode
categorical_cols = ['name', 'city', 'cuisine']

# Initialize LabelEncoder for each categorical column
encoded_dfs = []
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoded_dfs.append(df[col])

# Combine encoded columns into a DataFrame
encoded_df = pd.concat(encoded_dfs, axis=1)

# Combine with numerical columns
numerical_cols = ['rating', 'rating_count', 'cost']
final_encoded_df = pd.concat([df[numerical_cols].reset_index(drop=True), encoded_df], axis=1)

# Save encoded data and encoder
final_encoded_df.to_csv("encoded_data.csv", index=False)

with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)


In [None]:
# Load encoded dataset and cleaned dataset
encoded_df = pd.read_csv("encoded_data.csv")
cleaned_df = pd.read_csv("cleaned_data.csv")

# Load the encoder
with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)

# Define recommendation function
def recommend_restaurants(user_input, encoder, encoded_df, cleaned_df, top_n=5):
    # Convert user input into DataFrame
    user_df = pd.DataFrame([user_input])
    
    # Encode the user input
    user_encoded = encoder.transform(user_df)
    
    # Create a DataFrame for user_encoded with correct columns
    user_encoded_df = pd.DataFrame(user_encoded, columns=encoder.get_feature_names_out())

    # Make sure user_encoded_df has same columns as encoded_df
    # Add missing columns with 0
    for col in encoded_df.columns:
        if col not in user_encoded_df.columns:
            user_encoded_df[col] = 0
            
    # Reorder columns exactly like encoded_df
    user_encoded_df = user_encoded_df[encoded_df.columns]
    
    # Compute cosine similarity
    similarities = cosine_similarity(user_encoded_df, encoded_df)
    
    # Get top N restaurants
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    recommendations = cleaned_df.iloc[top_indices]
    
    return recommendations

# Example: Correct user input
user_preferences = {
    'name': 'Pizza Hut',
    'city': 'Bangalore',
    'cuisine': 'Pizzas'
}

# Get recommendations
recommendations = recommend_restaurants(user_preferences, encoder, encoded_df, cleaned_df, top_n=5)
print(recommendations[['name', 'city', 'cuisine', 'rating', 'cost']])


                        name      city                     cuisine  rating  \
0          Janta Sweet House    Abohar               Sweets,Bakery     4.4   
61420  FOOD WORLD RESTAURANT  Yavatmal                North Indian     2.6   
61419            Suraj Hotel  Yavatmal      North Indian,Fast Food     3.0   
61418    Satkar Dinning Hall  Yavatmal  Maharashtrian,North Indian     3.6   
61417          Ranade Bandhu  Yavatmal            Sweets,Fast Food     4.7   

        cost  
0      200.0  
61420  150.0  
61419  200.0  
61418  200.0  
61417  100.0  
