In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
import pickle

In [None]:
# ===== Step 1: Load CSV =====
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
raw_csv_path = "C:/Users/dell/Desktop/project/Swiggy/swiggy.csv"
df = pd.read_csv(raw_csv_path)
print(f"Original shape: {df.shape}")

In [None]:
# ===== Step 2: Drop duplicates =====
# Rows are treated as duplicates when all of these columns match.
key_cols = ['name', 'city', 'cuisine','address']

# keep=False flags every member of a duplicate group (the first occurrence
# included); sorting by the key columns puts group members next to each other.
duplicate_mask = df.duplicated(subset=key_cols, keep=False)
duplicates = df.loc[duplicate_mask].sort_values(by=key_cols)

# Report how many rows are involved, then show them.
print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
print(duplicates)

In [None]:
# Keep the first row of every duplicate group (same key_cols) and remove the
# others; the frame is modified in place.
df.drop_duplicates(subset=key_cols, keep='first', inplace=True)

In [None]:
# Show the frame after duplicate removal (rendered as the cell output).
df

In [None]:
# Frequency of each raw 'rating' value, before any cleaning.
rating_value_counts = df[['rating']].value_counts()
print(rating_value_counts)

In [None]:
# ===== Step 3: Clean numeric fields =====
# 'rating': swap the '--' placeholder for NaN, then coerce the remaining
# values to numbers (anything unparseable also becomes NaN).
rating_without_placeholder = df['rating'].replace(['--'], np.nan)
df['rating'] = pd.to_numeric(rating_without_placeholder, errors='coerce')

In [None]:
# Frequency of each raw 'rating_count' label, before any cleaning.
rating_count_value_counts = df[['rating_count']].value_counts()
print(rating_count_value_counts)

In [None]:
# 'rating_count': the label 'Too Few Ratings' carries no number, so it is
# stored as NaN.
df['rating_count'] = df['rating_count'].replace(
    to_replace='Too Few Ratings',
    value=np.nan,
)

In [None]:
# Clean 'rating_count' - extract numeric ('50K+ ratings' to 50000)

def clean_rating_count(val):
    """Convert a rating-count label such as '50K+ ratings' into an integer.

    Parameters
    ----------
    val : str, number or missing value
        Raw cell value. Strings are parsed from their first whitespace-
        separated token ('100+ ratings' -> 100, '5K+ ratings' -> 5000).
        Values that are already numeric (for instance when the cell is
        re-run on cleaned data) are converted with ``int``.

    Returns
    -------
    int or None
        The parsed count, or None when the value is missing, empty or not
        recognisable as a count.
    """
    if pd.isna(val):
        return None

    # Already numeric: nothing to parse, keep the value as an integer.
    if not isinstance(val, str):
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            return None

    tokens = val.split()
    if not tokens:  # empty or whitespace-only label
        return None
    token = tokens[0].replace('+', '').upper()

    if 'K' in token:
        try:
            # round() before int(): float products such as 1.005 * 1000 come
            # out as 1004.9999999999999 and would otherwise be truncated.
            return int(round(float(token.replace('K', '')) * 1000))
        except (ValueError, OverflowError):
            return None
    if token.isdigit():
        try:
            return int(token)
        except ValueError:  # digit-like characters that int() rejects
            return None
    return None

# Parse every raw 'rating_count' label into a number (or a missing value).
parsed_rating_counts = df['rating_count'].apply(clean_rating_count)
df['rating_count'] = parsed_rating_counts

In [None]:
# Frequency of each 'rating_count' value after parsing.
cleaned_rating_count_counts = df[['rating_count']].value_counts()
print(cleaned_rating_count_counts)

In [None]:
# Frequency of each raw 'cost' value, before any cleaning.
cost_value_counts = df[['cost']].value_counts()
print(cost_value_counts)

In [None]:
# Clean 'cost' remove 'Rupees symbol'
def clean_cost(val):
    """Convert a cost label such as '₹ 1,200' into a float.

    The rupee sign and thousands separators are removed and only the first
    whitespace-separated token is parsed.

    Parameters
    ----------
    val : str, number or missing value
        Raw cell value.

    Returns
    -------
    float or None
        The parsed amount, or None when the value is missing, empty after
        stripping the symbols, or not a plain non-negative decimal number.
    """
    if pd.isna(val):
        return None

    # Remove currency symbol and commas
    stripped = str(val).replace('₹', '').replace(',', '')

    # A value made only of symbols/whitespace leaves no token to parse.
    tokens = stripped.split()
    if not tokens:
        return None
    first_part = tokens[0]

    # Accept digits with at most one decimal point.
    if not first_part.replace('.', '', 1).isdigit():
        return None
    try:
        return float(first_part)
    except ValueError:  # digit-like characters that float() rejects
        return None

# Parse every raw 'cost' label into a number (or a missing value).
parsed_costs = df['cost'].apply(clean_cost)
df['cost'] = parsed_costs

In [None]:
# Count the missing values left in each cleaned numeric column.
numeric_cols = ['rating', 'rating_count', 'cost']
null_counts = df[numeric_cols].isna().sum()
print(null_counts)

In [None]:
# Fill the gaps in each numeric column with that column's own median.
for numeric_col in ('rating', 'rating_count', 'cost'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)

In [None]:
# Missing-value count for every column of the frame.
null_counts_all = df.isna().sum()

In [None]:
# Show the per-column missing-value counts (rendered as the cell output).
null_counts_all

In [None]:
# Drop rows that lack a name, cuisine or address.
# .copy() turns the filtered result into an independent frame, so the column
# assignments that follow cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['name', 'cuisine', 'address']).copy()

# A missing licence number is kept as the explicit label "Unknown".
df['lic_no'] = df['lic_no'].fillna("Unknown")

In [None]:
# Frequency of each raw 'city' value.
city_value_counts = df[['city']].value_counts()
print(city_value_counts)

In [None]:
# Number of comma-separated pieces in each 'city' value
# (one more than the number of commas).
df['city_parts_count'] = df['city'].apply(lambda city: str(city).count(',') + 1)

# See value counts
parts_distribution = df['city_parts_count'].value_counts()
print(parts_distribution)

# Optional: Check rows that don't have exactly 2 parts
invalid_rows = df.loc[df['city_parts_count'] != 2]
print(invalid_rows[['city']])

In [None]:
# Function to split city into locality and city_main
def split_city(val):
    """Split a 'locality,city' string into ``(locality, city_main)``.

    The first comma-separated piece is the locality and the last piece is the
    city; any pieces in between are ignored. A value with no comma is treated
    as a city with an unknown locality.

    Parameters
    ----------
    val : str or missing value
        Raw 'city' cell.

    Returns
    -------
    tuple of (str, str)
        ``(locality, city_main)``. "Unknown" is used for a missing value and
        for any piece that is empty after stripping whitespace.
    """
    # A missing city must not turn into the literal text 'nan' / 'None'.
    if pd.isna(val):
        return "Unknown", "Unknown"

    parts = [piece.strip() for piece in str(val).split(',')]

    # str.split always yields at least one piece, so parts[-1] is safe.
    city_main = parts[-1] or "Unknown"
    locality = (parts[0] or "Unknown") if len(parts) > 1 else "Unknown"
    return locality, city_main

# Apply the function to each row.
# The (locality, city_main) tuples are collected first and turned into a
# two-column frame in one step; building a pd.Series per row is far slower.
city_pairs = df['city'].map(split_city).tolist()
df[['locality', 'city_main']] = pd.DataFrame(
    city_pairs, index=df.index, columns=['locality', 'city_main']
)

# Drop city_parts_count
df.drop(columns=['city_parts_count'], errors='ignore', inplace=True)

print(df[['city', 'locality', 'city_main']].head(10))


In [None]:
# Number of restaurants per extracted city.
city_main_counts = df['city_main'].value_counts()
print(city_main_counts)

In [None]:
# Show the frame after the city split (rendered as the cell output).
df

In [None]:
# Remove columns that are not carried into the cleaned data set; columns that
# are absent are skipped silently.
columns_to_remove = ['menu', 'link', 'lic_no', 'city']
df = df.drop(columns=columns_to_remove, errors='ignore')

In [None]:
# Persist the cleaned frame without the index column.
cleaned_csv_path = "cleaned_data.csv"
df.to_csv(cleaned_csv_path, index=False)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pickle

# Start the encoding stage from the CSV written by the cleaning stage.
cleaned_source = "cleaned_data.csv"
df = pd.read_csv(cleaned_source)


In [None]:
# Map every restaurant name to an integer code; the fitted encoder is kept so
# the mapping can be reused later.
label_encoder = LabelEncoder()
label_encoder.fit(df['name'])
df['name_encoded'] = label_encoder.transform(df['name'])

In [None]:
#  One-Hot Encode city_main
# NOTE(review): ohe_df is not used by any later visible cell, and city_main
# is one-hot encoded again further down - this cell looks redundant.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
city_frame = df[['city_main']]
ohe_encoded = ohe.fit_transform(city_frame)
ohe_columns = ohe.get_feature_names_out(['city_main'])
ohe_df = pd.DataFrame(data=ohe_encoded, index=df.index, columns=ohe_columns)

In [None]:
# Substrings that mark a value as promotional text rather than a cuisine.
# Matching is by substring on the lower-cased value, so a short keyword such
# as "use" also matches inside longer words.
exclude_keywords = [
    "offer",
    "discount",
    "code",
    "free delivery",
    "default",
    "combo",
    "popular brand store",
    "limited stocks",
    "use",
    "bill over",
]

def is_valid_cuisine(cuisine):
    """Return True when `cuisine` contains none of the `exclude_keywords`.

    The comparison is case-insensitive and substring based.
    """
    lowered = cuisine.lower()
    for keyword in exclude_keywords:
        if keyword in lowered:
            return False
    return True

# Collect every individual cuisine label from the comma-separated 'cuisine'
# column. unique_cuisines was previously read here without being defined
# anywhere, which fails with a NameError on a fresh kernel.
raw_cuisines = (
    df['cuisine']
    .dropna()
    .astype(str)
    .str.split(',')
    .explode()
    .str.strip()
)

# Filter out promotional text, normalize capitalisation, de-duplicate and sort
unique_cuisines = sorted(
    {label.title() for label in raw_cuisines if label and is_valid_cuisine(label)}
)

In [None]:
# Normalize capitalisation: title-case every label, de-duplicate, sort.
unique_cuisines = sorted({label.title() for label in unique_cuisines})

In [None]:
# Show the final list of cuisine labels (rendered as the cell output).
unique_cuisines

In [None]:
# Split cuisines into lists in the DataFrame
df['cuisine_list'] = df['cuisine'].str.split(',')

# Normalise each row's labels once (strip + title-case) into a set, instead
# of repeating that work for every cuisine column. A cell that is not a list
# (a missing cuisine) yields an empty set, i.e. all zeros for that row.
row_labels = df['cuisine_list'].apply(
    lambda items: {item.strip().title() for item in items}
    if isinstance(items, list) else set()
)

# Multi-hot encode: one 0/1 column per known cuisine
cuisine_df = pd.DataFrame({
    f'cuisine_{c}': row_labels.apply(lambda labels: int(c in labels))
    for c in unique_cuisines
})

# Merge with main DataFrame
df = pd.concat([df, cuisine_df], axis=1)

In [None]:
# One-Hot Encode city_main
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
city_encoded = ohe.fit_transform(df[['city_main']])
city_feature_names = ohe.get_feature_names_out(['city_main'])

# Wrap the encoded array in a frame that shares df's index
city_encoded_df = pd.DataFrame(city_encoded, index=df.index, columns=city_feature_names)

# Replace the original city_main column with its one-hot columns
df_without_city = df.drop(columns=['city_main'])
df = pd.concat([df_without_city, city_encoded_df], axis=1)

In [None]:
# Text columns are left out of the modelling table; any that are absent are
# skipped silently.
non_model_columns = ['address', 'cuisine', 'cuisine_list','name', 'locality']
model_df = df.drop(columns=non_model_columns, errors='ignore')

encoded_csv_path = "encoded_data.csv"
model_df.to_csv(encoded_csv_path, index=False)

In [None]:
# create encoder.pkl
# Bundle the fitted encoders and the cuisine vocabulary in one object.
encoders = dict(
    name_encoder=label_encoder,
    city_main_encoder=ohe,
    unique_cuisines=unique_cuisines,
)

# Save all in one pickle file
with open("encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoders, encoder_file)

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load data
# The two files are expected to hold the same restaurants in the same row
# order, so row i of df_encoded describes row i of df_cleaned.
df_encoded = pd.read_csv("encoded_data.csv")
df_cleaned = pd.read_csv("cleaned_data.csv")

# Feature matrix for similarity (features only, drop id).
# The full pairwise matrix cosine_similarity(X) needs n_rows ** 2 floats,
# which does not fit in memory for a large table, so similarities are
# computed for one query row at a time inside the function instead.
X = df_encoded.drop(columns=['id'])

def get_similar_restaurants(restaurant_id, top_n=5):
    """Return the restaurants most similar to `restaurant_id` in the same city.

    Parameters
    ----------
    restaurant_id : int
        Value of the 'id' column identifying the query restaurant.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to `top_n` rows of df_cleaned ordered by decreasing cosine
        similarity, or an explanatory message when the id is not found.
    """
    # Find index of the restaurant
    matches = df_encoded.index[df_encoded['id'] == restaurant_id]
    if matches.empty:
        return f"No restaurant found with ID {restaurant_id}"

    idx = matches[0]

    # Get the city of the input restaurant
    input_city = df_cleaned.loc[df_cleaned['id'] == restaurant_id, 'city_main'].values[0]

    # Similarity of the query row against every row (a single 1 x n result)
    sim_scores = cosine_similarity(X.iloc[[idx]], X).ravel()

    # Candidates: same city, excluding the query row itself. The query is
    # removed explicitly because a row with identical features ties with it
    # and can sort ahead of it.
    candidate_mask = np.array(df_cleaned['city_main'] == input_city, dtype=bool)
    candidate_mask[idx] = False
    candidate_rows = np.flatnonzero(candidate_mask)

    # Highest similarity first; the stable sort keeps ties in row order
    ranking = np.argsort(-sim_scores[candidate_rows], kind='stable')
    top_rows = candidate_rows[ranking][:top_n]

    # Get full details from cleaned data
    similar_restaurants = df_cleaned.iloc[top_rows]

    return similar_restaurants[['id', 'name', 'rating', 'rating_count', 'cost', 'cuisine', 'address', 'city_main']]

# Example usage
print(get_similar_restaurants(567335, top_n=5))


In [None]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import pickle

# Load df_encoded in chunks
def load_encoded(file_path, chunk_size=100_000):
    """Read a CSV in chunks, downcasting float64 columns, and return one frame.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    chunk_size : int, default 100_000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated, with a fresh 0..n-1 index.
    """
    def _downcast(part):
        # float64 -> float32 to reduce the memory held per chunk
        wide_cols = part.select_dtypes('float64').columns
        part[wide_cols] = part[wide_cols].astype('float32')
        return part

    reader = pd.read_csv(file_path, chunksize=chunk_size)
    return pd.concat([_downcast(part) for part in reader], ignore_index=True)

print("Loading encoded data...")
encoded_csv = "encoded_data.csv"
df_encoded = load_encoded(encoded_csv)

# Features for clustering: every column except the identifier
X = df_encoded.drop(columns=['id'])

print("Fitting MiniBatchKMeans...")
kmeans = MiniBatchKMeans(n_clusters=10, random_state=42, batch_size=1000)
cluster_labels = kmeans.fit_predict(X)
df_encoded["cluster"] = cluster_labels

# Save model
with open("kmeans_model.pkl", "wb") as model_file:
    pickle.dump(kmeans, model_file)

# Save encoded data with clusters
df_encoded.to_csv("encoded_with_clusters.csv", index=False)
print("Saved encoded_with_clusters.csv with cluster assignments.")

In [None]:
def get_cluster_recommendations(restaurant_id, top_n=5):
    """Recommend restaurants from the same cluster and the same city.

    Parameters
    ----------
    restaurant_id : int
        Value of the 'id' column identifying the query restaurant.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to `top_n` rows sorted by 'rating' (highest first), or an
        explanatory message when the id is not present in df_encoded.
    """
    # The cleaned data is read from disk on every call
    df_cleaned = pd.read_csv("cleaned_data.csv", dtype={'id':'int32', 'city_main':'category'})

    # Locate the query restaurant
    hits = df_encoded.index[df_encoded['id'] == restaurant_id]
    if hits.empty:
        return f"No restaurant found with ID {restaurant_id}"
    row_label = hits[0]

    cluster_id = df_encoded.loc[row_label, 'cluster']
    input_city = df_cleaned.loc[df_cleaned['id'] == restaurant_id, 'city_main'].values[0]

    # Ids of every restaurant assigned to the same cluster
    cluster_member_ids = df_encoded.loc[df_encoded['cluster'] == cluster_id, 'id'].values

    # Same cluster, same city, and not the query restaurant itself
    keep = (
        df_cleaned['id'].isin(cluster_member_ids)
        & (df_cleaned['city_main'] == input_city)
        & (df_cleaned['id'] != restaurant_id)
    )

    output_columns = ['id','name','rating','rating_count','cost','cuisine','address','city_main']
    ranked = df_cleaned.loc[keep, output_columns].sort_values(by='rating', ascending=False)
    return ranked.head(top_n)

# Example usage
print(get_cluster_recommendations(156602, top_n=5))


In [None]:
def get_cluster_recommendations(restaurant_id, top_n=5, match_on='cuisine'):
    """Recommend restaurants from the same cluster that share one attribute.

    This definition replaces the earlier function of the same name. With the
    default ``match_on='cuisine'`` it behaves as before (exact match on the
    full 'cuisine' string); ``match_on='city_main'`` reproduces the earlier
    same-city variant.

    Parameters
    ----------
    restaurant_id : int
        Value of the 'id' column identifying the query restaurant.
    top_n : int, default 5
        Maximum number of rows to return.
    match_on : str, default 'cuisine'
        Column of the cleaned data whose value must equal the query
        restaurant's value.

    Returns
    -------
    pandas.DataFrame or str
        Up to `top_n` rows sorted by 'rating' (highest first), or an
        explanatory message when the id is not present in df_encoded.

    Raises
    ------
    ValueError
        If `match_on` is not a column of the cleaned data.
    """
    # Load df_cleaned on demand
    df_cleaned = pd.read_csv("cleaned_data.csv", dtype={'id':'int32', 'city_main':'category'})
    if match_on not in df_cleaned.columns:
        raise ValueError(f"Unknown column for match_on: {match_on!r}")

    # Find the restaurant cluster
    matches = df_encoded.index[df_encoded['id'] == restaurant_id]
    if matches.empty:
        return f"No restaurant found with ID {restaurant_id}"
    idx = matches[0]

    cluster_id = df_encoded.loc[idx, 'cluster']
    input_value = df_cleaned.loc[df_cleaned['id'] == restaurant_id, match_on].values[0]

    # Ids of every restaurant in the same cluster (not yet filtered by match_on)
    cluster_restaurants = df_encoded[df_encoded['cluster'] == cluster_id]
    cluster_member_ids = cluster_restaurants['id'].values

    filtered = df_cleaned[df_cleaned['id'].isin(cluster_member_ids)]
    filtered = filtered[filtered[match_on] == input_value]
    filtered = filtered[filtered['id'] != restaurant_id]

    return filtered[['id','name','rating','rating_count','cost','cuisine','address','city_main']].sort_values(
        by='rating', ascending=False
    ).head(top_n)

# Example usage
print(get_cluster_recommendations(156602, top_n=5))