In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset from a path under this mount.
# NOTE(review): `google.colab` is presumably only importable on Colab runtimes — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

**Load Dataset**

In [None]:
# Columns that feed the recommender; every other column of the raw file is ignored downstream
recommendation_columns = [
    'name',
    'rate',
    'approx_cost(for two people)',
    'online_order',
    'book_table',
    'votes',
    'cuisines',
    'rest_type',
    'location',
]

# Read the raw CSV from the mounted Drive folder
df = pd.read_csv(
    '/content/drive/MyDrive/technonext/zomato.csv',
    encoding='latin-1',
)

# 1. Data Cleaning and Preprocessing (Only for recommendation-relevant columns)
print("Cleaning recommendation-relevant data...")

**Cleaning and Preprocessing the data**

In [None]:
# Work on an independent copy so the raw frame `df` is never modified
df_rec = df[recommendation_columns].copy()

# Clean 'rate' column: keep the text before the first '/', then coerce to a number.
# Entries that are not numeric after the split become NaN and are replaced by the median.
rate_text = df_rec['rate'].apply(lambda x: str(x).split('/')[0] if isinstance(x, str) else x)
df_rec['rate'] = pd.to_numeric(rate_text, errors='coerce')
# Assign the result back instead of `fillna(..., inplace=True)` on a column selection:
# that chained in-place form warns on pandas 2.x and has no effect under Copy-on-Write.
df_rec['rate'] = df_rec['rate'].fillna(df_rec['rate'].median())

# Clean 'approx_cost(for two people)' column: strip currency sign and thousands separators
cost_column = 'approx_cost(for two people)'
df_rec[cost_column] = df_rec[cost_column].replace({'₹': '', ',': ''}, regex=True)
df_rec[cost_column] = pd.to_numeric(df_rec[cost_column], errors='coerce')
df_rec[cost_column] = df_rec[cost_column].fillna(df_rec[cost_column].median())

# Handle other missing values
df_rec['cuisines'] = df_rec['cuisines'].fillna('Unknown')
df_rec['rest_type'] = df_rec['rest_type'].fillna('Unknown')
df_rec['votes'] = df_rec['votes'].fillna(0)

# 2. Feature Extraction (Only for recommendation features)
print("Extracting recommendation-specific features...")

# Text profile per restaurant: cuisines, restaurant type and location joined by single
# spaces, with missing values treated as empty strings
df_rec['restaurant_profile'] = df_rec['cuisines'].fillna('').str.cat(
    [df_rec['rest_type'].fillna(''), df_rec['location'].fillna('')],
    sep=' ',
)

# Binary flags: 1 when the column holds exactly 'Yes', 0 for anything else
df_rec['online_order_encoded'] = df_rec['online_order'].eq('Yes').astype(int)
df_rec['book_table_encoded'] = df_rec['book_table'].eq('Yes').astype(int)

# 3. Creating Similarity Features for Recommendation
print("Creating similarity features...")

# Vectorise the text profiles with TF-IDF: unigrams and bigrams, English stop words
# removed, vocabulary capped at 500 terms to limit memory use
tfidf = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(df_rec['restaurant_profile'])

# Keep only the leading rows (at most 3000) so the dense pairwise matrix stays small
sample_size = min(len(df_rec), 3000)
tfidf_sample = tfidf_matrix[:sample_size]

# Pairwise cosine similarity of the retained rows against themselves
print(f"Computing similarity matrix for {sample_size} restaurants...")
cosine_sim = cosine_similarity(tfidf_sample)

# 4. Feature Scaling for Numerical Features
print("Scaling numerical features...")

# Standardise rating, cost and vote count; fitting and transforming are written as two
# explicit steps on the same three columns
scaler = StandardScaler()
scaler.fit(df_rec[['rate', 'approx_cost(for two people)', 'votes']])
df_rec[['rate_scaled', 'cost_scaled', 'votes_scaled']] = scaler.transform(
    df_rec[['rate', 'approx_cost(for two people)', 'votes']]
)

# 5. Create Final Feature Set for Recommendations
print("Creating final recommendation features...")

# Relative weight of each feature group in the combined matrix
feature_weights = dict(
    content_similarity=0.6,  # Cuisine/type/location similarity
    services=0.2,            # Online order and booking
    ratings=0.1,             # Rating similarity
    cost=0.1,                # Price similarity
)

# Stack the weighted groups column-wise for the first `sample_size` rows:
# dense TF-IDF terms, the two service flags, scaled rating, scaled cost
enhanced_features = np.hstack([
    feature_weights['content_similarity'] * tfidf_sample.toarray(),
    feature_weights['services']
    * df_rec.iloc[:sample_size][['online_order_encoded', 'book_table_encoded']].to_numpy(),
    feature_weights['ratings'] * df_rec.iloc[:sample_size][['rate_scaled']].to_numpy(),
    feature_weights['cost'] * df_rec.iloc[:sample_size][['cost_scaled']].to_numpy(),
])

# Cosine similarity between every pair of rows of the combined matrix
print("Computing enhanced similarity matrix...")
enhanced_cosine_sim = cosine_similarity(enhanced_features)

# 6. Recommendation Function
def get_recommendations(restaurant_index, n_recommendations=5, similarity_matrix=None):
    """Return the restaurants most similar to the one at ``restaurant_index``.

    Parameters
    ----------
    restaurant_index : int
        Row position of the query restaurant in ``similarity_matrix`` (and in
        ``df_rec``). Negative values count from the end.
    n_recommendations : int, default 5
        Maximum number of rows to return.
    similarity_matrix : array-like of shape (n, n), optional
        Pairwise similarity scores. When omitted, the module-level
        ``enhanced_cosine_sim`` is looked up at call time, so a recomputed
        matrix is picked up without redefining this function.

    Returns
    -------
    pandas.DataFrame
        ``name``, ``cuisines``, ``rate`` and ``approx_cost(for two people)`` of the
        selected rows of ``df_rec``, most similar first.

    Raises
    ------
    IndexError
        If ``restaurant_index`` is outside the similarity matrix.
    """
    if similarity_matrix is None:
        similarity_matrix = enhanced_cosine_sim
    similarity_matrix = np.asarray(similarity_matrix)

    n_restaurants = similarity_matrix.shape[0]
    if restaurant_index < 0:
        restaurant_index += n_restaurants
    if not 0 <= restaurant_index < n_restaurants:
        raise IndexError(
            f"restaurant_index out of range for a similarity matrix of {n_restaurants} restaurants"
        )

    # Rank every restaurant by descending similarity; the stable sort keeps ties in row order
    scores = similarity_matrix[restaurant_index]
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query restaurant by index rather than assuming it is ranked first:
    # an identical listing earlier in the data ties at the top score and would
    # otherwise push the query restaurant into its own recommendations.
    ranked = ranked[ranked != restaurant_index][:max(n_recommendations, 0)]

    # Return recommended restaurants
    return df_rec[['name', 'cuisines', 'rate', 'approx_cost(for two people)']].iloc[ranked]

# Test the recommendation system
print("\nTesting recommendation system...")
print("Sample recommendation for restaurant #100:")
try:
    recommendations = get_recommendations(100, n_recommendations=5)
    print(recommendations)
except IndexError:
    # Only an out-of-range index (fewer than 101 restaurants in the similarity matrix)
    # is an expected failure here; any other error should surface instead of being
    # hidden by a bare `except:`.
    print("Trying restaurant #0 instead...")
    recommendations = get_recommendations(0, n_recommendations=5)
    print(recommendations)

# 7. Save Only What's Needed for Recommendations
print("Saving recommendation features...")

# In-memory bundle of the similarity matrix and the matching restaurant rows.
# It is NOT written to disk here; only the feature table below is persisted.
recommendation_data = {
    'similarity_matrix': enhanced_cosine_sim,
    'restaurant_names': df_rec['name'][:sample_size].values,
    'restaurant_info': df_rec[['name', 'cuisines', 'rate', 'approx_cost(for two people)', 'location']][:sample_size]
}

# Save processed recommendation data (every row of df_rec, not just the sampled ones)
df_rec.to_csv('zomato_recommendation_features.csv', index=False)
# Report the number of rows actually written; the CSV is not limited to `sample_size`
print(f"Recommendation features saved for {len(df_rec)} restaurants!")

print("\nFinal recommendation features created:")
print(f"- Content similarity: {tfidf_sample.shape[1]} dimensions")
print("- Service features: 2 dimensions (online_order, book_table)")
print("- Rating features: 1 dimension")
print("- Cost features: 1 dimension")
print(f"- Total feature dimensions: {enhanced_features.shape[1]}")
print(f"- Restaurants in recommendation system: {sample_size}")

Cleaning recommendation-relevant data...
Extracting recommendation-specific features...
Creating similarity features...
Computing similarity matrix for 3000 restaurants...
Scaling numerical features...
Creating final recommendation features...
Computing enhanced similarity matrix...

Testing recommendation system...
Sample recommendation for restaurant #100:
                                 name      cuisines  rate  \
693  Hanumanthanagar Biryani Junction  South Indian   3.5   
701                 Kidambi's Kitchen  South Indian   3.5   
667                   Simple Thindies  South Indian   3.5   
700                  SVKP Daily Fresh  South Indian   3.3   
749                 New Prakash Hotel  South Indian   3.7   

     approx_cost(for two people)  
693                        300.0  
701                        300.0  
667                        200.0  
700                        350.0  
749                        250.0  
Saving recommendation features...
Recommendation features save

**Display Clean Data**

In [14]:
# Reload the exported feature table from the same relative path the export step wrote to,
# so the round trip does not depend on the working directory being /content
df2 = pd.read_csv('zomato_recommendation_features.csv')
df2.head()

Unnamed: 0,name,rate,approx_cost(for two people),online_order,book_table,votes,cuisines,rest_type,location,restaurant_profile,online_order_encoded,book_table_encoded,rate_scaled,cost_scaled,votes_scaled
0,Jalsa,4.1,800.0,Yes,Yes,775,"North Indian, Mughlai, Chinese",Casual Dining,Banashankari,"North Indian, Mughlai, Chinese Casual Dining B...",1,1,1.010753,0.561314,0.611201
1,Spice Elephant,4.1,800.0,Yes,No,787,"Chinese, North Indian, Thai",Casual Dining,Banashankari,"Chinese, North Indian, Thai Casual Dining Bana...",1,0,1.010753,0.561314,0.62613
2,San Churro Cafe,3.8,800.0,Yes,No,918,"Cafe, Mexican, Italian","Cafe, Casual Dining",Banashankari,"Cafe, Mexican, Italian Cafe, Casual Dining Ban...",1,0,0.252002,0.561314,0.789099
3,Addhuri Udupi Bhojana,3.7,300.0,No,No,88,"South Indian, North Indian",Quick Bites,Banashankari,"South Indian, North Indian Quick Bites Banasha...",0,0,-0.000915,-0.581388,-0.243456
4,Grand Village,3.8,600.0,No,No,166,"North Indian, Rajasthani",Casual Dining,Basavanagudi,"North Indian, Rajasthani Casual Dining Basavan...",0,0,0.252002,0.104233,-0.146421


In [15]:
# (rows, columns) of the reloaded feature table
df2.shape

(51717, 15)