## Loading Dataset and Filtering

In [4]:
import json
import gzip
from collections import Counter
from tqdm import tqdm
import os


DATA_PATH = 'D:/Projects/Yelp-Recommendation-System/data/raw'

# Define file paths (each file is JSON Lines: one JSON object per line)
business_fp = os.path.join(DATA_PATH, 'business.json')
user_fp     = os.path.join(DATA_PATH, 'user.json')
review_fp   = os.path.join(DATA_PATH, 'review.json')

# STEP 1: Load business_id → categories mapping
print("Loading business categories...")

business_categories = {}
with open(business_fp, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        bid = data['business_id']
        cats = data.get('categories')
        # Businesses with a missing/empty category string are left out of the mapping
        if cats:
            business_categories[bid] = [c.strip() for c in cats.split(',')]

print(f"Loaded categories for {len(business_categories):,} businesses")

# STEP 2: Count reviews per category by streaming review.json
print("Streaming reviews and counting per category...")

category_review_count = Counter()

with open(review_fp, 'r', encoding='utf-8') as f:
    # total is the line count observed on a full pass over review.json; an
    # estimate below the real count makes tqdm drop the progress bar/ETA.
    for line in tqdm(f, total=6_990_280):
        data = json.loads(line)
        bid = data['business_id']
        cats = business_categories.get(bid)
        if cats:
            # A review counts once towards every category of its business
            category_review_count.update(cats)

# STEP 3: Show top categories
print("\nTop 20 categories by review count:")
for cat, count in category_review_count.most_common(20):
    print(f"{cat}: {count:,}")


Loading business categories...
Loaded categories for 150,243 businesses
Streaming reviews and counting per category...


6990280it [01:33, 74477.55it/s]                              


Top 20 categories by review count:
Restaurants: 4,724,471
Food: 1,813,593
Nightlife: 1,539,757
Bars: 1,455,553
American (Traditional): 1,011,646
American (New): 984,540
Breakfast & Brunch: 867,430
Sandwiches: 691,864
Seafood: 620,247
Event Planning & Services: 609,553
Shopping: 523,254
Pizza: 475,819
Burgers: 445,895
Coffee & Tea: 442,348
Italian: 439,358
Mexican: 431,020
Beauty & Spas: 370,121
Arts & Entertainment: 345,059
Cocktail Bars: 339,102
Salad: 333,444





In [29]:
import json
import pandas as pd
from tqdm import tqdm

# Define categories
selected_categories = {'Restaurants', 'Nightlife', 'Bars', 'Shopping', 'Beauty & Spas'}

# Step 1: Load businesses with selected categories
business_meta = {}

with open(business_fp, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        bid = data['business_id']
        cats = data.get('categories')
        city = data.get('city')
        if not cats:
            continue  # no category string -> cannot match any selected category
        parsed_cats = [c.strip() for c in cats.split(',')]
        selected = [c for c in parsed_cats if c in selected_categories]
        if selected:
            # The primary category is the first selected category in the order
            # the business lists them (not the order of selected_categories).
            business_meta[bid] = {
                'primary_category': selected[0],  # use first matched
                'city': city
            }

print(f"Businesses matched: {len(business_meta):,}")


# Step 2: Stream the reviews and keep only those of matched businesses
filtered = []

with open(review_fp, 'r', encoding='utf-8') as f:
    # total is the line count observed on a full pass over review.json; an
    # estimate below the real count makes tqdm drop the progress bar/ETA.
    for line in tqdm(f, total=6_990_280):
        data = json.loads(line)
        bid = data['business_id']
        meta = business_meta.get(bid)
        if meta is None:
            continue  # review of a business outside the selected categories
        filtered.append({
            'user_id': data['user_id'],
            'business_id': bid,
            'stars': data['stars'],
            'primary_category': meta['primary_category'],
            'business_city': meta['city']
        })

df_filtered = pd.DataFrame(filtered)
print(f"Filtered reviews: {len(df_filtered):,}")



Businesses matched: 91,610


6990280it [01:38, 70788.63it/s]                              


Filtered reviews: 5,660,538


In [31]:
import pandas as pd

# Step 1: Minimum number of reviews a business must have, per primary category
category_thresholds = {
    'Restaurants': 200,
    'Nightlife': 50,
    'Bars': 50,
    'Shopping': 20,
    'Beauty & Spas': 20
}

# Step 2: Within each category keep only businesses that reach its threshold
filtered_parts = []

for category, min_reviews in category_thresholds.items():
    # Independent copy of the rows belonging to this category
    in_category = df_filtered['primary_category'].eq(category)
    df_cat = df_filtered.loc[in_category].copy()

    # Reviews per business, then the ids that are frequent enough
    business_counts = df_cat['business_id'].value_counts()
    valid_businesses = business_counts.loc[business_counts.ge(min_reviews)].index

    # Rows of the surviving businesses only
    keep_rows = df_cat['business_id'].isin(valid_businesses)
    df_cat_filtered = df_cat.loc[keep_rows].copy()

    filtered_parts.append(df_cat_filtered)

# Step 3: Stack the per-category results under a fresh 0..n-1 index
df_combined = pd.concat(filtered_parts, ignore_index=True).copy()

# Step 4: Global user threshold, counted on the business-filtered reviews only
MIN_USER_REVIEWS = 10
user_counts = df_combined['user_id'].value_counts()
valid_users = user_counts.loc[user_counts.ge(MIN_USER_REVIEWS)].index

keep_users = df_combined['user_id'].isin(valid_users)
df_filtered_final = df_combined.loc[keep_users].copy()

# Step 5: Report the size and composition of the final dataset
print(f"Final filtered reviews: {len(df_filtered_final):,}")
print(f"Unique users: {df_filtered_final['user_id'].nunique():,}")
print(f"Unique businesses: {df_filtered_final['business_id'].nunique():,}")
print("Category distribution after filtering:")
print(df_filtered_final['primary_category'].value_counts())


Final filtered reviews: 1,252,016
Unique users: 50,935
Unique businesses: 19,371
Category distribution after filtering:
primary_category
Restaurants      719209
Nightlife        189260
Bars             168344
Shopping         112875
Beauty & Spas     62328
Name: count, dtype: int64


In [32]:
# Number of retained reviews per user, most active users first
df_filtered_final['user_id'].value_counts()

user_id
_BcWyKQL16ndpBdggh2kNA    970
ET8n-r7glWYqZhuR6GcdNw    926
0Igx-a1wAstiBDerGxXk2A    689
1HM81n6n4iPIFU5d2Lokhw    685
bYENop4BuQepBjM1-BI3fA    666
                         ... 
h4pt1pleLJfMo1XBhn-E3g     10
OaKr_CcotOyd2e3BmfoVqg     10
0pOWasstHx2-k4bexUOj5A     10
wvmyHVWHWzRumGk-yuN0Fg     10
IWixoDe6Vbyo7IRTbJj_KQ     10
Name: count, Length: 50935, dtype: int64

## Summary 


### Rationale for Category Selection

The Yelp dataset contains a wide range of business types. Rather than attempting to model all categories simultaneously, we focused on a subset that is highly relevant to user interests and has sufficient review volume to support learning. The categories selected are:

- Restaurants
- Nightlife
- Bars
- Shopping
- Beauty & Spas

These categories are commonly interacted with by users and often contain subjective and experience-based feedback, making them particularly suitable for recommender systems.

### Motivation for Category-Specific Thresholding

Initial filtering based on a single threshold across all categories led to the exclusion of many useful businesses in smaller categories. To address this, we introduced category-specific filtering thresholds for businesses:

- Restaurants: minimum 200 reviews per business
- Nightlife: minimum 50 reviews
- Bars: minimum 50 reviews
- Shopping: minimum 20 reviews
- Beauty & Spas: minimum 20 reviews

A consistent user-level filter was applied: users must have at least 10 reviews to be included. This approach retains informative users while ensuring enough data remains in less frequently reviewed categories.

### Category Grouping for Modeling

For modeling purposes, the categories are grouped as follows:

- Group A: Restaurants
- Group B: Bars and Nightlife – these are grouped together to increase data availability and due to similar user engagement patterns.
- Group C: Shopping and Beauty & Spas – these are grouped together to increase data availability and due to similar user engagement patterns.

This grouping balances data representation across categories and supports the design of shared or multi-task recommendation architectures.

### Final Dataset Statistics

After applying the filtering and grouping strategy:

Final filtered reviews: 1,252,016
Unique users: 50,935
Unique businesses: 19,371

This dataset is now well-prepared for the next steps of ID encoding, train-test splitting, and building baseline and advanced recommendation models.

#### Why Business-First Then User Filtering?

We chose to filter businesses first, before filtering users, to make sure we are working only with businesses that have a good number of reviews. This helps in a few important ways:

1. **Remove low-quality businesses early**: Some businesses might only have a few reviews, which may not be enough for a recommendation model to learn from. Removing these early avoids wasting effort on businesses with very little data.

2. **Avoid keeping users with weak interactions**: If we filtered users first, a user could pass the threshold thanks to reviews of businesses that are removed afterwards (because those businesses have too few reviews). Such a user would then be left with fewer useful reviews than the threshold requires and would be less relevant for modeling.

3. **Focus on strong user-business connections**: By filtering out weak businesses first, we make sure that when we check a user’s activity, it’s only based on reviews of good, active businesses. This gives us a clearer picture of how active and useful each user is.

4. **Create a denser, higher-quality dataset**: In the final step, we keep users who have reviewed at least a minimum number of the valid businesses. This leads to a better, cleaner dataset where each user and business has enough interactions to be useful in training our models.

Overall, this approach helps improve the quality of the data and ensures our recommendation model learns from strong and meaningful interactions.


In [37]:
# Summarize columns, non-null counts, dtypes and memory usage of the final dataset
df_filtered_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1252016 entries, 3 to 3453505
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   user_id           1252016 non-null  object 
 1   business_id       1252016 non-null  object 
 2   stars             1252016 non-null  float64
 3   primary_category  1252016 non-null  object 
 4   business_city     1252016 non-null  object 
dtypes: float64(1), object(4)
memory usage: 57.3+ MB


In [38]:
## Finally save the dataset for further use and reproducibility

import os

# Define folder path and create it if it doesn't exist
# (the filtered CSV is written next to the raw Yelp JSON files in DATA_PATH)
output_dir = DATA_PATH
os.makedirs(output_dir, exist_ok=True)

# Save to CSV; index=False drops the DataFrame index so only the data columns are written
output_path = os.path.join(output_dir, 'yelp_filtered_reviews.csv')
df_filtered_final.to_csv(output_path, index=False)

print(f"Filtered dataset saved to: {output_path}")


Filtered dataset saved to: D:/Projects/Yelp-Recommendation-System/data/raw\yelp_filtered_reviews.csv


## Data Pre-processing and EDA functions 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import dagshub
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import faiss
from collections import defaultdict
import scipy.sparse as sp
from google.colab import drive
import pandas as pd
import os

# ================= CONFIGURATION =================
# Defines all essential parameters for the pipeline,
# including data paths, MLflow settings, model hyperparameters,
# and training specifics.
CONFIG = {
    # Path to the filtered review CSV. The project folder is spelled
    # "Yelp-Recommendation-System", matching the directory the filtering
    # cells write yelp_filtered_reviews.csv to.
    "csv_path": "D:/Projects/Yelp-Recommendation-System/data/raw/yelp_filtered_reviews.csv",
    "mlflow_tracking_uri": "https://dagshub.com/Adarshreddy99/Yelp-Recommendation-System.mlflow", # MLflow tracking server URI
    "dagshub_repo_owner": "Adarshreddy99", # DagsHub repository owner
    "dagshub_repo_name": "Yelp-Recommendation-System", # DagsHub repository name
    "experiment_name": "Hybrid_Recommendation_System", # Name of the MLflow experiment
    "embedding_dim": 64,    # Dimension of user and item embeddings
    "hidden_dim": 128,      # Dimension of hidden layers in networks
    "batch_size": 1024,     # Batch size for training
    "learning_rate": 0.001, # Learning rate for optimizer
    "epochs": 20,           # Number of training epochs
    "test_size": 0.2,       # Proportion of data for the test set
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu") # Device to run training (GPU if available, else CPU)
}

# ============= MLflow Setup =============
def setup_mlflow():
    """
    Configures MLflow tracking for this run.

    Sets the tracking URI, initializes the DagsHub integration and selects the
    experiment. All values are read from CONFIG so the repository owner/name
    are defined in exactly one place.
    """
    mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
    # Initialize DagsHub for MLflow integration
    dagshub.init(
        repo_owner=CONFIG["dagshub_repo_owner"],
        repo_name=CONFIG["dagshub_repo_name"],
        mlflow=True,
    )
    mlflow.set_experiment(CONFIG["experiment_name"])
    print("MLflow setup complete")

# ============= Data Loading & Preprocessing =============

def load_data(file_path_on_drive):
    """
    Mounts Google Drive, reads a CSV located under the mount point and validates it.

    Args:
        file_path_on_drive (str): Path of the CSV relative to the Drive mount
            point (e.g., 'MyDrive/data/file.csv').

    Returns:
        pd.DataFrame: The loaded data without rows that have a null in
        'user_id', 'business_id' or 'stars'.

    Raises:
        ValueError: If any of the essential columns is absent from the CSV.
    """
    mount_point = '/content/drive'
    essential = ['user_id', 'business_id', 'stars']

    # Mount Google Drive and resolve the CSV location beneath it.
    # NOTE(review): os.path.join discards the mount point when given an
    # absolute path — confirm callers pass a Drive-relative path.
    drive.mount(mount_point)
    full_path = os.path.join(mount_point, file_path_on_drive)
    print(f"Loading data from: {full_path}")

    raw = pd.read_csv(full_path)
    n_rows, n_cols = raw.shape
    print(f"Loaded data: {n_rows} rows, {n_cols} columns")

    # Fail early when a column the pipeline relies on is missing
    absent = [name for name in essential if name not in raw.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    # Drop rows lacking any essential value and report how many were removed
    cleaned = raw.dropna(subset=essential)
    print(f"Removed {n_rows - len(cleaned)} rows with nulls in essential columns. After cleaning: {cleaned.shape}")

    return cleaned

def categorize_businesses(df):
    """
    Assigns every row one of the modeling groups 'Restaurants', 'BarsNightlife'
    or 'ShoppingBeauty' and drops rows that match none of them.

    The group is derived by keyword matching on the first available source column:
      1. 'categories'       - raw category string of the business
      2. 'primary_category' - the single category stored in the filtered review CSV
    Only when neither column exists are categories assigned at random (demo
    fallback); that fallback is not seeded and therefore not reproducible.

    Matching is a case-insensitive substring test, checked in the order
    restaurants -> nightlife -> shopping/beauty, so the first group with a hit wins.

    The input DataFrame is not modified; a new DataFrame is returned.

    Args:
        df (pd.DataFrame): The input DataFrame containing business data.

    Returns:
        pd.DataFrame: A copy of the data with a new 'category' column and 'Other' rows filtered out.
    """
    # Define keywords for each category
    restaurant_keywords = ['restaurant', 'food', 'pizza', 'burger', 'cafe', 'diner', 'bistro', 'steak', 'sushi', 'deli']
    nightlife_keywords = ['bar', 'pub', 'nightlife', 'club', 'lounge', 'brewery', 'cocktail', 'wine']
    beauty_shopping_keywords = ['beauty', 'spa', 'salon', 'shopping', 'retail', 'boutique', 'fashion', 'store', 'mall', 'hair']

    def assign_category(categories_str):
        """Helper function to assign a single category based on keywords."""
        if pd.isna(categories_str):
            return 'Other'

        categories_lower = str(categories_str).lower()

        # NOTE: substring matching, e.g. 'bar' also matches 'Barbers'
        if any(keyword in categories_lower for keyword in restaurant_keywords):
            return 'Restaurants'
        elif any(keyword in categories_lower for keyword in nightlife_keywords):
            return 'BarsNightlife'
        elif any(keyword in categories_lower for keyword in beauty_shopping_keywords):
            return 'ShoppingBeauty'
        else:
            return 'Other'

    if 'categories' in df.columns:
        category_values = df['categories'].apply(assign_category)
    elif 'primary_category' in df.columns:
        # The filtered review CSV carries 'primary_category' instead of the raw
        # 'categories' string; using it keeps each business in one real group
        # instead of falling through to the random demo assignment.
        category_values = df['primary_category'].apply(assign_category)
    else:
        # Fallback for demonstration if no category information is found
        print("Warning: No 'categories' column found. Randomly assigning categories for demo purposes.")
        categories = ['Restaurants', 'BarsNightlife', 'ShoppingBeauty']
        category_values = np.random.choice(categories, size=len(df))

    # assign() returns a new frame, leaving the caller's DataFrame untouched
    initial_rows = len(df)
    df = df.assign(category=category_values)

    # Filter out businesses classified as 'Other' to focus on relevant categories
    df = df[df['category'] != 'Other'].copy()
    print(f"Filtered out {initial_rows - len(df)} 'Other' category businesses. After categorization: {df.shape}")
    print(f"Category distribution:\n{df['category'].value_counts()}")

    return df

def create_mappings(df):
    """
    Creates unique integer mappings for users and category-specific items.
    This is essential for embedding layers in neural networks.

    User indices are global (one index space for all users). Item indices are
    per category: every category numbers its businesses from 0, in order of
    first appearance, so the same integer can denote different businesses in
    different categories.

    The input DataFrame is not modified; a new DataFrame is returned.

    Args:
        df (pd.DataFrame): The DataFrame with 'user_id', 'business_id', and 'category' columns.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: The DataFrame with 'user_idx' and 'item_idx' columns.
            - dict: A mapping from original user_id to integer user_idx.
            - dict: A dictionary of category-specific item mappings (original item_id to integer item_idx).
    """
    # Create a global mapping for all unique users
    user2idx = {u: i for i, u in enumerate(df['user_id'].unique())}

    # Create category-specific mappings for items (businesses)
    item_mappings = {}
    for category in df['category'].unique():
        cat_items = df.loc[df['category'] == category, 'business_id'].unique()
        item_mappings[category] = {item: i for i, item in enumerate(cat_items)}

    # Map items one category at a time (vectorized) instead of row by row.
    # -1 marks rows whose business could not be mapped within its category.
    category_values = df['category'].to_numpy()
    item_idx = np.full(len(df), -1, dtype=np.int64)
    for category, mapping in item_mappings.items():
        in_category = category_values == category
        mapped = df.loc[in_category, 'business_id'].map(mapping)
        item_idx[in_category] = mapped.fillna(-1).to_numpy(dtype=np.int64)

    # assign() returns a new frame, leaving the caller's DataFrame untouched
    df = df.assign(user_idx=df['user_id'].map(user2idx), item_idx=item_idx)

    # Remove any rows where mapping failed (e.g., business_id not found in its category's mapping)
    initial_rows = len(df)
    df = df[df['item_idx'] != -1].copy()
    print(f"Removed {initial_rows - len(df)} rows due to mapping failures. Final data shape: {df.shape}")
    print(f"Total number of unique users: {len(user2idx)}")
    for cat, mapping in item_mappings.items():
        print(f"Total number of unique items in '{cat}': {len(mapping)}")

    return df, user2idx, item_mappings

def split_data(df, test_size=0.2, random_state=42):
    """
    Splits the preprocessed DataFrame into training and testing sets.
    Stratifies the split by 'category' to ensure balanced category distribution in both sets.

    Args:
        df (pd.DataFrame): The input DataFrame with 'category' column.
        test_size (float): The proportion of the dataset to include in the test split.
        random_state (int): Seed passed to the splitter; the default of 42
            reproduces the previously hard-coded behavior.

    Returns:
        tuple: A tuple containing the training and testing DataFrames.
    """
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['category'],   # Ensure category distribution is preserved
        random_state=random_state  # For reproducibility
    )

    print(f"Train set size: {train_df.shape}")
    print(f"Test set size: {test_df.shape}")

    return train_df, test_df

# ============= Exploratory Data Analysis (EDA) Functions =============
def perform_eda(df):
    """
    Performs Exploratory Data Analysis (EDA) on the DataFrame.

    Saves three figures to the working directory and logs each one as an
    MLflow artifact:
      - rating_distribution.png    : histogram of 'stars'
      - category_distribution.png  : number of reviews per 'category'
      - activity_distributions.png : reviews-per-user histograms (linear and
        log y-axis) in the top row and one reviews-per-item histogram per
        category in the bottom row
    Then calculates and prints the sparsity of the user-item matrix per category.

    Args:
        df (pd.DataFrame): The preprocessed DataFrame; reads the columns
            'stars', 'category', 'user_idx' and 'item_idx'.

    Returns:
        dict: Maps '<category>_sparsity' to 1 - ratings / (users * items) for each category.
    """
    print("Starting EDA...")

    # 1. Rating Distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df['stars'], bins=5, kde=True, color='skyblue')
    plt.title("Distribution of Ratings (Stars)", fontsize=16)
    plt.xlabel("Stars Rating", fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.grid(axis='y', alpha=0.75)
    plt.xticks([1, 2, 3, 4, 5])
    plt.tight_layout()
    plt.savefig("rating_distribution.png", dpi=300, bbox_inches='tight')
    mlflow.log_artifact("rating_distribution.png")
    plt.close() # Close plot to free memory
    print("Logged 'rating_distribution.png'")

    # 2. Category Distribution
    plt.figure(figsize=(10, 6))
    df['category'].value_counts().plot(kind='bar', color='lightcoral')
    plt.title("Distribution of Business Categories", fontsize=16)
    plt.xlabel("Category", fontsize=12)
    plt.ylabel("Number of Reviews", fontsize=12)
    plt.xticks(rotation=45, ha='right') # Rotate labels for better readability
    plt.grid(axis='y', alpha=0.75)
    plt.tight_layout()
    plt.savefig("category_distribution.png", dpi=300, bbox_inches='tight')
    mlflow.log_artifact("category_distribution.png")
    plt.close()
    print("Logged 'category_distribution.png'")

    # 3. User and Item Activity Distributions
    user_counts = df['user_idx'].value_counts()
    item_categories = ['Restaurants', 'BarsNightlife', 'ShoppingBeauty']
    colors = ['cornflowerblue', 'orange', 'purple'] # One color per item category

    # 2x3 grid: the bottom row has one panel per category, so the third
    # category no longer overplots (and retitles) the second category's panel.
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # User activity distribution (linear scale)
    sns.histplot(user_counts, bins=50, ax=axes[0, 0], color='lightgreen')
    axes[0, 0].set_title("User Activity Distribution (Linear Scale)", fontsize=14)
    axes[0, 0].set_xlabel("Number of Reviews per User", fontsize=11)
    axes[0, 0].set_ylabel("Number of Users", fontsize=11)

    # User activity distribution (log scale)
    sns.histplot(user_counts, bins=50, ax=axes[0, 1], color='lightgreen')
    axes[0, 1].set_yscale('log') # Logarithmic scale for frequency
    axes[0, 1].set_title("User Activity Distribution (Log Scale)", fontsize=14)
    axes[0, 1].set_xlabel("Number of Reviews per User", fontsize=11)
    axes[0, 1].set_ylabel("Number of Users (Log Scale)", fontsize=11)

    # The top-right cell has no content
    axes[0, 2].set_visible(False)

    # Item activity by category, each in its own bottom-row panel
    present_categories = set(df['category'].unique())
    for i, category in enumerate(item_categories):
        ax = axes[1, i]
        if category not in present_categories:
            ax.set_visible(False) # Hide the panel of a category absent from the data
            continue

        item_counts = df.loc[df['category'] == category, 'item_idx'].value_counts()
        sns.histplot(item_counts, bins=30, ax=ax, alpha=0.7, label=category, color=colors[i])
        ax.set_title(f"{category} Item Activity", fontsize=14)
        ax.set_xlabel("Number of Reviews per Item", fontsize=11)
        ax.set_ylabel("Number of Items", fontsize=11)
        ax.legend()

    plt.tight_layout() # Adjust subplot parameters for a tight layout
    plt.savefig("activity_distributions.png", dpi=300, bbox_inches='tight')
    mlflow.log_artifact("activity_distributions.png")
    plt.close(fig)
    print("Logged 'activity_distributions.png'")

    # 4. Calculate and log sparsity metrics
    sparsity_metrics = {}
    print("\nCalculating data sparsity:")
    for category in df['category'].unique():
        cat_data = df[df['category'] == category]
        n_users_cat = cat_data['user_idx'].nunique()
        n_items_cat = cat_data['item_idx'].nunique()
        n_ratings_cat = len(cat_data)

        # Avoid division by zero if n_users or n_items is 0 for a category
        if n_users_cat > 0 and n_items_cat > 0:
            sparsity = 1 - (n_ratings_cat / (n_users_cat * n_items_cat))
            sparsity_metrics[f'{category}_sparsity'] = sparsity
            print(f"  Category '{category}':")
            print(f"    Unique Users: {n_users_cat}, Unique Items: {n_items_cat}, Total Ratings: {n_ratings_cat}")
            print(f"    Sparsity: {sparsity:.4f}")
        else:
            print(f"  Category '{category}': No data or no unique users/items found.")
            sparsity_metrics[f'{category}_sparsity'] = 1.0 # Fully sparse if no interactions

    print("EDA complete.")
    return sparsity_metrics

# ============= Model Architecture =============
class UserNetwork(nn.Module):
    """
    User tower: looks up a learned embedding per user index and refines it with
    a small MLP (Linear -> BatchNorm -> ReLU -> Dropout -> Linear) whose output
    has the same width as the embedding.
    """
    def __init__(self, num_users, emb_dim=64, hidden_dim=128):
        """
        Builds the embedding table and the MLP.

        Args:
            num_users (int): Number of rows in the user embedding table.
            emb_dim (int): Width of the embedding and of the network output.
            hidden_dim (int): Width of the hidden layer inside the MLP.
        """
        super().__init__()
        # Lookup table: user index -> emb_dim vector
        self.embedding = nn.Embedding(num_users, emb_dim)
        # MLP applied on top of the raw embedding
        mlp_layers = [
            nn.Linear(emb_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),      # normalizes hidden activations
            nn.ReLU(),
            nn.Dropout(0.2),                 # regularization
            nn.Linear(hidden_dim, emb_dim),  # project back to emb_dim
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, user_ids):
        """
        Computes the refined embeddings for a batch of users.

        Args:
            user_ids (torch.Tensor): Tensor of user indices.

        Returns:
            torch.Tensor: User embeddings.
        """
        return self.fc(self.embedding(user_ids))

class ItemNetwork(nn.Module):
    """
    Item tower: looks up a learned embedding per item index and refines it with
    a small MLP (Linear -> BatchNorm -> ReLU -> Dropout -> Linear) whose output
    has the same width as the embedding. Same layout as UserNetwork.
    """
    def __init__(self, num_items, emb_dim=64, hidden_dim=128):
        """
        Builds the embedding table and the MLP.

        Args:
            num_items (int): Number of rows in the item embedding table.
            emb_dim (int): Width of the embedding and of the network output.
            hidden_dim (int): Width of the hidden layer inside the MLP.
        """
        super().__init__()
        # Lookup table: item index -> emb_dim vector
        self.embedding = nn.Embedding(num_items, emb_dim)
        # MLP applied on top of the raw embedding
        mlp_layers = [
            nn.Linear(emb_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, emb_dim),
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, item_ids):
        """
        Computes the refined embeddings for a batch of items.

        Args:
            item_ids (torch.Tensor): Tensor of item indices.

        Returns:
            torch.Tensor: Item embeddings.
        """
        return self.fc(self.embedding(item_ids))

class HybridRecommendationModel(nn.Module):
    """
    The main hybrid recommendation model combining a shared user network
    and category-specific item networks. It predicts ratings based on the
    interaction between user and item embeddings.
    """
    def __init__(self, num_users, category_item_counts, emb_dim=64, hidden_dim=128):
        """
        Initializes the HybridRecommendationModel.

        Args:
            num_users (int): Total number of unique users across all categories.
            category_item_counts (dict): A dictionary where keys are category names
                                         and values are the number of unique items in that category.
            emb_dim (int): Dimension of user and item embeddings.
            hidden_dim (int): Dimension of hidden layers in the networks.
        """
        super(HybridRecommendationModel, self).__init__()

        # Shared user network for all users
        self.user_net = UserNetwork(num_users, emb_dim, hidden_dim)

        # ModuleDict to hold category-specific item networks
        # This allows dynamic access to the correct item network based on category
        self.item_nets = nn.ModuleDict({
            category: ItemNetwork(count, emb_dim, hidden_dim)
            for category, count in category_item_counts.items()
        })

        # Output layer for rating prediction (maps interaction to a single rating value)
        self.rating_predictor = nn.Sequential(
            nn.Linear(emb_dim, 32), # Intermediate linear layer
            nn.ReLU(),              # Activation
            nn.Dropout(0.1),        # Dropout
            nn.Linear(32, 1),       # Output to a single scalar
            nn.Sigmoid()            # Sigmoid to normalize output between 0 and 1
        )

    def forward(self, user_ids, item_ids, category):
        """
        Forward pass for the HybridRecommendationModel.

        Args:
            user_ids (torch.Tensor): Tensor of user indices.
            item_ids (torch.Tensor): Tensor of item indices (specific to the given category).
            category (str): The category name (e.g., 'Restaurants', 'BarsNightlife');
                must be a key of category_item_counts given at construction.

        Returns:
            tuple: A tuple containing:
                - torch.Tensor: Predicted ratings (scaled to 1-5), one per input
                  pair; the batch dimension is kept even for a batch of one.
                - torch.Tensor: User embeddings.
                - torch.Tensor: Item embeddings (from the specific category network).
        """
        # Get user embeddings from the shared user network
        user_emb = self.user_net(user_ids)

        # Get category-specific item embeddings using the correct item network
        # The category argument selects which item network to use from ModuleDict
        item_emb = self.item_nets[category](item_ids)

        # Compute interaction between user and item embeddings (element-wise product)
        interaction = user_emb * item_emb

        # Predict rating using the rating predictor head and scale the sigmoid
        # output (0-1) to the 1-5 rating scale. Only the trailing size-1 feature
        # dimension is squeezed: a bare squeeze() would also drop the batch
        # dimension for a single-sample batch and return a 0-d tensor that no
        # longer lines up with the target ratings.
        rating = self.rating_predictor(interaction).squeeze(-1) * 4 + 1

        return rating, user_emb, item_emb

# ============= Dataset and DataLoader =============
class RecommendationDataset(Dataset):
    """
    PyTorch Dataset view over a pandas DataFrame of user/item interactions.

    Each sample is exposed as a small dictionary so that the collate function
    can later regroup samples by their category.
    """

    # (output key, DataFrame column) pairs, in the order they appear in each sample
    _SAMPLE_FIELDS = (
        ('user_id', 'user_idx'),
        ('item_id', 'item_idx'),
        ('rating', 'stars'),
        ('category', 'category'),
    )

    def __init__(self, df):
        """
        Stores the interaction frame with a fresh 0..n-1 index.

        Args:
            df (pd.DataFrame): Frame holding the 'user_idx', 'item_idx', 'stars' and 'category' columns.
        """
        # Discard the incoming index so positional access lines up with row order
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """
        Number of interaction rows held by the dataset.
        """
        return len(self.df)

    def __getitem__(self, idx):
        """
        Fetches one interaction by position.

        Args:
            idx (int): Positional index of the row.

        Returns:
            dict: Mapping with the keys 'user_id', 'item_id', 'rating' and 'category'.
        """
        record = self.df.iloc[idx]
        return {key: record[column] for key, column in self._SAMPLE_FIELDS}

def collate_fn(batch):
    """
    Collate function for the DataLoader that buckets samples by category.

    The model routes every category through its own item network, so a
    mini-batch is handed over as one list of samples per category rather
    than as stacked tensors.

    Args:
        batch (list): Sample dictionaries produced by RecommendationDataset.

    Returns:
        defaultdict: Maps each category found in the batch to the list of its
                     samples, preserving the order in which they were drawn.
    """
    grouped = defaultdict(list)

    for sample in batch:
        bucket = grouped[sample['category']]
        bucket.append(sample)

    return grouped

# ============= Training Functions =============
def train_epoch(model, dataloader, optimizer, criterion, device):
    """
    Performs one epoch of training for the model.

    Each mini-batch arrives grouped by category. A loss is computed per category
    group, the group losses are summed, and a single optimizer step is taken
    per mini-batch.

    Args:
        model (nn.Module): The recommendation model.
        dataloader (DataLoader): DataLoader for the training data.
        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
        criterion (nn.Module): Loss function (e.g., MSELoss).
        device (torch.device): The device (CPU/GPU) to run the training on.

    Returns:
        float: The summed per-category loss averaged over the mini-batches that
            produced an optimizer step (0.0 when there were none).
    """
    model.train() # Set model to training mode
    total_loss = 0
    num_batches = 0

    for batch_dict in dataloader:
        optimizer.zero_grad() # Clear gradients from previous step
        batch_loss = 0 # Sum of the per-category losses of this mini-batch

        # Iterate through each category in the batch (grouped by collate_fn)
        for category, items in batch_dict.items():
            if not items:
                continue # Skip if no items for this category in the current batch

            # Convert list of dicts to tensors
            user_ids = torch.tensor([item['user_id'] for item in items],
                                    dtype=torch.long, device=device)
            item_ids = torch.tensor([item['item_id'] for item in items],
                                    dtype=torch.long, device=device)
            ratings = torch.tensor([item['rating'] for item in items],
                                   dtype=torch.float, device=device)

            # Forward pass: get predicted ratings
            pred_ratings, _, _ = model(user_ids, item_ids, category)

            # A group holding a single sample can come back as a 0-d tensor; flatten
            # so predictions and targets always have the same 1-D shape instead of
            # relying on broadcasting inside the loss.
            loss = criterion(pred_ratings.reshape(-1), ratings)
            batch_loss += loss

        # Step only when the summed loss is positive. This skips mini-batches with no
        # usable group (batch_loss is still the int 0) and also any mini-batch whose
        # loss is exactly zero or NaN.
        if batch_loss > 0:
            batch_loss.backward() # Compute gradients for this mini-batch
            optimizer.step()      # Update model parameters
            total_loss += batch_loss.item() # Add loss to total
            num_batches += 1

    # Return average loss, handling cases where num_batches might be zero
    return total_loss / max(num_batches, 1)

def evaluate_model(model, dataloader, device):
    """
    Evaluates the model on a given dataset.

    Args:
        model (nn.Module): The recommendation model.
        dataloader (DataLoader): DataLoader for the evaluation data; batches are
            expected in the category-grouped form produced by collate_fn.
        device (torch.device): The device (CPU/GPU) to run the evaluation on.

    Returns:
        dict: Overall 'mse', 'mae' and 'rmse', followed by '<category>_mse',
            '<category>_mae' and '<category>_rmse' for every category seen.

    Raises:
        ValueError: If the dataloader yields no samples at all.
    """
    model.eval() # Set model to evaluation mode
    pred_chunks = []
    actual_chunks = []
    category_chunks = defaultdict(list) # category -> list of (predictions, actuals) array pairs

    with torch.no_grad(): # Disable gradient calculation for evaluation
        for batch_dict in dataloader:
            for category, items in batch_dict.items():
                if not items:
                    continue

                user_ids = torch.tensor([item['user_id'] for item in items],
                                        dtype=torch.long, device=device)
                item_ids = torch.tensor([item['item_id'] for item in items],
                                        dtype=torch.long, device=device)
                ratings = torch.tensor([item['rating'] for item in items],
                                       dtype=torch.float, device=device)

                pred_ratings, _, _ = model(user_ids, item_ids, category)

                # Flatten before converting: a single-sample group can come back as a
                # 0-d tensor, which cannot be iterated or concatenated as a sequence.
                # Each tensor is moved to the host exactly once.
                preds_np = pred_ratings.reshape(-1).cpu().numpy()
                actuals_np = ratings.cpu().numpy()

                pred_chunks.append(preds_np)
                actual_chunks.append(actuals_np)
                category_chunks[category].append((preds_np, actuals_np))

    if not pred_chunks:
        raise ValueError("evaluate_model received no samples to evaluate.")

    # Calculate overall metrics
    predictions = np.concatenate(pred_chunks)
    actuals = np.concatenate(actual_chunks)
    mse = mean_squared_error(actuals, predictions)
    mae = mean_absolute_error(actuals, predictions)
    rmse = np.sqrt(mse)

    # Calculate category-specific metrics (every stored category has at least one chunk)
    cat_metrics = {}
    for category, chunks in category_chunks.items():
        cat_preds = np.concatenate([preds for preds, _ in chunks])
        cat_actuals = np.concatenate([acts for _, acts in chunks])
        cat_mse = mean_squared_error(cat_actuals, cat_preds)
        cat_metrics[f'{category}_mse'] = cat_mse
        cat_metrics[f'{category}_mae'] = mean_absolute_error(cat_actuals, cat_preds)
        cat_metrics[f'{category}_rmse'] = np.sqrt(cat_mse)

    return {
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        **cat_metrics # Unpack category-specific metrics into the main dictionary
    }

# ============= FAISS Index Functions =============
def build_faiss_indices(model, item_mappings, device):
    """
    Creates one FAISS index per category from the model's item embeddings.

    For every category the item indices 0..n-1 are pushed through the model in
    chunks, the returned item embeddings are stacked, and the result is stored
    in a flat L2 (Euclidean) index.

    Args:
        model (nn.Module): The trained recommendation model.
        item_mappings (dict): Category name -> item mapping; only its length is used here.
        device (torch.device): The device (CPU/GPU) the model is on.

    Returns:
        dict: Category name -> FAISS index. Categories without items are left out.
    """
    model.eval()
    chunk_size = 1024  # items per forward pass, to bound memory use
    built = {}

    with torch.no_grad():
        for category, item_mapping in item_mappings.items():
            n_items = len(item_mapping)
            item_ids = torch.arange(n_items, dtype=torch.long, device=device)

            # The model's forward pass also wants user ids; zeros are supplied as
            # placeholders and only the third return value (item embeddings) is kept.
            id_chunks = (item_ids[start:start + chunk_size]
                         for start in range(0, n_items, chunk_size))
            embeddings = [
                model(torch.zeros_like(ids), ids, category)[2].cpu().numpy()
                for ids in id_chunks
            ]

            if not embeddings:
                print(f"No embeddings generated for category '{category}'. Skipping FAISS index creation.")
                continue

            all_embeddings = np.vstack(embeddings)
            n_rows, dim = all_embeddings.shape

            # Flat (exact) index using L2 distance over `dim`-dimensional vectors
            index = faiss.IndexFlatL2(dim)
            index.add(all_embeddings.astype('float32'))
            built[category] = index

            print(f"Built FAISS index for '{category}' with {n_rows} items and {dim} dimensions.")

    return built

def get_recommendations(model, user_id, category, faiss_index, item_mappings, 
                        device, k=10):
    """
    Retrieves top-k item recommendations for a given user within a specific category
    using the trained model and FAISS index.

    Args:
        model (nn.Module): The trained recommendation model.
        user_id (int): The mapped integer ID of the user.
        category (str): The category for which to retrieve recommendations.
        faiss_index (faiss.Index): The FAISS index for the specified category.
        item_mappings (dict): The dictionary of category-specific item mappings.
            Not used inside this function; kept so existing callers keep working.
        device (torch.device): The device (CPU/GPU) the model is on.
        k (int): The maximum number of recommendations to retrieve.

    Returns:
        tuple: A tuple containing:
            - np.array: Distances to the recommended items.
            - np.array: Mapped item indices of the recommended items.
            Both arrays have at most k entries; fewer are returned when the
            index holds fewer than k items.
    """
    model.eval() # Set model to evaluation mode

    # Asking FAISS for more neighbours than the index holds pads the result with
    # -1 labels and sentinel distances, so never request more than is stored.
    k = min(k, faiss_index.ntotal)
    if k <= 0:
        return np.empty(0, dtype='float32'), np.empty(0, dtype='int64')

    with torch.no_grad():
        # Get the user embedding from the model
        user_tensor = torch.tensor([user_id], dtype=torch.long, device=device)
        # The forward pass also needs an item id; item 0 is passed as a placeholder
        # and only the returned user embedding is used.
        dummy_item = torch.tensor([0], dtype=torch.long, device=device)

        _, user_emb, _ = model(user_tensor, dummy_item, category)
        user_emb_np = user_emb.cpu().numpy().astype('float32') # Convert to numpy for FAISS

        # Search for nearest neighbors (items) to the user embedding in the FAISS index
        distances, item_indices = faiss_index.search(user_emb_np, k)

    # Keep the first (and only) query row and drop any remaining "no result" slots
    distances, item_indices = distances[0], item_indices[0]
    found = item_indices >= 0
    return distances[found], item_indices[found]

# ============= Main Pipeline Execution =============
def main_pipeline(data_path='MyDrive/yelp_filtered_reviews.csv'):
    """
    Orchestrates the entire hybrid recommendation system pipeline:
    1. Starts an MLflow run and logs the configuration.
    2. Loads and preprocesses data.
    3. Performs EDA and logs results.
    4. Splits data into train/test sets.
    5. Initializes the PyTorch model.
    6. Trains the model, logging metrics on the evaluation epochs.
    7. Restores the best checkpoint and evaluates it.
    8. Builds and saves FAISS indices.
    9. Demonstrates sample recommendations.

    Args:
        data_path (str): Path of the filtered reviews CSV passed to load_data.
            Defaults to the location that used to be hard-coded.

    Returns:
        tuple: (model, item_mappings, faiss_indices) on success, or
            (None, None, None) when loading/preprocessing raises ValueError.
    """
    
    # Start an MLflow run to log all aspects of this pipeline execution
    with mlflow.start_run():
        print("Starting Hybrid Recommendation System Pipeline...")
        
        # Log all configuration parameters for reproducibility
        mlflow.log_params(CONFIG)
        print("Configuration parameters logged to MLflow.")
        
        # 1. Load and preprocess data
        print("\n1. Loading and preprocessing data...")
        try:
            df = load_data(data_path)
            df = categorize_businesses(df)
            df, user2idx, item_mappings = create_mappings(df)
        except ValueError as e:
            print(f"Error during data loading/preprocessing: {e}")
            mlflow.log_param("data_load_status", "failed")
            return None, None, None # Exit pipeline if data loading fails
        
        # Log key data statistics to MLflow
        mlflow.log_metric("total_users_after_mapping", len(user2idx))
        mlflow.log_metric("total_interactions_after_mapping", len(df))
        for category, mapping in item_mappings.items():
            mlflow.log_metric(f"{category}_total_items", len(mapping))
        print("Data loaded, preprocessed, and mappings created.")
        
        # 2. Perform EDA
        print("\n2. Performing EDA...")
        sparsity_metrics = perform_eda(df)
        mlflow.log_metrics(sparsity_metrics) # Log sparsity metrics
        print("EDA complete and plots logged to MLflow.")
        
        # 3. Split data into training and testing sets
        print("\n3. Splitting data into train and test sets...")
        train_df, test_df = split_data(df, CONFIG["test_size"])
        mlflow.log_metric("train_data_size", len(train_df))
        mlflow.log_metric("test_data_size", len(test_df))
        print("Data split complete.")
        
        # 4. Create PyTorch datasets and dataloaders
        print("\n4. Creating PyTorch datasets and dataloaders...")
        train_dataset = RecommendationDataset(train_df)
        test_dataset = RecommendationDataset(test_df)
        
        use_pinned_memory = CONFIG["device"].type == 'cuda' # Pin memory only when training on GPU
        train_loader = DataLoader(
            train_dataset, 
            batch_size=CONFIG["batch_size"], 
            shuffle=True, # Shuffle training data
            collate_fn=collate_fn, # Use custom collate function for category grouping
            pin_memory=use_pinned_memory
        )
        test_loader = DataLoader(
            test_dataset, 
            batch_size=CONFIG["batch_size"], 
            shuffle=False, # No need to shuffle test data
            collate_fn=collate_fn,
            pin_memory=use_pinned_memory
        )
        print("Datasets and DataLoaders created.")
        
        # 5. Initialize the Hybrid Recommendation Model
        print("\n5. Initializing the model...")
        category_item_counts = {cat: len(mapping) for cat, mapping in item_mappings.items()}
        
        model = HybridRecommendationModel(
            num_users=len(user2idx),
            category_item_counts=category_item_counts,
            emb_dim=CONFIG["embedding_dim"],
            hidden_dim=CONFIG["hidden_dim"]
        ).to(CONFIG["device"]) # Move model to the specified device
        
        # Log model parameters to MLflow
        total_params = sum(p.numel() for p in model.parameters())
        mlflow.log_metric("model_total_parameters", total_params)
        print(f"Model initialized with {total_params:,} trainable parameters.")
        
        # 6. Set up training components: optimizer and loss function
        print("\n6. Setting up training optimizer and loss function...")
        optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG["learning_rate"])
        criterion = nn.MSELoss() # Mean Squared Error Loss for regression
        print("Optimizer (Adam) and Criterion (MSELoss) set up.")
        
        # 7. Training loop
        # NOTE(review): the test loader doubles as the validation set used to pick the
        # best checkpoint, so the final test metrics are optimistic; a separate
        # validation split would be needed to avoid that.
        print("\n7. Starting model training...")
        best_val_rmse = float('inf') # Track best validation RMSE for saving model
        best_model_path = None       # Set once a checkpoint has been written in this run
        
        for epoch in range(CONFIG["epochs"]):
            # Train for one epoch
            train_loss = train_epoch(model, train_loader, optimizer, criterion, CONFIG["device"])
            
            # Evaluate model periodically and at the last epoch
            if epoch % 5 == 0 or epoch == CONFIG["epochs"] - 1:
                val_metrics = evaluate_model(model, test_loader, CONFIG["device"])
                
                print(f"--- Epoch {epoch+1}/{CONFIG['epochs']} ---")
                print(f"  Train Loss: {train_loss:.4f}")
                print(f"  Val RMSE: {val_metrics['rmse']:.4f}")
                print(f"  Val MAE: {val_metrics['mae']:.4f}")
                
                # Log epoch-specific metrics to MLflow
                mlflow.log_metrics({
                    "train_loss": train_loss, # Log current epoch loss
                    "val_rmse": val_metrics['rmse'],
                    "val_mae": val_metrics['mae']
                }, step=epoch)
                
                # Log the overall and category-specific evaluation metrics under their own names
                for metric_name, value in val_metrics.items():
                    if 'mse' in metric_name or 'mae' in metric_name or 'rmse' in metric_name:
                        mlflow.log_metric(metric_name, value, step=epoch)
                
                # Save the model state_dict if current validation RMSE is the best so far
                if val_metrics['rmse'] < best_val_rmse:
                    best_val_rmse = val_metrics['rmse']
                    best_model_path = 'best_model.pth'
                    torch.save(model.state_dict(), best_model_path)
                    mlflow.log_artifact(best_model_path) # Log saved model to MLflow
                    print(f"  Saved best model with Val RMSE: {best_val_rmse:.4f}")
        print("Model training complete.")
        
        # Restore the best checkpoint so the final metrics, the FAISS indices and the
        # returned model all come from the best weights rather than the last-epoch ones.
        # Skipped when no checkpoint was written in this run (e.g. zero epochs).
        if best_model_path is not None:
            model.load_state_dict(torch.load(best_model_path, map_location=CONFIG["device"]))
            print(f"Restored best model (Val RMSE: {best_val_rmse:.4f}) for final evaluation.")
        
        # 8. Final evaluation on the test set
        print("\n8. Performing final evaluation on the test set...")
        final_metrics = evaluate_model(model, test_loader, CONFIG["device"])
        
        print("\nFinal Test Metrics:")
        for metric, value in final_metrics.items():
            print(f"  {metric}: {value:.4f}")
            mlflow.log_metric(f"final_{metric}", value) # Log final metrics
        print("Final evaluation complete.")
        
        # 9. Build FAISS indices for efficient item retrieval
        print("\n9. Building FAISS indices for each category...")
        faiss_indices = build_faiss_indices(model, item_mappings, CONFIG["device"])
        
        # Save FAISS indices as artifacts
        for category, index in faiss_indices.items():
            index_file_path = f'faiss_index_{category}.index'
            faiss.write_index(index, index_file_path)
            mlflow.log_artifact(index_file_path)
            print(f"Saved FAISS index for '{category}' to '{index_file_path}'.")
        print("FAISS indices built and logged.")
        
        # 10. Demonstrate sample recommendations
        print("\n10. Generating sample recommendations...")
        # Choose a sample user (e.g., the first user in our mapped user IDs)
        # Note: In a real application, you'd look up a user's ID from user2idx
        sample_user_idx = 0 
        
        # Reverse item mappings for displaying original business IDs
        idx2item_mappings = {
            category: {idx: business_id for business_id, idx in item_map.items()}
            for category, item_map in item_mappings.items()
        }

        for category in item_mappings.keys():
            if category in faiss_indices: # Ensure FAISS index exists for this category
                distances, item_indices = get_recommendations(
                    model, sample_user_idx, category, faiss_indices[category], 
                    item_mappings, CONFIG["device"], k=5
                )
                
                print(f"\nTop 5 {category} recommendations for user_idx {sample_user_idx}:")
                for i, (dist, item_idx) in enumerate(zip(distances, item_indices)):
                    original_business_id = idx2item_mappings[category].get(item_idx, "Unknown")
                    print(f"  {i+1}. Item Index: {item_idx} (Original ID: {original_business_id}), Distance: {dist:.4f}")
            else:
                print(f"Skipping recommendations for category '{category}' as no FAISS index was built.")
        
        print("\nPipeline completed successfully! Check MLflow UI for detailed results.")
        
        return model, item_mappings, faiss_indices

# ============= Execute Pipeline =============
# Entry point: prepares the local tracking directory, configures MLflow, then runs the pipeline.
if __name__ == "__main__":
    # Make sure the local 'mlruns' directory exists before anything is logged
    os.makedirs('mlruns', exist_ok=True) # No error if the directory is already there
    
    # Setup MLflow tracking
    setup_mlflow()
    
    # Run the main pipeline
    # The returned model, item_mappings, and faiss_indices can be used for further inference
    # or deployment purposes outside this script.
    # All three names are None when main_pipeline aborts during data loading/preprocessing.
    trained_model, final_item_mappings, final_faiss_indices = main_pipeline()


AttributeError: partially initialized module 'torch' has no attribute 'types' (most likely due to a circular import)

In [51]:
# Load the filtered reviews from the shared data directory. The path is built from
# DATA_PATH (defined in the first cell) so it stays in sync with the cell that saves this file.
df = pd.read_csv(os.path.join(DATA_PATH, 'yelp_filtered_reviews.csv'))
df.head()

Unnamed: 0,user_id,business_id,stars,primary_category,business_city
0,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Restaurants,Philadelphia
1,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,4.0,Restaurants,Tucson
2,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,Restaurants,Philadelphia
3,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0,Restaurants,Goleta
4,EZjT2qJN0mOXypMAqZdSrQ,A2q7d-CBM2-81tVkmS4JMw,2.0,Restaurants,Reno


In [57]:
# Number of reviews per primary category
df['primary_category'].value_counts()

primary_category
Restaurants    719209
Nightlife      189260
Bars           168344
Shopping       112875
beauty_spa      62328
Name: count, dtype: int64

In [55]:
# Rename the 'Beauty & Spas' label to 'beauty_spa'; all other labels are left unchanged
df['primary_category'] = df['primary_category'].replace('Beauty & Spas', 'beauty_spa')


In [56]:
# Preview the first rows of the review frame
df.head()

Unnamed: 0,user_id,business_id,stars,primary_category,business_city
0,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4.0,Restaurants,Philadelphia
1,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,4.0,Restaurants,Tucson
2,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5.0,Restaurants,Philadelphia
3,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0,Restaurants,Goleta
4,EZjT2qJN0mOXypMAqZdSrQ,A2q7d-CBM2-81tVkmS4JMw,2.0,Restaurants,Reno


In [58]:
## Finally, save the dataset for further use and reproducibility

# NOTE: os is already imported in the first cell; this import is redundant but harmless
import os

# Write next to the raw data (DATA_PATH); create the folder if it doesn't exist
output_dir = DATA_PATH
os.makedirs(output_dir, exist_ok=True)

# Save to CSV without the DataFrame index column
output_path = os.path.join(output_dir, 'yelp_filtered_reviews.csv')
df.to_csv(output_path, index=False)

print(f"Filtered dataset saved to: {output_path}")


Filtered dataset saved to: D:/Projects/Yelp-Recommendation-System/data/raw\yelp_filtered_reviews.csv
