<a href="https://colab.research.google.com/github/Bryan-Az/YumTum/blob/main/Restaurant_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the Datasets using SQL Warehouse and Spark

In [0]:
import pyspark.pandas as ps

In [0]:
# Read the 'test_full' table into a pandas-on-Spark DataFrame
# (ps is pyspark.pandas, so this is not a plain in-memory pandas frame)
test_full = ps.read_table('test_full')
# Preview the first rows
test_full.head()

In [0]:
# Read the 'train_full' table into a pandas-on-Spark DataFrame
# (ps is pyspark.pandas, so this is not a plain in-memory pandas frame)
train_full = ps.read_table('train_full')
# Preview the first rows
train_full.head()

# Processing the Data

In [0]:
# List the column names available in the training table
train_full.columns

In [0]:
# Confirm which DataFrame implementation ps.read_table returned
type(test_full)

In [0]:
#Sampling
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import lit, concat_ws, col
# Fixed random seed so that sampling is reproducible
seed = 1
# Columns whose value combinations define the sampling strata
stratifying_vars = [
    'gender',
    'language',
    'location_type',
]

# Function to perform stratified sampling
def stratified_sample(df, stratify_cols, frac=0.25):
    """Draw the same fraction of rows from every stratum of ``df``.

    Args:
        df: Object exposing ``to_spark()`` (the callers pass pandas-on-Spark
            DataFrames).
        stratify_cols: Column names whose values are joined with ``'_'`` to
            form the stratum key.
        frac: Sampling fraction applied to every stratum.

    Returns:
        The sampled Spark DataFrame. It still carries the helper column
        ``composite_key``.

    Note:
        The random seed is taken from the module-level ``seed`` variable.
    """
    spark_df = df.to_spark().withColumn(
        'composite_key', concat_ws('_', *stratify_cols)
    )

    # sampleBy expects a {stratum: fraction} mapping; every stratum that
    # occurs in the data gets the same fraction.
    distinct_keys = spark_df.select('composite_key').distinct().collect()
    fractions = {row['composite_key']: frac for row in distinct_keys}

    return spark_df.stat.sampleBy('composite_key', fractions, seed)

# Draw a 25% stratified sample from each split (train first, then test)
train_sampled, test_sampled = (
    stratified_sample(full_split, stratifying_vars, frac=0.25)
    for full_split in (train_full, test_full)
)

In [0]:
# Wrap the sampled Spark DataFrame in the pandas-on-Spark API again
train_sampled_pd = train_sampled.to_pandas_on_spark()
# Preview the sampled training rows
train_sampled_pd.head()

In [0]:
# Wrap the sampled test Spark DataFrame in the pandas-on-Spark API again
test_sampled_pd = test_sampled.to_pandas_on_spark()
# Preview the sampled *test* rows (this cell previously re-displayed the train sample)
test_sampled_pd.head()

In [0]:
#Feature reduction
# Customer, location and vendor attributes kept in both splits
base_columns = [
    'customer_id', 'gender', 'language',
    'location_number', 'location_type', 'latitude_x', 'longitude_x',
    'id', 'latitude_y', 'longitude_y', 'vendor_category_en', 'vendor_rating',
    'delivery_charge', 'serving_distance', 'is_open', 'discount_percentage',
    'display_orders',
    'vendor_tag_name',
]

# Opening-hour columns: for every weekday, two (from, to) slots, in the order
# <day>_from_time1, <day>_to_time1, <day>_from_time2, <day>_to_time2
opening_hours_columns = [
    f'{weekday}_{bound}_time{slot}'
    for weekday in ('sunday', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday')
    for slot in (1, 2)
    for bound in ('from', 'to')
]

# Columns to keep for the testing dataset
test_columns_to_keep = base_columns + opening_hours_columns

# Columns to keep for the training dataset: the same features plus the label.
# Deriving one list from the other keeps the two splits from drifting apart.
train_columns_to_keep = test_columns_to_keep + ['target']  # 'target' is specific to the training dataset

# Reduce features in the training dataset
train_reduced = train_sampled_pd[train_columns_to_keep]

# Reduce features in the testing dataset
test_reduced = test_sampled_pd[test_columns_to_keep]

# TODO: save the reduced datasets (dbfs). The statement that stood here was
# truncated to the undefined name `train_re`, which raised NameError.



In [0]:
# Preview the reduced training features
train_reduced.head()


In [0]:
# Preview the reduced test features
test_reduced.head()

In [0]:
# Number of rows left in the sampled, reduced training set
len(train_reduced)

In [0]:
import pandas as pd

# Load the dataset
# NOTE(review): this CSV is written by a later cell (the one that handles
# missing values), so on a fresh top-to-bottom run the file may not exist yet.
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_reduced_cleaned.csv")

# Display a sample of the specific columns
sample_data = train_data[['vendor_category_en', 'vendor_tag_name', 'vendor_rating']].head(10)
print(sample_data)


In [0]:
# Missing-value report for the training data: count the nulls per column
# once, then express the same counts as a percentage of all rows.
missing_values = train_reduced.isnull().sum()
percent_missing = missing_values / len(train_reduced) * 100

# Combine both measures into one table and show it
missing_analysis = pd.DataFrame(
    {'missing_values': missing_values, 'percent_missing': percent_missing}
)
print(missing_analysis)

In [0]:
# Analyzing missing values for test data
# Both measures must come from test_reduced; the counts were previously taken
# from train_reduced, so the table mixed train counts with test percentages.
missing_values = test_reduced.isnull().sum()
percent_missing = (missing_values / len(test_reduced)) * 100

# Displaying the analysis
missing_analysis = pd.DataFrame({'missing_values': missing_values, 'percent_missing': percent_missing})
print(missing_analysis)

In [0]:
# Create independent copies to avoid SettingWithCopyWarning
train_reduced = train_reduced.copy()
test_reduced = test_reduced.copy()

# Handling missing values in 'location_type'
# Assign the filled column back explicitly: `df[col].fillna(..., inplace=True)`
# operates on an intermediate Series and is not guaranteed to update the
# DataFrame (chained assignment).
train_reduced['location_type'] = train_reduced['location_type'].fillna('Unknown')
test_reduced['location_type'] = test_reduced['location_type'].fillna('Unknown')

# Handling missing values in 'latitude_x' and 'longitude_x':
# rows without a customer position are dropped
train_reduced = train_reduced.dropna(subset=['latitude_x', 'longitude_x'])
test_reduced = test_reduced.dropna(subset=['latitude_x', 'longitude_x'])

# Save the datasets after handling missing values
train_reduced.to_csv("C:/Users/Joash/Desktop/recommender/train_reduced_cleaned.csv", index=False)
test_reduced.to_csv("C:/Users/Joash/Desktop/recommender/test_reduced_cleaned.csv", index=False)

In [0]:
# Re-read the cleaned CSVs from disk
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_reduced_cleaned.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_reduced_cleaned.csv")

# Per-column count of missing values in each split
missing_values_train = train_reduced_cleaned.isna().sum()
missing_values_test = test_reduced_cleaned.isna().sum()

# Report the training split first, then the testing split
print("Missing values in training dataset:\n", missing_values_train)
print("\nMissing values in testing dataset:\n", missing_values_test)


In [0]:
import numpy as np

# Haversine formula to calculate distance between two lat-lon points
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are in degrees. Only NumPy ufuncs are used, so scalars
    and array-likes are both accepted.
    """
    earth_radius_km = 6371

    # Sines of half the latitude / longitude differences (in radians)
    sin_half_dlat = np.sin(np.radians(lat2 - lat1) / 2)
    sin_half_dlon = np.sin(np.radians(lon2 - lon1) / 2)

    # Haversine term, then the central angle between the two points
    hav = sin_half_dlat * sin_half_dlat + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

# Load the datasets
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_reduced_cleaned.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_reduced_cleaned.csv")

# Customer (x) to restaurant (y) distance.
# haversine is built entirely from NumPy ufuncs, so whole columns are passed
# at once instead of calling it once per row through DataFrame.apply(axis=1).
train_reduced_cleaned['distance_customer_to_restaurant'] = haversine(
    train_reduced_cleaned['latitude_x'], train_reduced_cleaned['longitude_x'],
    train_reduced_cleaned['latitude_y'], train_reduced_cleaned['longitude_y'])
test_reduced_cleaned['distance_customer_to_restaurant'] = haversine(
    test_reduced_cleaned['latitude_x'], test_reduced_cleaned['longitude_x'],
    test_reduced_cleaned['latitude_y'], test_reduced_cleaned['longitude_y'])

# Save the datasets with the new feature
train_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/train_feature_engineered.csv", index=False)
test_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/test_feature_engineered.csv", index=False)


In [0]:
# Function to vectorize open hours calculation
def _interval_hours(opens, closes):
    """Return the length in hours of each (opens, closes) interval as an array."""
    opens = pd.to_datetime(opens, errors='coerce')
    closes = pd.to_datetime(closes, errors='coerce')

    hours = (closes - opens).dt.total_seconds() / 3600
    # A closing time earlier than the opening time is taken to mean the slot
    # runs past midnight (e.g. 22:00 -> 02:00), so wrap it into the next day
    # instead of reporting a negative duration.
    # NOTE(review): assumes such rows are overnight slots, not data errors.
    hours = hours.where(~(hours < 0), hours + 24)
    # Slots with a missing or unparseable endpoint contribute 0 hours
    return hours.fillna(0).to_numpy()


def vectorized_open_hours(from_time1, to_time1, from_time2, to_time2):
    """Compute total opening hours per row from two (from, to) time slots.

    Args:
        from_time1, to_time1: pandas Series holding the first slot's opening
            and closing times (anything ``pd.to_datetime`` can parse).
        from_time2, to_time2: The same for the second slot.

    Returns:
        NumPy array with the summed duration of both slots in hours. A slot
        with a missing or unparseable endpoint counts as 0; a slot whose
        closing time is earlier than its opening time is treated as running
        past midnight.
    """
    return _interval_hours(from_time1, to_time1) + _interval_hours(from_time2, to_time2)

# Re-read the datasets that already carry the distance feature
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_feature_engineered.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_feature_engineered.csv")

# Weekdays, used as the column-name prefix
days = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']

# Column-name suffixes of the two opening slots, in the argument order
# expected by vectorized_open_hours
slot_suffixes = ('from_time1', 'to_time1', 'from_time2', 'to_time2')

# Add one '<day>_open_hours' column per weekday to each split
for day in days:
    slot_columns = [f'{day}_{suffix}' for suffix in slot_suffixes]
    for split in (train_reduced_cleaned, test_reduced_cleaned):
        split[f'{day}_open_hours'] = vectorized_open_hours(
            *(split[column] for column in slot_columns)
        )

# Persist the datasets with the new features
train_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/train_full_feature_engineered.csv", index=False)
test_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/test_full_feature_engineered.csv", index=False)



In [0]:
import pandas as pd

# Load the datasets
# TODO: switch reading and writing to pyspark and the Databricks / dbutils file system
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_full_feature_engineered.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_full_feature_engineered.csv")

# Feature: 1 when 'display_orders' is greater than 1, otherwise 0
# (used as a binary "has multiple orders" indicator)
for split in (train_reduced_cleaned, test_reduced_cleaned):
    split['multiple_orders_history'] = split['display_orders'].gt(1).astype(int)

# Persist the datasets with the new feature
train_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/train_customer_feature_engineered.csv", index=False)
test_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/test_customer_feature_engineered.csv", index=False)


In [0]:
import pandas as pd

# Re-read the datasets produced by the previous feature step
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_customer_feature_engineered.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_customer_feature_engineered.csv")

# Assumption: 'location_number' distinguishes the different locations of a customer.
# Feature: 1 when 'location_number' is greater than 1, otherwise 0
# (a proxy for the customer using multiple locations)
for split in (train_reduced_cleaned, test_reduced_cleaned):
    split['uses_multiple_locations'] = split['location_number'].gt(1).astype(int)

# Persist the datasets with the new feature
train_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/train_location_density_feature_engineered.csv", index=False)
test_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/test_location_density_feature_engineered.csv", index=False)


In [0]:
import pandas as pd

# Load the datasets written by the 'uses_multiple_locations' feature step
train_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_location_density_feature_engineered.csv")
test_reduced_cleaned = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_location_density_feature_engineered.csv")

# Function to calculate cuisine diversity score
def calculate_cuisine_diversity(cuisine_list):
    """Count the distinct cuisine tags in a comma-separated string.

    Args:
        cuisine_list: Comma-separated tag string, or a missing value
            (``None`` / NaN).

    Returns:
        Number of unique, non-empty tags. Surrounding whitespace is ignored,
        so ``"Pizza, Pizza"`` counts as one tag. Missing values and empty
        strings give 0.
    """
    if not isinstance(cuisine_list, str):
        # Missing values (None / NaN) carry no tags
        return 0

    unique_tags = {tag.strip() for tag in cuisine_list.split(',')}
    # An empty string or a stray comma would otherwise count as a cuisine
    unique_tags.discard('')
    return len(unique_tags)

# Score every row's 'vendor_tag_name' with calculate_cuisine_diversity
for split in (train_reduced_cleaned, test_reduced_cleaned):
    split['cuisine_diversity_score'] = split['vendor_tag_name'].apply(calculate_cuisine_diversity)

# Persist the datasets with the new feature
train_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/train_cuisine_feature_engineered.csv", index=False)
test_reduced_cleaned.to_csv("C:/Users/Joash/Desktop/recommender/test_cuisine_feature_engineered.csv", index=False)


In [0]:
import pandas as pd

# Load the dataset (the version saved after the open-hours features were added)
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_full_feature_engineered.csv")

# Select time-related columns for inspection
# Only the first Sunday and Monday slot are listed; this is a spot check of the raw time format
time_columns = ['sunday_from_time1', 'sunday_to_time1', 'monday_from_time1', 'monday_to_time1',  # Add other time columns here
                # ... continue with other days
               ]

# Display the first few rows of these columns
print(train_data[time_columns].head())


In [0]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Function to convert time string to minutes past midnight
def time_to_minutes(time_str):
    """Convert an ``HH:MM[:SS]`` time string to minutes past midnight.

    Args:
        time_str: Time string such as ``"08:15:00"`` or ``"08:15"``, or a
            missing value (``None`` / NaN).

    Returns:
        ``hours * 60 + minutes`` as an int (seconds are ignored), or ``None``
        when ``time_str`` is missing.

    Raises:
        ValueError: If the string has fewer than two ``:``-separated fields or
            the hour / minute fields are not integers.
    """
    if pd.isna(time_str):
        return None
    # Only hours and minutes are needed; tolerate a missing or extra seconds field
    hours, minutes, *_ = time_str.split(':')
    return int(hours) * 60 + int(minutes)

# Read the datasets that carry every engineered feature
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_cuisine_feature_engineered.csv")
test_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_cuisine_feature_engineered.csv")

# Opening-hour columns, per weekday in the order
# <day>_from_time1, <day>_to_time1, <day>_from_time2, <day>_to_time2
time_columns = [
    f'{weekday}_{bound}_time{slot}'
    for weekday in ('sunday', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday')
    for slot in (1, 2)
    for bound in ('from', 'to')
]

# Turn each time string into minutes past midnight
for time_col in time_columns:
    train_data[time_col] = train_data[time_col].apply(time_to_minutes)
    test_data[time_col] = test_data[time_col].apply(time_to_minutes)

# Numerical features to standardise: the continuous attributes plus the
# converted time columns
continuous_cols = [
    'latitude_x', 'longitude_x', 'latitude_y', 'longitude_y',
    'delivery_charge', 'serving_distance', 'vendor_rating',
    'distance_customer_to_restaurant', 'cuisine_diversity_score',
]
numerical_cols = continuous_cols + time_columns

# Fit the scaler on the training split only, then reuse it for the test split
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# Persist the fully preprocessed datasets
train_data.to_csv("C:/Users/Joash/Desktop/recommender/train_preprocessed.csv", index=False)
test_data.to_csv("C:/Users/Joash/Desktop/recommender/test_preprocessed.csv", index=False)


In [0]:
import pandas as pd

# Load the dataset (the standardised training split)
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_preprocessed.csv")

# Check unique values in 'is_open'
# ('is_open' was neither scaled nor encoded by the earlier cells, so these are its raw values)
unique_values = train_data['is_open'].unique()
print("Unique values in 'is_open':", unique_values)


In [0]:
import pandas as pd

# Load the preprocessed datasets
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_preprocessed.csv")
test_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_preprocessed.csv")

# List of categorical columns to be encoded
categorical_cols = ['gender', 'language', 'location_type',
                    'vendor_category_en', 'vendor_tag_name']

# Columns that exist only in the training split (the 'target' label);
# they must not be fabricated for the test split by the alignment below.
train_only_cols = [column for column in train_data.columns
                   if column not in test_data.columns]

# Give both splits the same category levels, taken from the training split.
# Encoding each split on its own lets drop_first=True drop a *different*
# baseline level in test whenever test lacks train's first level, which
# silently changes the meaning of the remaining dummy columns.
for column in categorical_cols:
    levels = sorted(train_data[column].dropna().unique())
    train_data[column] = pd.Categorical(train_data[column], categories=levels)
    # Levels seen only in test become missing, i.e. all-zero dummies
    test_data[column] = pd.Categorical(test_data[column], categories=levels)

# Apply one-hot encoding
train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# Ensure the same set of columns (and column order) in both datasets
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# The left join added the train-only columns to test filled with 0; drop them
# so the test file does not carry a fake label column.
test_data = test_data.drop(columns=train_only_cols)

# Save the datasets after encoding
train_data.to_csv("C:/Users/Joash/Desktop/recommender/train_encoded.csv", index=False)
test_data.to_csv("C:/Users/Joash/Desktop/recommender/test_encoded.csv", index=False)


In [0]:
import pandas as pd

# Read the one-hot encoded training split
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_encoded.csv")

# Opening-hour columns, per weekday in the order
# <day>_from_time1, <day>_to_time1, <day>_from_time2, <day>_to_time2
opening_time_cols = [
    f'{weekday}_{bound}_time{slot}'
    for weekday in ('sunday', 'monday', 'tuesday', 'wednesday',
                    'thursday', 'friday', 'saturday')
    for slot in (1, 2)
    for bound in ('from', 'to')
]

# Numerical columns whose skewness is reported, in print order
numerical_cols = (
    ['delivery_charge', 'serving_distance', 'vendor_rating']
    + opening_time_cols
    + ['latitude_x', 'longitude_x', 'latitude_y', 'longitude_y',
       'distance_customer_to_restaurant', 'cuisine_diversity_score']
)

# Print the sample skewness of every numerical column
for column_name in numerical_cols:
    skewness = train_data[column_name].skew()
    print(f"Skewness for {column_name}: {skewness}")


In [0]:
# Read the one-hot encoded training split
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_encoded.csv")

# Count the values that are <= 0 in each of the skewed columns
skewed_cols = ['latitude_x', 'latitude_y', 'longitude_y']
for skewed_col in skewed_cols:
    # Summing the boolean mask counts the rows that satisfy the condition
    non_positive_count = (train_data[skewed_col] <= 0).sum()
    print(f"Non-positive values in {skewed_col}: {non_positive_count}")


In [0]:
from sklearn.preprocessing import MinMaxScaler

# Load the dataset (one-hot encoded train and test splits)
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_encoded.csv")
test_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/test_encoded.csv")

# Columns for Min-Max Normalization
# NOTE(review): these columns were already standardised by the StandardScaler
# cell; here they are rescaled a second time.
normalization_cols = ['latitude_x', 'latitude_y', 'longitude_y']

# Initialize the MinMaxScaler (rebinds the notebook-level name `scaler`)
scaler = MinMaxScaler()

# Normalize specified columns: fit on train only, reuse the fit for test
train_data[normalization_cols] = scaler.fit_transform(train_data[normalization_cols])
test_data[normalization_cols] = scaler.transform(test_data[normalization_cols])

# Save the datasets after normalization
train_data.to_csv("C:/Users/Joash/Desktop/recommender/train_normalized.csv", index=False)
test_data.to_csv("C:/Users/Joash/Desktop/recommender/test_normalized.csv", index=False)


In [0]:
from sklearn.model_selection import train_test_split


# Load the normalized training dataset
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_normalized.csv")

# Separate features and target variable
# Assuming 'target' is your label column. Adjust the column name if necessary
# NOTE(review): X keeps every other column, presumably including the
# identifier columns 'customer_id' and 'id' — confirm before fitting a model.
X = train_data.drop('target', axis=1)  # Drop the target column to create feature set
y = train_data['target']  # Target variable

# Split the data into training and validation sets
# Here, 20% of the data is reserved for validation, and 80% for training
# random_state fixes the shuffle so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optionally, you can check the size of each set
print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")

# At this point, you can proceed to train your model using X_train and y_train
# and validate it using X_val and y_val



In [0]:
# Smoke test: the import fails if scikit-surprise is not installed
from surprise import SVD
print("scikit-surprise imported successfully!")



In [0]:
# Shell escape: install pandas into the notebook's environment
!pip install pandas

In [0]:
import pandas as pd
# Load the normalized training split
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_normalized.csv")
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Reshape the data for Surprise
# The three columns are passed as (user, item, rating) = (customer_id, id, target)
reader = Reader(rating_scale=(0, 1))  # Assuming ratings are binary (0 or 1)
data = Dataset.load_from_df(train_data[['customer_id', 'id', 'target']], reader)

# Define the SVD algorithm
svd = SVD()

# Perform cross-validation (you can later use train-test split or full train)
# 5 folds; verbose=True prints the per-fold RMSE and MAE
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


In [0]:
# Shell escape: show the scikit-surprise package entry of the conda environment
!conda list scikit-surprise


In [0]:
# Shell escape: list every package in the conda environment
!conda list


In [0]:
#Collaborative Filtering Implementation:

In [0]:
# `accuracy` must be imported explicitly; without it the RMSE line below
# raised NameError.
from surprise import Reader, Dataset, SVD, accuracy
# NOTE: this train_test_split is Surprise's and rebinds the name imported
# from sklearn.model_selection in an earlier cell.
from surprise.model_selection import cross_validate, train_test_split

# Load the dataset
train_data = pd.read_csv("C:/Users/Joash/Desktop/recommender/train_normalized.csv")

# Prepare the dataset for Surprise: (user, item, rating) = (customer_id, id, target),
# with the rating scale taken from the observed range of 'target'
reader = Reader(rating_scale=(train_data['target'].min(), train_data['target'].max()))
data = Dataset.load_from_df(train_data[['customer_id', 'id', 'target']], reader)

# Split the data into training and test sets (80% / 20%, reproducible)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Use SVD algorithm
model = SVD()

# Train the model
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate and print RMSE
# Store the score under its own name so the `accuracy` module is not replaced
# by a float; verbose=False because the value is printed right below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f'RMSE: {rmse}')

In [0]:
# Content-Based Filtering Implementation:

In [0]:
import pandas as pd

# Load the dataset (normalized training split)
file_path = "C:/Users/Joash/Desktop/recommender/train_normalized.csv"
data = pd.read_csv(file_path)

# Display the first few customer IDs
# (the first 10 distinct values, in order of appearance)
print("Some customer IDs from the dataset:")
print(data['customer_id'].unique()[:10])


In [0]:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Function to load and preprocess data
def load_and_preprocess_data(file_path):
    """Read a CSV and return its standardised feature matrix and labels.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        Tuple ``(X_scaled, y)``: the selected feature columns after
        ``StandardScaler.fit_transform``, and the ``'target'`` column.
    """
    frame = pd.read_csv(file_path)

    # Engineered customer/vendor features, one open-hours column per weekday,
    # then coordinates and vendor attributes
    open_hours_cols = [
        f'{weekday}_open_hours'
        for weekday in ('sunday', 'monday', 'tuesday', 'wednesday',
                        'thursday', 'friday', 'saturday')
    ]
    selected = (
        ['distance_customer_to_restaurant', 'cuisine_diversity_score',
         'uses_multiple_locations', 'multiple_orders_history']
        + open_hours_cols
        + ['latitude_x', 'longitude_x', 'latitude_y', 'longitude_y',
           'vendor_rating', 'delivery_charge', 'serving_distance']
    )

    feature_frame = frame[selected]
    labels = frame['target']

    # Standardise the selected features
    scaled = StandardScaler().fit_transform(feature_frame)
    return scaled, labels

# Function for content-based filtering
def content_based_filtering(user_id, data, X_scaled, top_n=10):
    """Return the rows of ``data`` most similar to one customer's own rows.

    Args:
        user_id: Value looked up in ``data['customer_id']``.
        data: DataFrame with a ``'customer_id'`` column.
        X_scaled: Feature matrix whose i-th row describes the i-th row of
            ``data`` (same row order).
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``data`` that do not belong to ``user_id``,
        ordered by decreasing mean cosine similarity to the customer's rows.
        An empty frame with the same columns is returned when the customer
        has no rows or when every row belongs to the customer.
    """
    # Positional mask of this customer's rows; positions (not index labels)
    # are used so the result is correct for any index on `data`.
    is_user_row = (data['customer_id'] == user_id).to_numpy()

    # Nothing to compare from, or nothing left to recommend
    if not is_user_row.any() or is_user_row.all():
        return data.iloc[0:0]

    # Candidates are all rows that are NOT the customer's own. The previous
    # code called DataFrame.drop(user_index), which drops *rows* (axis 0) --
    # exactly the customer's rows -- so the similarity frame was emptied and
    # the function always returned nothing.
    candidate_positions = (~is_user_row).nonzero()[0]

    # Shape: (customer rows, candidate rows)
    similarity_scores = cosine_similarity(X_scaled[is_user_row], X_scaled[~is_user_row])

    # Average over the customer's rows to get one score per candidate
    mean_similarity = pd.Series(similarity_scores.mean(axis=0))
    best = mean_similarity.nlargest(top_n).index.to_numpy()

    return data.iloc[candidate_positions[best]]

# Load data (scaled feature matrix and labels)
file_path = "C:/Users/Joash/Desktop/recommender/train_normalized.csv"
X_scaled, y = load_and_preprocess_data(file_path)

# Load original data to get restaurant details
# (second read of the same CSV; its rows line up with the rows of X_scaled)
original_data = pd.read_csv(file_path)

# Example usage
user_id = 'M758NNC'  # Example user ID (replace with a real ID from your dataset)
recommended_items = content_based_filtering(user_id, original_data, X_scaled, top_n=10)
print(recommended_items)



In [0]:
      # Should not be empty
