# Recommendation System:
the following code recommends top5 restaurants(with over 2000+ 5 star ratings) to about 1000 users you have rated restaurants over 4 stars.

# Summary Of actions:


1.   Read the Business.csv file, read and combine 5 csv files(User details)
2.   Recommendation system: is based on the Cosine Similarity of the TF-IDF vectors.
3.   evaluation methodology: leave-one-out evaluation. In this approach, we simulate a test scenario by holding out one item from each user's preferences as the "ground truth" and then evaluating the recommendations based on the remaining items.




In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
import pandas as pd

# Define the directory path containing the CSV files
directory_path = '/content/drive/MyDrive/Yelp/yelp-cleaned_user_reviews'

# List all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty dictionary to store DataFrames
dfs = {}

# Read each CSV file into separate DataFrames and store them in the dictionary
for idx, file in enumerate(csv_files):
    file_path = os.path.join(directory_path, file)
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path, header=None, names=['UserID', 'topwords'])
    # Store the DataFrame in the dictionary with a key based on the index
    dfs[f'df_{idx}'] = df

# Combine all DataFrames into one
users_df = pd.concat(dfs.values(), ignore_index=True)

# Now `combined_df` contains all the data from the CSV files combined into one DataFrame


In [5]:
users_df.head()

Unnamed: 0,UserID,topwords
0,---udAKDsn0yQXmzbWQNSw,"[everything, flavorful, reasonable, price, fav..."
1,--44NNdtngXMzsxyN7ju6Q,"[delicious, cornbread, muffin, absolutely, die..."
2,--7gZYIAVGCaPT4k0qbbrw,"[three, fine, meal, recently, happy, see, rest..."
3,--Awhttr0FhVud5prVpVlA,"[outdoor, area, enjoy, summer, weather, couple..."
4,--CIuK7sUpaNzalLAlHJKA,"[stayed, california, hotel, july, weekend, rea..."


In [6]:

# Define the directory path containing the CSV files
directory_path = '/content/drive/MyDrive/Yelp/yelp-cleaned_business_topwords1'

# List all CSV files in the directory
csv_files = [file for file in os.listdir(directory_path) if file.endswith('.csv')]

# Initialize an empty dictionary to store DataFrames
dfs = {}

# Read each CSV file into separate DataFrames and store them in the dictionary
for idx, file in enumerate(csv_files):
    file_path = os.path.join(directory_path, file)
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path, header=None, names=['BusinessID', 'topwords'])
    # Store the DataFrame in the dictionary with a key based on the index
    dfs[f'df_{idx}'] = df

# Combine all DataFrames into one
business_df = pd.concat(dfs.values(), ignore_index=True)

# Now `combined_df` contains all the data from the CSV files combined into one DataFrame

In [7]:
business_df.head()

Unnamed: 0,BusinessID,topwords
0,TMf3yJqOGGXsPFA3Ql2ltw,"tea, bubble, place, flavor, love, time, always..."
1,sWh-N7K3ebRHZKhhH01mJQ,"staff, hospital, nurse, er, experience, time, ..."
2,q41a20zmo6bFmYBzjUABtA,"tire, service, big, back, time, guy, car, grea..."
3,q4mx-ff4_LvU3NnMvNIrZA,"great, food, burger, place, time, good, sammie..."
4,-7PX_FOoCwktlunImRyZdg,"pizza, place, great, delicious, taglio, best, ..."


In [15]:
first_1000_df = users_df.head(1000)

In [16]:
first_1000_df.head()

Unnamed: 0,UserID,topwords
0,---udAKDsn0yQXmzbWQNSw,"[everything, flavorful, reasonable, price, fav..."
1,--44NNdtngXMzsxyN7ju6Q,"[delicious, cornbread, muffin, absolutely, die..."
2,--7gZYIAVGCaPT4k0qbbrw,"[three, fine, meal, recently, happy, see, rest..."
3,--Awhttr0FhVud5prVpVlA,"[outdoor, area, enjoy, summer, weather, couple..."
4,--CIuK7sUpaNzalLAlHJKA,"[stayed, california, hotel, july, weekend, rea..."


In [10]:
business_df.columns = ['BusinessID', 'Reviews']

In [11]:
business_df.head()

Unnamed: 0,BusinessID,Reviews
0,TMf3yJqOGGXsPFA3Ql2ltw,"tea, bubble, place, flavor, love, time, always..."
1,sWh-N7K3ebRHZKhhH01mJQ,"staff, hospital, nurse, er, experience, time, ..."
2,q41a20zmo6bFmYBzjUABtA,"tire, service, big, back, time, guy, car, grea..."
3,q4mx-ff4_LvU3NnMvNIrZA,"great, food, burger, place, time, good, sammie..."
4,-7PX_FOoCwktlunImRyZdg,"pizza, place, great, delicious, taglio, best, ..."


In [17]:
business_df.info()
first_1000_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22737 entries, 0 to 22736
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   BusinessID  22737 non-null  object
 1   Reviews     22737 non-null  object
dtypes: object(2)
memory usage: 355.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   UserID    1000 non-null   object
 1   topwords  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# # Load datasets
# business_df = pd.read_excel('Business.xlsx')
# user_df = pd.read_excel('User.xlsx')

# Aggregate top words for businesses and users
business_profiles = business_df.groupby('BusinessID')['Reviews'].apply(lambda x: ' '.join(x)).reset_index()
user_profiles = first_1000_df.groupby('UserID')['topwords'].apply(lambda x: ' '.join(x)).reset_index()

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform business and user profiles
business_vectors = tfidf_vectorizer.fit_transform(business_profiles['Reviews'])
user_vectors = tfidf_vectorizer.transform(user_profiles['topwords'])

# Calculate cosine similarity between users and businesses
user_business_similarity = cosine_similarity(user_vectors, business_vectors)

# Generate recommendations for each user
recommendations = {}
for i, user_row in user_profiles.iterrows():
    user_id = user_row['UserID']
    user_similarities = user_business_similarity[i]

    # Get top N similar businesses for the user
    top_n_similarities_indices = user_similarities.argsort()[-5:][::-1]  # Get top 5 similar businesses
    top_n_similar_businesses = business_profiles.iloc[top_n_similarities_indices]

    # Store recommendations for the user
    recommendations[user_id] = top_n_similar_businesses

# Display recommendations
for user_id, recommended_businesses in recommendations.items():
    print(f"Recommendations for User {user_id}:")
    for index, business in recommended_businesses.iterrows():
        print(f"- BusinessID: {business['BusinessID']}, Top Words: {business['Reviews']}")
    print()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
- BusinessID: Z1mLBp6BMlHSQUh364vk1g, Top Words: lobster, great, food, service, good, place, point, oyster, blue, seafood, amazing, cleveland, happy, restaurant, best, time, hour, dinner, excellent, nice, delicious, would, menu, crab, really, shrimp, one, also, meal, u

Recommendations for User -9l-v13ZK2v2ld0dSbAucw:
- BusinessID: VdmHT-RrwYo8TF1YVfEfZg, Top Words: view, vega, tower, ride, go, top, see, get, stratosphere, night, also, deck, time, observation, amazing, hotel, one, best, strip, great, worth, even, la, experience, day, place, would, must, trip, back
- BusinessID: TFdLM8R_7Pf1xVeBNKl0hg, Top Words: casbah, great, food, restaurant, good, service, time, one, delicious, bread, nice, dessert, menu, dinner, brunch, birthday, best, favorite, really, place, dish, also, meal, wine, amazing, duck, ordered, always, entree, get
- BusinessID: KvbvfyI82dJMgo0NZgDgLg, Top Words: restaurant, place, house, food, good, menu,

In [19]:
import numpy as np
# Precision
def precision(actual, recommended):
    intersection = len(set(actual) & set(recommended))
    return intersection / len(recommended)

# Recall
def recall(actual, recommended):
    intersection = len(set(actual) & set(recommended))
    return intersection / len(actual)

# Accuracy
def accuracy(actual, recommended):
    correct = sum(1 for business in recommended if business in actual)
    return correct / len(actual)

# Mean Average Precision (MAP)
def average_precision(actual, recommended):
    precision_sum = 0.0
    hits = 0
    for i, business in enumerate(recommended):
        if business in actual:
            hits += 1
            precision_sum += hits / (i + 1)
    return precision_sum / len(actual)

# Normalized Discounted Cumulative Gain (NDCG)
def dcg_at_k(actual, recommended, k):
    dcg = 0.0
    for i, business in enumerate(recommended[:k]):
        if business in actual:
            dcg += 1 / np.log2(i + 2)
    return dcg

def leave_one_out_evaluation(user_profiles, business_profiles, user_business_similarity):
    precision_scores = []
    recall_scores = []
    accuracy_scores = []
    map_scores = []
    ndcg_scores = []

    for i, user_row in user_profiles.iterrows():
        user_id = user_row['UserID']
        actual = user_row['topwords'].split()  # Ground truth preferences

        # Find the index of the most similar business
        most_similar_index = user_business_similarity[i].argsort()[-1]

        # Remove the most similar business from the user's preferences
        recommended = business_profiles.loc[most_similar_index, 'Reviews'].split()

        # Compute evaluation metrics
        precision_scores.append(precision(actual, recommended))
        recall_scores.append(recall(actual, recommended))
        accuracy_scores.append(accuracy(actual, recommended))
        map_scores.append(average_precision(actual, recommended))
        ndcg_scores.append(dcg_at_k(actual, recommended, k=5))

    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_accuracy = np.mean(accuracy_scores)
    mean_map = np.mean(map_scores)
    mean_ndcg = np.mean(ndcg_scores)

    return mean_precision, mean_recall, mean_accuracy, mean_map, mean_ndcg

#  usage:
mean_precision, mean_recall, mean_accuracy, mean_map, mean_ndcg = leave_one_out_evaluation(user_profiles, business_profiles, user_business_similarity)

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Average Precision (MAP): {mean_map}")
print(f"Mean Normalized Discounted Cumulative Gain (NDCG): {mean_ndcg}")


Mean Precision: 0.3364666666666667
Mean Recall: 0.2088945710505074
Mean Accuracy: 0.2088945710505074
Mean Average Precision (MAP): 0.09434059178586444
Mean Normalized Discounted Cumulative Gain (NDCG): 1.4227369020532081
