# Preprocessing the data

In [None]:
import LoadUtils
import pandas as pd

# Locations of the Yelp academic dataset files, relative to the working directory.
PATH_BUSINESS = "yelp_dataset/yelp_academic_dataset_business.json"
PATH_REVIEW = "yelp_dataset/yelp_academic_dataset_review.json"

In [None]:
# Upper bound on the number of lines read from each dataset file.
# A low value such as 100,000 is recommended for quicker iteration.
# Use -1 to load the full files, which makes processing SIGNIFICANTLY slower.
N_LINES = 1_000_000

In [None]:
# Load only businesses that are located in California AND currently open.
# NOTE(review): fn_all presumably requires every (predicate, key, value)
# condition in `args` to hold -- confirm against LoadUtils.
data_business = LoadUtils.load_matches(
    PATH_BUSINESS,
    n_lines=N_LINES,
    verbose=True,
    func=LoadUtils.fn_all,
    args=[
        (LoadUtils.fn_eq, "state", "CA"),
        (LoadUtils.fn_eq, "is_open", 1),
    ],
)
business_ca_open = pd.DataFrame(data_business)

# Report how many businesses passed the filter and preview the first rows.
print(f"Number of open businesses in California: {len(business_ca_open)}")
print(business_ca_open.head())

In [None]:
# IDs of the businesses selected above, used to decide which reviews to keep.
biz_ids = {business["business_id"] for business in data_business}

# The full review file has about 7 million entries, so loading it takes a
# while (around 3 minutes).
data_review = LoadUtils.load_matches(
    PATH_REVIEW,
    n_lines=N_LINES,
    verbose=True,
    func=LoadUtils.fn_in,
    args=("business_id", biz_ids),
)

In [None]:
# Turn the loaded reviews into a table and attach each business's name and
# categories via a join on business_id.
df_review = pd.DataFrame(data_review)
business_info = business_ca_open[['business_id', 'name', 'categories']]
review_ca_open = df_review.merge(business_info, on='business_id')

# Summarise the joined reviews and preview the first rows.
n_reviews = len(review_ca_open)
n_reviewed_businesses = review_ca_open['business_id'].nunique(dropna=False)
print(f"Number of reviews for open businesses in California: {n_reviews}")
print(f"These reviews span {n_reviewed_businesses} unique businesses")
print(review_ca_open.head())

In [None]:
# this cell takes pretty long on larger data, 4-5 minutes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Convert the textual reviews into a numerical representation using TF-IDF
# (Term Frequency-Inverse Document Frequency). English stop words are removed
# and the vocabulary is capped at the 5000 most frequent terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
review_tfidf = tfidf.fit_transform(review_ca_open['text'])

print(f"TF-IDF Matrix Shape: {review_tfidf.shape}")  # High-dimensional representation

# Apply Dimensionality Reduction using Principal Component Analysis (PCA).
# random_state is pinned because PCA's automatic solver selection may use a
# randomized SVD for a matrix of this size; without a seed the components (and
# therefore the clusters built from them) would differ between runs even
# though KMeans below is seeded.
# NOTE: .toarray() materialises a dense (n_reviews x 5000) matrix, so memory
# use grows quickly with N_LINES.
pca = PCA(n_components=50, random_state=42)  # Reduce to 50 components
reduced_reviews = pca.fit_transform(review_tfidf.toarray())

print(f"Reduced Dimensions Shape: {reduced_reviews.shape}")  # Lower-dimensional representation

# Clustering

In [None]:
# One row per review: the reduced components (named PC1..PCn) plus the ID of
# the business that review belongs to. The IDs are assigned positionally (as
# a plain array) so the index of review_ca_open plays no role.
component_names = [f'PC{number}' for number in range(1, reduced_reviews.shape[1] + 1)]
business_features = pd.DataFrame(reduced_reviews, columns=component_names)
business_features['business_id'] = review_ca_open['business_id'].to_numpy()

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster on the principal-component columns only. Besides 'business_id', a
# 'cluster' column is dropped when present: it appears once the final
# clustering cell has been run, and re-running this cell afterwards would
# otherwise feed the old labels back in as a feature. The matrix is built once
# here instead of once per k.
feature_matrix = business_features.drop(columns=['business_id', 'cluster'], errors='ignore')

# Determine the best number of clusters using Elbow Method
inertia = []
k_values = range(2, 30)

print("Now clustering with k=", end='')
for k in k_values:
    print(f" {k}...", end='')
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(feature_matrix)
    inertia.append(kmeans.inertia_)
print()  # terminate the progress line

# Plot the Elbow Curve
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.grid()
plt.show()

In [None]:
# Apply K-Means with the chosen number of clusters based on the Elbow Curve
optimal_k = 10
kmeans = KMeans(n_clusters=optimal_k, random_state=42)

# Fit on the principal-component columns only. 'cluster' is dropped when it
# already exists so that re-running this cell does not use the previous
# labels as an input feature.
cluster_inputs = business_features.drop(columns=['business_id', 'cluster'], errors='ignore')
business_features['cluster'] = kmeans.fit_predict(cluster_inputs)

# Each row of business_features is one review, so a business can appear in
# several clusters. Assign every business the cluster that most of its reviews
# fell into. A single groupby pass replaces filtering the whole frame once per
# business; value_counts() lists the most frequent label first.
df_biz_cluster = (
    business_features.groupby('business_id')['cluster']
    .agg(lambda labels: labels.value_counts().index[0])
    .reset_index()
)

# Merge cluster labels back to business data
business_clusters = business_ca_open[['business_id', 'name', 'categories']].merge(
    df_biz_cluster, on='business_id'
)

# Display sample results
print(business_clusters.head())

# Output

In [None]:
# Attach the cluster of the reviewed business to every review.
cluster_lookup = business_clusters[['business_id', 'cluster']]
user_cluster_data = review_ca_open.merge(cluster_lookup, on='business_id')

# Count how many reviews each user wrote per cluster.
reviews_per_user_cluster = user_cluster_data.groupby(['user_id', 'cluster']).size()
user_cluster_pref = reviews_per_user_cluster.reset_index(name='review_count')

# For every user keep the single row with the highest review count; its
# cluster becomes that user's preferred cluster.
top_row_labels = user_cluster_pref.groupby('user_id')['review_count'].idxmax()
user_pref_cluster = user_cluster_pref.loc[top_row_labels].rename(
    columns={'cluster': 'preferred_cluster'}
)

# Recommend businesses from the preferred cluster
def recommend_businesses(user_id, num_recommendations=5, random_state=None):
    """Suggest businesses from the user's preferred cluster.

    Reads the module-level tables ``user_pref_cluster``, ``business_clusters``
    and ``user_cluster_data``.

    Args:
        user_id: ID of the user to recommend for.
        num_recommendations: Maximum number of businesses to return.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            random selection can be reproduced. ``None`` (the default) keeps
            the selection unseeded.

    Returns:
        A message string if the user has no entry in ``user_pref_cluster``.
        Otherwise a DataFrame with the columns ``name`` and ``categories``
        holding a random sample of at most ``num_recommendations`` businesses
        from the preferred cluster that the user has not reviewed yet.
    """
    # Look the user up once; an empty result means no recorded interactions.
    preferred = user_pref_cluster.loc[
        user_pref_cluster['user_id'] == user_id, 'preferred_cluster'
    ]
    if preferred.empty:
        return f"User {user_id} has no interactions recorded."
    preferred_cluster = preferred.iloc[0]

    # Get all businesses in this cluster
    cluster_businesses = business_clusters[business_clusters['cluster'] == preferred_cluster]

    # Exclude businesses the user has already reviewed
    reviewed_businesses = user_cluster_data.loc[
        user_cluster_data['user_id'] == user_id, 'business_id'
    ].unique()
    recommendations = cluster_businesses[~cluster_businesses['business_id'].isin(reviewed_businesses)]

    # Draw a random sample (not a ranking), capped at the number of candidates
    sample_size = min(num_recommendations, len(recommendations))
    return recommendations[['name', 'categories']].sample(n=sample_size, random_state=random_state)

In [None]:
ILOC = 420

# Pick an example user by position. The position is clamped to the last row
# so that a small N_LINES (which yields fewer than ILOC + 1 users) does not
# raise an IndexError.
example_position = min(ILOC, len(user_pref_cluster) - 1)
user_id_example = user_pref_cluster['user_id'].iloc[example_position]
recommendations = recommend_businesses(user_id_example, num_recommendations=5)
print(f"Recommendations for User {user_id_example}")
recommendations