# Preprocessing the data

In [None]:
import LoadUtils
import pandas as pd

# Relative paths to the Yelp academic dataset files read by the loading cells:
# one for the business records, one for the reviews.
PATH_BUSINESS = 'yelp_dataset/yelp_academic_dataset_business.json'
PATH_REVIEW = 'yelp_dataset/yelp_academic_dataset_review.json'

In [None]:
# Line limit handed to the loaders as `n_lines`.
# A low value such as 100,000 is recommended for quicker iteration; use -1 to
# load the full files, which makes processing SIGNIFICANTLY slower.
N_LINES = 1_000_000

In [None]:
# Load only the businesses that are in California AND currently open:
# the two equality checks are handed to fn_all as a list of predicates.
data_business = LoadUtils.load_matches(
    PATH_BUSINESS,
    n_lines=N_LINES,
    verbose=True,
    func=LoadUtils.fn_all,
    args=[
        (LoadUtils.fn_eq, "state", "CA"),
        (LoadUtils.fn_eq, "is_open", 1),
    ],
)
business_ca_open = pd.DataFrame(data_business)

# Report how many businesses passed the filter and preview the first rows
print(f"Number of open businesses in California: {len(business_ca_open)}")
print(business_ca_open.head())

In [None]:
# IDs of the businesses kept by the filter above; reviews are matched against this set
biz_ids = {item["business_id"] for item in data_business}

# The full review file has about 7 million entries, so loading it takes
# pretty long (around 3 minutes)
data_review = LoadUtils.load_matches(
    PATH_REVIEW,
    n_lines=N_LINES,
    verbose=True,
    func=LoadUtils.fn_in,
    args=("business_id", biz_ids),
)

In [None]:
# Build the review table and discard the columns that the later cells of this
# notebook never read (only the IDs and the review text are used further on)
df_review = pd.DataFrame(data_review)
review_ca_open = df_review.drop(columns=['stars', 'useful', 'funny', 'cool', 'date'])

# Summarise the filtered reviews: distinct users, reviews and businesses
print(
    f"{len(set(review_ca_open['user_id']))} users"
    f" made {len(set(review_ca_open['review_id']))} reviews"
    f" across {len(set(review_ca_open['business_id']))} businesses"
)
print(review_ca_open.head())

In [None]:
# This cell takes pretty long on larger data, 4-5 minutes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Convert the textual reviews into a numerical representation using TF-IDF
# (Term Frequency-Inverse Document Frequency)
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 most important terms
review_tfidf = tfidf.fit_transform(review_ca_open['text'])

print(f"TF-IDF Matrix Shape: {review_tfidf.shape}")  # High-dimensional representation

# Apply Dimensionality Reduction using Principal Component Analysis (PCA).
# random_state is fixed (same seed as the K-Means cells) because PCA's default
# solver choice can fall back to a randomized SVD for an input of this size;
# without a seed the components - and therefore the clusters and the
# hand-picked optimal_k - would differ from run to run.
pca = PCA(n_components=50, random_state=42)  # Reduce to 50 components
# NOTE: .toarray() materialises a dense (n_reviews x 5000) matrix, so memory
# use grows quickly with N_LINES.
reduced_reviews = pca.fit_transform(review_tfidf.toarray())

print(f"Reduced Dimensions Shape: {reduced_reviews.shape}")  # Lower-dimensional representation

# Clustering

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow Method: fit K-Means once per candidate k and record the inertia of
# each fit, so the best number of clusters can be read off the curve below
k_values = range(2, 31)
inertia = []

print("Now clustering with k=", end='')
for k in k_values:
    print(f" {k}...", end='')
    kmeans = KMeans(n_clusters=k, random_state=42).fit(reduced_reviews)
    inertia.append(kmeans.inertia_)

# Draw the Elbow Curve (inertia against number of clusters)
plt.figure(figsize=(8, 6))
plt.plot(k_values, inertia, linestyle='--', marker='o')
plt.grid()
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Apply K-Means with the number of clusters chosen from the Elbow Curve
optimal_k = 20
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(reduced_reviews)

# Attach each review's cluster label to its identifying columns.
# `clusters` holds one label per row of reduced_reviews, which was built from
# review_ca_open['text'] in row order, so the raw array is assigned by
# position. Wrapping it in pd.Series would align on index labels instead and
# silently yield NaN if review_ca_open ever had a non-default index.
clustered_reviews = review_ca_open[['review_id', 'user_id', 'business_id']]
clustered_reviews = clustered_reviews.assign(cluster=clusters)
print(clustered_reviews.head())

In [None]:
def get_cluster_of(k):
    """
    Map every distinct value of column `k` in clustered_reviews to the
    cluster it appears in most often.

    Returns a dataframe with one row per value of `k`, holding the `k`
    column and its most frequent 'cluster'.
    """
    # Count how many reviews each (k, cluster) pair has
    counts = (
        clustered_reviews
        .groupby([k, 'cluster'])
        .size()
        .reset_index(name='occurrence')
    )
    # For each value of k, keep the row with the highest count
    top_rows = counts.groupby(k)['occurrence'].idxmax()
    dominant = counts.loc[top_rows]
    return dominant.drop(columns='occurrence')

In [None]:
# Most common cluster per business and per user, built with the same helper
biz_cluster, user_cluster = (
    get_cluster_of(column) for column in ('business_id', 'user_id')
)

# Output

In [None]:
def recommend_businesses(user_id, num_recommendations=5, random_state=None):
    """
    Recommend businesses from the cluster a user most commonly belongs to.

    Candidates are the businesses whose cluster in biz_cluster equals the
    user's cluster in user_cluster, excluding every business the user has
    already reviewed. The result is a random sample of those candidates,
    not a ranking.

    Args:
        user_id: ID of the user to recommend for; must appear in user_cluster.
        num_recommendations: Maximum number of businesses to return. Fewer
            rows are returned when fewer candidates are available.
        random_state: Optional seed forwarded to DataFrame.sample so the
            selection can be reproduced. None (the default) keeps the
            original unseeded behaviour.

    Returns:
        DataFrame with the 'name' and 'categories' of the selected businesses.

    Raises:
        IndexError: If user_id has no entry in user_cluster.
    """
    # Get the cluster that the user belongs to
    user_rows = user_cluster.loc[user_cluster['user_id'] == user_id, 'cluster']
    if user_rows.empty:
        # Same exception type as the bare positional lookup would raise,
        # but with a message that names the offending user.
        raise IndexError(f"user_id {user_id!r} has no cluster assignment")
    c = user_rows.iloc[0]

    # Get all businesses in this cluster
    businesses = biz_cluster[biz_cluster['cluster'] == c]

    # Exclude businesses the user has already reviewed
    reviewed_businesses = clustered_reviews.loc[
        clustered_reviews['user_id'] == user_id, 'business_id'
    ].unique()
    candidates = businesses[~businesses['business_id'].isin(reviewed_businesses)]

    # Randomly pick the requested number of candidates (or all of them if
    # fewer are available)
    n_picks = min(num_recommendations, len(candidates))
    picks = candidates.sample(n=n_picks, random_state=random_state)

    # Look up the display columns for the picked businesses
    picks = picks.merge(
        business_ca_open[['business_id', 'name', 'categories']],
        on='business_id',
    )
    return picks[['name', 'categories']]

In [None]:
# Demo: recommend for one sample user. The position is clamped to the last
# row so the cell still runs when fewer than 1000 users were loaded
# (e.g. with a small N_LINES).
uid = user_cluster.iloc[min(999, len(user_cluster) - 1)]['user_id']

recs = recommend_businesses(uid)
print(f"For user with id {uid}, these are the recommendations:")
print(recs)

In [None]:
# Find insights for users and business owners using Association Rules
from mlxtend.frequent_patterns import apriori, association_rules

# Prepare data for Apriori (one-hot encoding): one row per user, one column
# per cluster, True when the user has at least one review in that cluster
user_cluster_matrix = clustered_reviews.pivot_table(
    index='user_id', columns='cluster', aggfunc='size', fill_value=0
)
# Vectorised comparison instead of the deprecated element-wise applymap
user_cluster_matrix = user_cluster_matrix > 0

# Apply Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(user_cluster_matrix, min_support=0.03, use_colnames=True)

# Generate association rules.
# num_itemsets is the number of transactions (here: users) in the one-hot
# input; it must be an integer, not the dataframe itself.
# NOTE(review): the num_itemsets parameter only exists in recent mlxtend
# releases - confirm the installed version supports it.
rules = association_rules(
    frequent_itemsets,
    num_itemsets=len(user_cluster_matrix),
    metric="lift",
    min_threshold=1.0,
)

# Sort rules by lift and display
rules_sorted = rules.sort_values(by="lift", ascending=False)
print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())