In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [23]:
data = pd.read_csv('Feedback_original.csv')

In [24]:
new_column_names = ['user_id'] + [f'restaurant_{i}' for i in range(1, len(data.columns))]
data.columns = new_column_names

In [None]:
print(data.head())
print(data.columns)

In [None]:
print("\nDistribution of ratings for each restaurant:")
for column in data.columns[1:]:
    plt.figure()
    plt.hist(data[column], bins=5, edgecolor='black')
    plt.title(f'Distribution of Ratings for {column}')
    plt.xlabel('Rating')
    plt.ylabel('Frequency')
    plt.show()

In [30]:
users = list(range(len(data)))
## Exclude 'user_id'
restaurants = new_column_names[1:]

In [32]:
# Build a list of tuples: (user, restaurant, rating)
ratings_list = []
for user in users:
    for restaurant in restaurants:
        rating = data.loc[user, restaurant]
        if pd.notna(rating):
            ratings_list.append((user, restaurants.index(restaurant), rating))

# Convert list to COO matrix
row_indices, col_indices, ratings = zip(*ratings_list)
rating_matrix = coo_matrix((ratings, (row_indices, col_indices)), shape=(len(users), len(restaurants))).toarray()

In [33]:
# ALS parameters
num_factors = 8  # Number of latent factors
num_iterations = 50  # Number of iterations
reg_param = 0.2  # Regularization parameter
tol = 1e-4

# ALS function with convergence criteria
def als_train(matrix, num_factors, num_iterations, reg_param, tol=1e-4):
    num_users, num_items = matrix.shape
    user_factors = np.random.normal(scale=1. / num_factors, size=(num_users, num_factors))
    item_factors = np.random.normal(scale=1. / num_factors, size=(num_items, num_factors))
    last_rmse = None

    for iteration in range(num_iterations):
        # Update user factors
        for u in range(num_users):
            relevant_items = matrix[u, :] > 0
            A = item_factors[relevant_items, :]
            V = matrix[u, relevant_items]
            user_factors[u, :] = np.linalg.solve(A.T @ A + reg_param * np.eye(num_factors), A.T @ V)

        # Update item factors
        for i in range(num_items):
            relevant_users = matrix[:, i] > 0
            A = user_factors[relevant_users, :]
            V = matrix[relevant_users, i]
            item_factors[i, :] = np.linalg.solve(A.T @ A + reg_param * np.eye(num_factors), A.T @ V)

        # Compute RMSE to check for convergence
        predicted_ratings = predict(user_factors, item_factors)
        mask = matrix > 0
        current_rmse = np.sqrt(mean_squared_error(matrix[mask], predicted_ratings[mask]))
        print(f"Iteration {iteration + 1}: RMSE = {current_rmse:.4f}")

        # Check for convergence
        if last_rmse and abs(last_rmse - current_rmse) < tol:
            print(f"Converged at iteration {iteration + 1}")
            break
        last_rmse = current_rmse

    return user_factors, item_factors

# Prediction function
def predict(user_factors, item_factors):
    return user_factors @ item_factors.T

# Train the model
user_factors, item_factors = als_train(rating_matrix, num_factors, num_iterations, reg_param, tol)

# Predict the ratings
predicted_ratings = predict(user_factors, item_factors)

# Calculate RMSE on known ratings
mask = rating_matrix > 0
rmse = np.sqrt(mean_squared_error(rating_matrix[mask], predicted_ratings[mask]))
print(f'Final RMSE on known ratings: {rmse:.4f}')

Iteration 1: RMSE = 0.5400
Iteration 2: RMSE = 0.4541
Iteration 3: RMSE = 0.4313
Iteration 4: RMSE = 0.4195
Iteration 5: RMSE = 0.4136
Iteration 6: RMSE = 0.4100
Iteration 7: RMSE = 0.4075
Iteration 8: RMSE = 0.4055
Iteration 9: RMSE = 0.4039
Iteration 10: RMSE = 0.4027
Iteration 11: RMSE = 0.4018
Iteration 12: RMSE = 0.4011
Iteration 13: RMSE = 0.4007
Iteration 14: RMSE = 0.4004
Iteration 15: RMSE = 0.4002
Iteration 16: RMSE = 0.4000
Iteration 17: RMSE = 0.3999
Iteration 18: RMSE = 0.3999
Converged at iteration 18
Final RMSE on known ratings: 0.3999


In [34]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

In [35]:
# Define global mean and user mean baselines
global_mean = np.mean(rating_matrix[rating_matrix > 0])

# Global Mean RMSE
global_mean_predictions = np.full_like(rating_matrix, global_mean)
global_mean_rmse = np.sqrt(mean_squared_error(rating_matrix[rating_matrix > 0], global_mean_predictions[rating_matrix > 0]))

# User Mean RMSE
user_means = np.array([np.mean(row[row > 0]) if np.any(row > 0) else global_mean for row in rating_matrix])

# Initialize an empty user mean predictions matrix
user_mean_predictions = np.zeros_like(rating_matrix)
for i in range(rating_matrix.shape[0]):
    user_mean_predictions[i, rating_matrix[i] > 0] = user_means[i]

user_mean_rmse = np.sqrt(mean_squared_error(rating_matrix[rating_matrix > 0], user_mean_predictions[rating_matrix > 0]))
print(f'Global Mean RMSE: {global_mean_rmse:.4f}')
print(f'User Mean RMSE: {user_mean_rmse:.4f}')


Global Mean RMSE: 1.2231
User Mean RMSE: 1.0623


In [37]:
#KNN
def knn_predictions(rating_matrix, k):
    knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k, n_jobs=-1)
    knn.fit(rating_matrix)
    distances, indices = knn.kneighbors(rating_matrix)

    knn_predictions = np.zeros(rating_matrix.shape)
    for i in range(rating_matrix.shape[0]):
        dist = distances[i]
        weights = np.array([1/d if d > 0 else 0 for d in dist])
        norm = np.sum(weights)
        neighbors = indices[i]

        neighbor_ratings = rating_matrix[neighbors, :]
        weighted_ratings = (weights[:, np.newaxis] * neighbor_ratings).sum(axis=0)
        knn_predictions[i] = weighted_ratings / norm if norm > 0 else 0

    return knn_predictions

# Function to compute RMSE for a given k
def compute_rmse(rating_matrix, k):
    knn_pred = knn_predictions(rating_matrix, k)
    return np.sqrt(mean_squared_error(rating_matrix[rating_matrix > 0], knn_pred[rating_matrix > 0]))

# Range of k values to test
k_values = [1, 3, 5, 7, 10]

# Compute RMSE for each k and store the results
rmse_results = []
for k in k_values:
    rmse = compute_rmse(rating_matrix, k)
    rmse_results.append((k, rmse))

# Find the best k value with the lowest RMSE
best_k, best_rmse = min(rmse_results, key=lambda x: x[1])

print(f"Best k value: {best_k}, RMSE: {best_rmse}")

Best k value: 10, RMSE: 1.0273378569666962


In [40]:
from sklearn.cluster import KMeans

In [41]:
#Kmeans
def kmeans_predictions(rating_matrix, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    user_clusters = kmeans.fit_predict(rating_matrix)

    # Initialize the predictions matrix
    predictions = np.zeros(rating_matrix.shape)

    # Calculate the mean ratings per cluster
    for cluster in range(n_clusters):
        cluster_indices = np.where(user_clusters == cluster)[0]
        cluster_mean = rating_matrix[cluster_indices].mean(axis=0)

        # Assign the cluster's mean ratings to all users in the cluster
        predictions[cluster_indices, :] = cluster_mean

    return predictions

# compute RMSE for different cluster counts
def compute_rmse_kmeans(rating_matrix, n_clusters):
    kmeans_pred = kmeans_predictions(rating_matrix, n_clusters)
    return np.sqrt(mean_squared_error(rating_matrix[rating_matrix > 0], kmeans_pred[rating_matrix > 0]))

# Range of cluster values to test
cluster_values = [2, 3, 5, 7, 10]

# Compute RMSE for each cluster count and store the results
rmse_results = []
for n_clusters in cluster_values:
    rmse = compute_rmse_kmeans(rating_matrix, n_clusters)
    rmse_results.append((n_clusters, rmse))

# Find the best cluster value with the lowest RMSE
best_clusters, best_rmse = min(rmse_results, key=lambda x: x[1])

print(f"Best cluster count: {best_clusters}, RMSE: {best_rmse}")

Best cluster count: 10, RMSE: 0.7110804923572833


