<a href="https://colab.research.google.com/github/Dhwaj-054/lit-college-codes/blob/main/RS_Expt3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Generate a synthetic dataset suitable for building a user-based recommendation system, including user IDs, item IDs, and ratings, and save it as a CSV file.

In [None]:
# Column schema for the synthetic ratings table: one row per (user, item, rating).
columns = ['user_id', 'item_id', 'rating']
print(columns)

['user_id', 'item_id', 'rating']


In [None]:
import pandas as pd
import numpy as np

# Local, seeded generator so the dataset is identical on every run.
SEED = 42
rng = np.random.default_rng(SEED)

num_users = 100
num_items = 50
num_samples = 5000  # rows drawn before duplicate (user, item) pairs are removed

# IDs are 1-based; ratings are integers in 1..5 (upper bounds are exclusive).
user_ids = rng.integers(1, num_users + 1, size=num_samples)
item_ids = rng.integers(1, num_items + 1, size=num_samples)
ratings = rng.integers(1, 6, size=num_samples)

data = {'user_id': user_ids, 'item_id': item_ids, 'rating': ratings}

# Sampling with replacement yields repeated (user, item) pairs that carry
# conflicting ratings; keep only the first rating for each pair so a user has
# at most one rating per item.
df = (
    pd.DataFrame(data, columns=['user_id', 'item_id', 'rating'])
    .drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    .reset_index(drop=True)
)
display(df.head())

Unnamed: 0,user_id,item_id,rating
0,24,42,4
1,44,29,5
2,32,21,4
3,25,31,2
4,88,31,2


In [None]:
# Persist the ratings table; index=False keeps the row index out of the CSV.
df.to_csv('synthetic_recommendation_data.csv', index=False)

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the ratings and the missing-value mask drawn in
# this cell are the same on every run.
np.random.seed(42)

num_movies = 5
num_users = 4

# Random integer ratings in 1..5, stored as float so entries can hold NaN.
ratings_matrix = np.random.randint(1, 6, size=(num_movies, num_users)).astype(float)

# Blank out each entry independently with probability `missing_rate`.
missing_rate = 0.2
mask = np.random.choice([True, False], size=(num_movies, num_users), p=[missing_rate, 1-missing_rate])
ratings_matrix[mask] = np.nan

# Rows are movies (movie_1..movie_N), columns are users (user_1..user_M).
movie_user_df = pd.DataFrame(ratings_matrix,
                             index=[f'movie_{i+1}' for i in range(num_movies)],
                             columns=[f'user_{j+1}' for j in range(num_users)])

print("Generated Movie-User Ratings Matrix (with missing values):")
display(movie_user_df)

Generated Movie-User Ratings Matrix (with missing values):


Unnamed: 0,user_1,user_2,user_3,user_4
movie_1,2.0,3.0,,1.0
movie_2,3.0,2.0,3.0,3.0
movie_3,4.0,3.0,,4.0
movie_4,4.0,4.0,,5.0
movie_5,1.0,,5.0,4.0


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Boolean frame, True where movie_user_df currently holds NaN.
nan_indices = movie_user_df.isnull()

# Draw one random integer rating in 1..5 for every NaN cell.
random_fill_values = np.random.randint(1, 6, size=nan_indices.sum().sum())

# Consume the random values column by column. NOTE: this overwrites
# movie_user_df in place, so after this cell the frame no longer records which
# ratings were originally missing.
col_idx = 0
for col in movie_user_df.columns:
    # Keep only the True entries, so the index lists the rows that are NaN in this column.
    col_nan_indices = nan_indices[col][nan_indices[col]]
    num_nans_in_col = col_nan_indices.sum()
    if num_nans_in_col > 0:
        movie_user_df.loc[col_nan_indices.index, col] = random_fill_values[col_idx : col_idx + num_nans_in_col]
        col_idx += num_nans_in_col

print("Movie-User Ratings Matrix after filling NaN with random numbers:")
display(movie_user_df)


# User-user similarity: users are columns, so transpose to get one row per user.
user_similarity = cosine_similarity(movie_user_df.T)
user_similarity_df = pd.DataFrame(user_similarity,
                                   index=movie_user_df.columns,
                                   columns=movie_user_df.columns)

print("User Similarity Matrix (Cosine Similarity):")
display(user_similarity_df)

# Movie-movie similarity: movies are already the rows.
movie_similarity = cosine_similarity(movie_user_df)
movie_similarity_df = pd.DataFrame(movie_similarity,
                                   index=movie_user_df.index,
                                   columns=movie_user_df.index)

print("\nMovie Similarity Matrix (Cosine Similarity):")
display(movie_similarity_df)

Movie-User Ratings Matrix after filling NaN with random numbers:


Unnamed: 0,user_1,user_2,user_3,user_4
movie_1,2.0,3.0,3.0,1.0
movie_2,3.0,2.0,3.0,3.0
movie_3,4.0,3.0,3.0,4.0
movie_4,4.0,4.0,4.0,5.0
movie_5,1.0,3.0,5.0,4.0


User Similarity Matrix (Cosine Similarity):


Unnamed: 0,user_1,user_2,user_3,user_4
user_1,1.0,0.924785,0.858238,0.918658
user_2,0.924785,1.0,0.97288,0.944473
user_3,0.858238,0.97288,1.0,0.948174
user_4,0.918658,0.944473,0.948174,1.0



Movie Similarity Matrix (Cosine Similarity):


Unnamed: 0,movie_1,movie_2,movie_3,movie_4,movie_5
movie_1,1.0,0.898807,0.884652,0.902976,0.875936
movie_2,0.898807,1.0,0.990601,0.987997,0.905392
movie_3,0.884652,0.990601,1.0,0.993127,0.87133
movie_4,0.902976,0.987997,0.993127,1.0,0.917786
movie_5,0.875936,0.905392,0.87133,0.917786,1.0


In [None]:
def get_neighbors(similarity_matrix, k):
    """Return, for every row, the indices of its k most similar other rows.

    Args:
        similarity_matrix: Square 2-D NumPy array where entry [i, j] is the
            similarity between entity i and entity j.
        k: Maximum number of neighbours to return per entity.

    Returns:
        A list with one 1-D index array per row, ordered from most to least
        similar. An array is shorter than k when fewer than k other entities
        exist.
    """
    neighbors = []
    for i in range(similarity_matrix.shape[0]):
        ranked = np.argsort(similarity_matrix[i])[::-1]
        # Drop the entity itself explicitly instead of assuming it sorts first:
        # when another entity ties with the self-similarity, position 0 is not
        # guaranteed to be `i`.
        ranked = ranked[ranked != i]
        neighbors.append(ranked[:k])
    return neighbors

# Requested neighbourhood size. get_neighbors slices the ranked list, so each
# entity receives at most (number of entities - 1) neighbours even if k is larger.
k = 5
user_neighbors = get_neighbors(user_similarity, k) # similar users
movie_neighbors = get_neighbors(movie_similarity, k) # similar movies

print(f"Nearest {k} neighbors for each user (indices):")
print(user_neighbors)

print(f"\nNearest {k} neighbors for each movie (indices):")
print(movie_neighbors)

Nearest 5 neighbors for each user (indices):
[array([1, 3, 2]), array([2, 3, 0]), array([1, 3, 0]), array([2, 1, 0])]

Nearest 5 neighbors for each movie (indices):
[array([3, 1, 2, 4]), array([2, 3, 4, 0]), array([3, 1, 0, 4]), array([2, 1, 4, 0]), array([3, 1, 0, 2])]


In [None]:
def predict_rating(user_id, movie_id, ratings, user_neighbors,
                   movie_neighbors, user_similarity, movie_similarity, kind="user"):
    """Predict a rating as the similarity-weighted mean of neighbour ratings.

    Args:
        user_id: 0-based column position of the user in `ratings`.
        movie_id: 0-based row position of the movie in `ratings`.
        ratings: DataFrame with movies as rows and users as columns. Entries
            equal to 0 or NaN are treated as "not rated".
        user_neighbors: Per-user arrays of neighbour indices (used when kind="user").
        movie_neighbors: Per-movie arrays of neighbour indices (used when kind="movie").
        user_similarity: 2-D array of user-user similarities (used when kind="user").
        movie_similarity: 2-D array of movie-movie similarities (used when kind="movie").
        kind: "user" for user-based or "movie" for movie-based prediction.

    Returns:
        The predicted rating. Falls back to the mean of all known ratings when
        no neighbour has a usable rating or the neighbour weights sum to zero,
        and to 0 when the matrix holds no known rating at all.

    Raises:
        ValueError: If `kind` is neither "user" nor "movie".
    """
    if kind == "user":
        # Ratings that similar users gave to this movie.
        neighbors = user_neighbors[user_id]
        sim_scores = user_similarity[user_id, neighbors]
        neighbor_ratings = ratings.iloc[movie_id, neighbors].values
    elif kind == "movie":
        # Ratings that this user gave to similar movies.
        neighbors = movie_neighbors[movie_id]
        sim_scores = movie_similarity[movie_id, neighbors]
        neighbor_ratings = ratings.iloc[neighbors, user_id].values
    else:
        raise ValueError("Invalid 'kind'. Use 'user' or 'movie'.")

    sim_scores = np.asarray(sim_scores, dtype=float)
    neighbor_ratings = np.asarray(neighbor_ratings, dtype=float)

    # NaN compares unequal to 0, so it must be excluded explicitly; otherwise a
    # single missing neighbour rating turns the whole prediction into NaN.
    valid_mask = (neighbor_ratings != 0) & ~np.isnan(neighbor_ratings)
    weight_total = sim_scores[valid_mask].sum()
    if valid_mask.any() and weight_total != 0:
        return np.dot(sim_scores[valid_mask], neighbor_ratings[valid_mask]) / weight_total

    # Fallback: global mean over known ratings (also avoids a 0/0 division when
    # every usable neighbour has zero similarity).
    all_ratings = np.asarray(ratings.values, dtype=float)
    known_ratings = all_ratings[(all_ratings != 0) & ~np.isnan(all_ratings)]
    if known_ratings.size > 0:
        return np.mean(known_ratings)
    return 0

# Example: user-based prediction for user index 0 / movie index 0
# (0-based positions, i.e. user_1 and movie_1 in the DataFrame labels).
predicted_rating_user_based = predict_rating(0, 0, movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, kind="user")
print(f"\nPredicted rating for user 1, movie 1 (user-based): {predicted_rating_user_based}")

# Same (user, movie) pair, movie-based prediction.
predicted_rating_movie_based = predict_rating(0, 0, movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, kind="movie")
print(f"Predicted rating for user 1, movie 1 (movie-based): {predicted_rating_movie_based}")


Predicted rating for user 1, movie 1 (user-based): 2.31993597130316
Predicted rating for user 1, movie 1 (movie-based): 3.0100373040899204


In [None]:
# Round both predictions to two decimals for display.
rounded_predicted_rating_user_based = round(predicted_rating_user_based, 2)
rounded_predicted_rating_movie_based = round(predicted_rating_movie_based, 2)

print(f"\nRounded predicted rating for user 1, movie 1 (user-based): {rounded_predicted_rating_user_based}")
print(f"Rounded predicted rating for user 1, movie 1 (movie-based): {rounded_predicted_rating_movie_based}")


Rounded predicted rating for user 1, movie 1 (user-based): 2.32
Rounded predicted rating for user 1, movie 1 (movie-based): 3.01


In [None]:
def recommend_for_all_users(ratings, user_neighbors, movie_neighbors, user_similarity, movie_similarity, N=3, kind="user"):
    """Build top-N recommendations of unrated movies for every user.

    Args:
        ratings: DataFrame with movies as rows and users as columns. Entries
            equal to 0 or NaN mark movies the user has not rated.
        user_neighbors, movie_neighbors, user_similarity, movie_similarity:
            Passed through unchanged to `predict_rating`.
        N: Maximum number of recommendations per user.
        kind: "user" or "movie"; passed through to `predict_rating`.

    Returns:
        Dict mapping 'user_<n>' (1-based column position) to a list of
        (movie_position, predicted_rating) tuples sorted by predicted rating,
        highest first. The list is empty when the user has no unrated movie.
    """
    all_recommendations = {}
    num_users = ratings.shape[1]

    for user_id in range(num_users):
        user_ratings = ratings.iloc[:, user_id].to_numpy(dtype=float)
        # A movie counts as unrated when it is 0 or NaN; checking only `== 0`
        # misses matrices that encode missing ratings as NaN.
        unrated_movies_indices = np.flatnonzero((user_ratings == 0) | np.isnan(user_ratings))

        preds = []
        for movie_id in unrated_movies_indices:
            predicted_rating = predict_rating(user_id, movie_id, ratings,
                                              user_neighbors, movie_neighbors,
                                              user_similarity, movie_similarity,
                                              kind=kind)
            # NaN does not order consistently under sorted(); skip such predictions.
            if not np.isnan(predicted_rating):
                preds.append((movie_id, predicted_rating))

        top_recs = sorted(preds, key=lambda x: x[1], reverse=True)[:N]
        all_recommendations[f'user_{user_id+1}'] = top_recs

    return all_recommendations

# Top-N recommendations for every user, user-based.
# recommend_for_all_users only considers entries equal to 0 as unrated, so a
# matrix without zeros yields empty lists.
top_n = 3
all_user_recommendations = recommend_for_all_users(movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, N=top_n, kind="user")

print(f"\nTop {top_n} recommendations for each user (user-based):")
for user, recs in all_user_recommendations.items():
    # Translate integer movie positions back into the DataFrame's row labels.
    print(f"{user}: {[(movie_user_df.index[movie_id], round(rating, 2)) for movie_id, rating in recs]}")

# Same request, movie-based.
all_movie_recommendations = recommend_for_all_users(movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, N=top_n, kind="movie")

print(f"\nTop {top_n} recommendations for each user (movie-based):")
for user, recs in all_movie_recommendations.items():
     print(f"{user}: {[(movie_user_df.index[movie_id], round(rating, 2)) for movie_id, rating in recs]}")


Top 3 recommendations for each user (user-based):
user_1: []
user_2: []
user_3: []
user_4: []

Top 3 recommendations for each user (movie-based):
user_1: []
user_2: []
user_3: []
user_4: []


In [None]:
# Boolean frame, True where movie_user_df currently holds NaN.
# NOTE(review): this repeats the earlier NaN-filling cell; if that cell has
# already run, no NaN remains and the loop below changes nothing.
nan_indices = movie_user_df.isnull()

# Draw one random integer rating in 1..5 for every NaN cell.
random_fill_values = np.random.randint(1, 6, size=nan_indices.sum().sum())

# Consume the random values column by column, overwriting movie_user_df in place.
col_idx = 0
for col in movie_user_df.columns:
    # Keep only the True entries, so the index lists the rows that are NaN in this column.
    col_nan_indices = nan_indices[col][nan_indices[col]]
    num_nans_in_col = col_nan_indices.sum()
    if num_nans_in_col > 0:
        movie_user_df.loc[col_nan_indices.index, col] = random_fill_values[col_idx : col_idx + num_nans_in_col]
        col_idx += num_nans_in_col

print("Movie-User Ratings Matrix after filling NaN with random numbers:")
display(movie_user_df)

Movie-User Ratings Matrix after filling NaN with random numbers:


Unnamed: 0,user_1,user_2,user_3,user_4
movie_1,2.0,3.0,3.0,1.0
movie_2,3.0,2.0,3.0,3.0
movie_3,4.0,3.0,3.0,4.0
movie_4,4.0,4.0,4.0,5.0
movie_5,1.0,3.0,5.0,4.0


In [None]:
# Top-3 user-based recommendations for every user.
recommendations = recommend_for_all_users(movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, N=3, kind="user")

# Map integer movie positions to row labels and round the scores for display.
for user_id, recs in recommendations.items():
    rec_movie_names = [(movie_user_df.index[movie], round(rating, 2)) for movie, rating in recs]
    print(f"\nTop recommendations for {user_id}: {rec_movie_names}")


Top recommendations for user_1: []

Top recommendations for user_2: []

Top recommendations for user_3: []

Top recommendations for user_4: []


Repeat the neighbour selection and rating prediction with k = 2 nearest neighbours.

In [None]:
def get_neighbors(similarity_matrix, k):
    """Return, for every row, the indices of its k most similar other rows.

    Args:
        similarity_matrix: Square 2-D NumPy array where entry [i, j] is the
            similarity between entity i and entity j.
        k: Maximum number of neighbours to return per entity.

    Returns:
        A list with one 1-D index array per row, ordered from most to least
        similar. An array is shorter than k when fewer than k other entities
        exist.
    """
    neighbors = []
    for i in range(similarity_matrix.shape[0]):
        ranked = np.argsort(similarity_matrix[i])[::-1]
        # Drop the entity itself explicitly instead of assuming it sorts first:
        # when another entity ties with the self-similarity, position 0 is not
        # guaranteed to be `i`.
        ranked = ranked[ranked != i]
        neighbors.append(ranked[:k])
    return neighbors

# Recompute both neighbour lists with a smaller neighbourhood; this overwrites
# the `k`, `user_neighbors` and `movie_neighbors` globals used by later cells.
k = 2
user_neighbors = get_neighbors(user_similarity, k) # similar users
movie_neighbors = get_neighbors(movie_similarity, k) # similar movies

print(f"Nearest {k} neighbors for each user (indices):")
print(user_neighbors)

print(f"\nNearest {k} neighbors for each movie (indices):")
print(movie_neighbors)

Nearest 2 neighbors for each user (indices):
[array([1, 3]), array([2, 3]), array([1, 3]), array([2, 1])]

Nearest 2 neighbors for each movie (indices):
[array([3, 1]), array([2, 3]), array([3, 1]), array([2, 1]), array([3, 1])]


In [None]:
def predict_rating(user_id, movie_id, ratings, user_neighbors,
                   movie_neighbors, user_similarity, movie_similarity, kind="user"):
    """Predict a rating as the similarity-weighted mean of neighbour ratings.

    Args:
        user_id: 0-based column position of the user in `ratings`.
        movie_id: 0-based row position of the movie in `ratings`.
        ratings: DataFrame with movies as rows and users as columns. Entries
            equal to 0 or NaN are treated as "not rated".
        user_neighbors: Per-user arrays of neighbour indices (used when kind="user").
        movie_neighbors: Per-movie arrays of neighbour indices (used when kind="movie").
        user_similarity: 2-D array of user-user similarities (used when kind="user").
        movie_similarity: 2-D array of movie-movie similarities (used when kind="movie").
        kind: "user" for user-based or "movie" for movie-based prediction.

    Returns:
        The predicted rating. Falls back to the mean of all known ratings when
        no neighbour has a usable rating or the neighbour weights sum to zero,
        and to 0 when the matrix holds no known rating at all.

    Raises:
        ValueError: If `kind` is neither "user" nor "movie".
    """
    if kind == "user":
        # Ratings that similar users gave to this movie.
        neighbors = user_neighbors[user_id]
        sim_scores = user_similarity[user_id, neighbors]
        neighbor_ratings = ratings.iloc[movie_id, neighbors].values
    elif kind == "movie":
        # Ratings that this user gave to similar movies.
        neighbors = movie_neighbors[movie_id]
        sim_scores = movie_similarity[movie_id, neighbors]
        neighbor_ratings = ratings.iloc[neighbors, user_id].values
    else:
        raise ValueError("Invalid 'kind'. Use 'user' or 'movie'.")

    sim_scores = np.asarray(sim_scores, dtype=float)
    neighbor_ratings = np.asarray(neighbor_ratings, dtype=float)

    # NaN compares unequal to 0, so it must be excluded explicitly; otherwise a
    # single missing neighbour rating turns the whole prediction into NaN.
    valid_mask = (neighbor_ratings != 0) & ~np.isnan(neighbor_ratings)
    weight_total = sim_scores[valid_mask].sum()
    if valid_mask.any() and weight_total != 0:
        return np.dot(sim_scores[valid_mask], neighbor_ratings[valid_mask]) / weight_total

    # Fallback: global mean over known ratings (also avoids a 0/0 division when
    # every usable neighbour has zero similarity).
    all_ratings = np.asarray(ratings.values, dtype=float)
    known_ratings = all_ratings[(all_ratings != 0) & ~np.isnan(all_ratings)]
    if known_ratings.size > 0:
        return np.mean(known_ratings)
    return 0

# Example: user-based prediction for user index 0 / movie index 0
# (0-based positions, i.e. user_1 and movie_1 in the DataFrame labels).
predicted_rating_user_based = predict_rating(0, 0, movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, kind="user")
print(f"\nPredicted rating for user 1, movie 1 (user-based): {predicted_rating_user_based}")

# Same (user, movie) pair, movie-based prediction.
predicted_rating_movie_based = predict_rating(0, 0, movie_user_df, user_neighbors, movie_neighbors, user_similarity, movie_similarity, kind="movie")
print(f"Predicted rating for user 1, movie 1 (movie-based): {predicted_rating_movie_based}")


Predicted rating for user 1, movie 1 (user-based): 2.003323483668203
Predicted rating for user 1, movie 1 (movie-based): 3.5011569889261276


In [None]:
# Round both predictions to two decimals for display.
rounded_predicted_rating_user_based = round(predicted_rating_user_based, 2)
rounded_predicted_rating_movie_based = round(predicted_rating_movie_based, 2)

print(f"\nRounded predicted rating for user 1, movie 1 (user-based): {rounded_predicted_rating_user_based}")
print(f"Rounded predicted rating for user 1, movie 1 (movie-based): {rounded_predicted_rating_movie_based}")


Rounded predicted rating for user 1, movie 1 (user-based): 2.0
Rounded predicted rating for user 1, movie 1 (movie-based): 3.5


In [None]:
# Predict a rating for every movie for one user (0-based index 0, i.e. user_1).
user_id = 0
preds = []  # (movie position, predicted rating) pairs, consumed by the next cell

# Every movie is scored, including those the user already has a rating for.
# `kind` is left at its default, so these are user-based predictions.
for movie_id in range(movie_user_df.shape[0]):
    predicted_rating = predict_rating(user_id, movie_id, movie_user_df,
                                     user_neighbors, movie_neighbors,
                                     user_similarity, movie_similarity,
                                     )
    print(f"user {user_id + 1}, movie {movie_id + 1}, predicted rating: {predicted_rating}")
    preds.append((movie_id, predicted_rating))

user 1, movie 1, predicted rating: 2.003323483668203
user 1, movie 2, predicted rating: 2.498338258165899
user 1, movie 3, predicted rating: 3.4983382581658984
user 1, movie 4, predicted rating: 4.498338258165899
user 1, movie 5, predicted rating: 3.4983382581658984


In [None]:
# Rank the (movie position, predicted rating) pairs, highest prediction first.
sorted_preds = sorted(preds, key=lambda x: x[1], reverse=True)

# Keep the three best-scoring movies.
top_3_preds = sorted_preds[:3]

print(f"\nTop 3 movie suggestions for user 1 based on predictions:")
for movie_id, rating in top_3_preds:
    # Map the integer position back to the DataFrame's row label.
    print(f"- {movie_user_df.index[movie_id]}: {round(rating, 2)}")


Top 3 movie suggestions for user 1 based on predictions:
- movie_4: 4.5
- movie_3: 3.5
- movie_5: 3.5


In [None]:
from sklearn.metrics import mean_squared_error

# RMSE demonstration for a single user.
#
# Caveat: movie_user_df was modified in place when its NaNs were replaced by
# random numbers, so the "actual" ratings used here include those random fills.
# The resulting figure only illustrates how RMSE is computed. A real evaluation
# would hold out known ratings as a test set and score predictions against
# those held-out values.

user_id_eval = 0  # 0-based user position (user_1)
predicted_ratings_eval = []
actual_ratings_eval = []

for movie_id in range(movie_user_df.shape[0]):
    # Value currently stored in the (filled) matrix for this movie and user.
    actual_rating = movie_user_df.iloc[movie_id, user_id_eval]

    # User-based prediction (`kind` is left at its default).
    predicted_rating = predict_rating(user_id_eval, movie_id, movie_user_df,
                                     user_neighbors, movie_neighbors,
                                     user_similarity, movie_similarity,
                                     )

    predicted_ratings_eval.append(predicted_rating)
    actual_ratings_eval.append(actual_rating)

# Root of the mean squared difference between stored and predicted ratings.
rmse = np.sqrt(mean_squared_error(actual_ratings_eval, predicted_ratings_eval))

print(f"\nRMSE for user {user_id_eval + 1}: {rmse}")


RMSE for user 1: 1.1826559198474407


In [None]:
def calculate_precision_recall(recommendations, actual_ratings, relevance_threshold=4):
    """Compute per-user precision and recall of a set of recommendations.

    Args:
        recommendations: Dict mapping 'user_<n>' (1-based) to a list of
            (movie_position, predicted_rating) tuples, where movie_position is
            the 0-based row position in `actual_ratings`.
        actual_ratings: DataFrame with movies as rows and users as columns.
        relevance_threshold: A movie is relevant for a user when the user's
            actual rating is at least this value.

    Returns:
        (precision_scores, recall_scores): two dicts keyed like
        `recommendations`. A score is 0 when its denominator is zero.
    """
    precision_scores = {}
    recall_scores = {}

    for user_id_str, recs in recommendations.items():
        # 'user_1' -> column position 0
        user_id = int(user_id_str.split('_')[1]) - 1

        user_actual_ratings = actual_ratings.iloc[:, user_id].to_numpy()

        # Recommendations carry integer row positions, so relevance must be
        # expressed as positions too. Using the index labels (e.g. 'movie_1')
        # would never intersect with them and would report zero hits.
        relevant_positions = {int(pos) for pos in np.flatnonzero(user_actual_ratings >= relevance_threshold)}
        recommended_positions = {int(movie_id) for movie_id, _ in recs}

        hits = len(recommended_positions & relevant_positions)

        # Precision: hits / number of recommendations made.
        total_recommendations = len(recs)
        precision = hits / total_recommendations if total_recommendations > 0 else 0

        # Recall: hits / number of relevant movies.
        total_relevant_movies = len(relevant_positions)
        recall = hits / total_relevant_movies if total_relevant_movies > 0 else 0

        precision_scores[user_id_str] = precision
        recall_scores[user_id_str] = recall

    return precision_scores, recall_scores

# Score the earlier user-based top-N lists (`all_user_recommendations`) against
# the ratings stored in `movie_user_df`, using the default relevance threshold.
# Empty recommendation lists score 0 for precision.

precision, recall = calculate_precision_recall(all_user_recommendations, movie_user_df)

print("\nPrecision for each user:")
for user, score in precision.items():
    print(f"{user}: {score:.2f}")

print("\nRecall for each user:")
for user, score in recall.items():
    print(f"{user}: {score:.2f}")


Precision for each user:
user_1: 0.00
user_2: 0.00
user_3: 0.00
user_4: 0.00

Recall for each user:
user_1: 0.00
user_2: 0.00
user_3: 0.00
user_4: 0.00


In [None]:
def calculate_similarity(ratings_matrix, metric='cosine'):
    """Compute the user-user similarity matrix over co-rated items.

    Args:
        ratings_matrix: DataFrame with items as rows and users as columns.
            Only entries greater than 0 count as ratings.
        metric: 'cosine' (raw ratings), 'pearson' (each user's ratings centred
            on that user's mean) or 'adjusted_cosine' (each item's ratings
            centred on that item's mean).

    Returns:
        A (num_users, num_users) NumPy array with 1.0 on the diagonal. A pair
        of users scores 0.0 when they share no rated item or when either
        vector has zero norm over the shared items.

    Raises:
        ValueError: If `metric` is not one of the supported names.
    """
    supported_metrics = ('cosine', 'pearson', 'adjusted_cosine')
    if metric not in supported_metrics:
        # Previously an unknown name silently behaved like 'cosine'.
        raise ValueError(f"Invalid 'metric'. Use one of {supported_metrics}.")

    num_users = ratings_matrix.shape[1]

    # One row per user; the "has rated" mask is always taken from the raw
    # ratings, never from centred values.
    raw_by_user = ratings_matrix.T.values
    rated_by_user = raw_by_user > 0

    if metric == 'adjusted_cosine':
        item_major = ratings_matrix.values.astype(float)  # float copy, items x users
        for item_row in item_major:
            rated = item_row > 0
            if rated.any():
                item_row[rated] = item_row[rated] - item_row[rated].mean()
        ratings_values = item_major.T
    elif metric == 'pearson':
        ratings_values = np.zeros(raw_by_user.shape, dtype=float)
        for u in range(num_users):
            rated = rated_by_user[u]
            if rated.any():
                ratings_values[u, rated] = raw_by_user[u, rated] - raw_by_user[u, rated].mean()
    else:
        ratings_values = raw_by_user

    similarity_matrix = np.zeros((num_users, num_users))
    for i in range(num_users):
        similarity_matrix[i, i] = 1.0
        for j in range(i + 1, num_users):
            co_rated_mask = rated_by_user[i] & rated_by_user[j]

            if not co_rated_mask.any():
                sim = 0.0
            else:
                vec_i = ratings_values[i, co_rated_mask]
                vec_j = ratings_values[j, co_rated_mask]
                norm_i = np.linalg.norm(vec_i)
                norm_j = np.linalg.norm(vec_j)
                sim = np.dot(vec_i, vec_j) / (norm_i * norm_j) if norm_i > 0 and norm_j > 0 else 0.0

            # Similarity is symmetric; fill both triangles.
            similarity_matrix[i, j] = sim
            similarity_matrix[j, i] = sim

    return similarity_matrix

In [None]:
# Hand-written ratings table: one row per movie, one column per user,
# 0 marks "not rated". NOTE: this rebinds the `data` and `df` names used by
# earlier cells.
data = {
    'Movie/User': ['Movie 1', 'Movie 2', 'Movie 3', 'Movie 4', 'Movie 5', 'Movie 6', 'Movie 7'],
    'User 1': [5, 4, 0, 0, 2, 0, 0],
    'User 2': [4, 5, 1, 0, 1, 4, 5],
    'User 3': [0, 0, 5, 4, 0, 0, 1],
    'User 4': [0, 2, 4, 5, 1, 2, 0],
    'User 5': [2, 3, 0, 1, 5, 0, 4],
    'User 6': [1, 2, 4, 0, 5, 0, 5],
}
df = pd.DataFrame(data)
# Numeric part only: rows are movies (default integer index), columns are users.
rating_matrix = df.drop('Movie/User', axis=1)

k_full_data = 3
user_similarity_full = calculate_similarity(rating_matrix, metric='cosine')
user_neighbors_full = get_neighbors(user_similarity_full, k_full_data)

print("--- Top Recommendations for All Users (on Full Dataset) ---")
# Only the user-based path is used, so the movie-based arguments are empty placeholders.
recommendations = recommend_for_all_users(rating_matrix, user_neighbors_full, [], user_similarity_full, [], N=3)

for user_id_str, recs in recommendations.items():
    user_id = int(user_id_str.split('_')[1]) - 1 # Extract the user index from the string
    # Look up movie names by row position in the original table.
    rec_movie_names = [(df["Movie/User"].iloc[movie_id], f"{rating:.2f}") for movie_id, rating in recs]
    print(f"\nTop recommendations for User {user_id + 1}: {rec_movie_names}")
print("-" * 60)

print("\n--- Specific Prediction Request (on Full Dataset) ---")
# 0-based positions: user 0 is 'User 1', movie 3 is 'Movie 4'.
user_to_predict = 0
movie_to_predict = 3

predicted_value = predict_rating(user_to_predict, movie_to_predict, rating_matrix, user_neighbors_full, [], user_similarity_full, [])
user_name = f"User {user_to_predict + 1}"
movie_name = df['Movie/User'].iloc[movie_to_predict]
print(f"The predicted rating for '{movie_name}' by '{user_name}' is: {predicted_value:.4f}")
print("-" * 60)

--- Top Recommendations for All Users (on Full Dataset) ---

Top recommendations for User 1: [('Movie 7', '4.56'), ('Movie 4', '3.25'), ('Movie 6', '2.98')]

Top recommendations for User 2: [('Movie 4', '2.87')]

Top recommendations for User 3: [('Movie 5', '3.24'), ('Movie 2', '2.21'), ('Movie 6', '2.00')]

Top recommendations for User 4: [('Movie 1', '3.21'), ('Movie 7', '2.82')]

Top recommendations for User 5: [('Movie 6', '4.00'), ('Movie 3', '2.65')]

Top recommendations for User 6: [('Movie 4', '3.18'), ('Movie 6', '2.00')]
------------------------------------------------------------

--- Specific Prediction Request (on Full Dataset) ---
The predicted rating for 'Movie 4' by 'User 1' is: 3.2550
------------------------------------------------------------


In [None]:
from sklearn.model_selection import train_test_split

def train_test_split_custom(ratings_df, test_size=0.25, random_state=42):
    """Hold out a random subset of the known ratings as test data.

    Args:
        ratings_df: DataFrame with movies as rows and users as columns, where
            entries greater than 0 are known ratings. Any row or column labels
            are accepted because cells are addressed by position.
        test_size: Passed to `train_test_split`; share of known ratings to hold out.
        random_state: Passed to `train_test_split` for a reproducible split.

    Returns:
        (train_matrix, test_data): `train_matrix` is a copy of `ratings_df`
        with the held-out cells set to 0; `test_data` is a list of
        (user_position, movie_position, rating) tuples with 0-based positions.
        The input frame itself is not modified.
    """
    train_matrix = ratings_df.copy()
    test_data = []

    # Positions of all known ratings in row-major order (movie by movie).
    # Addressing cells by position avoids parsing 'User N' labels and assuming
    # an integer row index, which breaks for other labelling schemes.
    rating_values = train_matrix.to_numpy()
    movie_positions, user_positions = np.nonzero(rating_values > 0)

    ratings_to_split = pd.DataFrame({
        'movie_id': movie_positions,
        'user_id': user_positions,
        'rating': rating_values[movie_positions, user_positions],
    })

    if ratings_to_split.empty:
        return train_matrix, test_data

    _, test_df = train_test_split(ratings_to_split, test_size=test_size, random_state=random_state)

    for movie_idx, user_idx, rating in zip(test_df['movie_id'], test_df['user_id'], test_df['rating']):
        movie_idx, user_idx = int(movie_idx), int(user_idx)
        # Hide the held-out rating from the training matrix.
        train_matrix.iloc[movie_idx, user_idx] = 0
        test_data.append((user_idx, movie_idx, rating))

    return train_matrix, test_data

In [None]:
def calculate_rmse(test_data, train_ratings, user_neighbors, user_similarity):
    """Root mean squared error of user-based predictions on held-out ratings.

    Args:
        test_data: Iterable of (user_id, movie_id, actual_rating) tuples with
            0-based positions into `train_ratings`.
        train_ratings: Ratings DataFrame (movies x users) used for prediction.
        user_neighbors: Per-user arrays of neighbour indices.
        user_similarity: 2-D array of user-user similarities.

    Returns:
        The RMSE over all test points with a non-NaN prediction, or 0.0 when
        there is no such point.
    """
    squared_errors = []
    for user_id, movie_id, actual_rating in test_data:
        # predict_rating also expects the movie-based arguments; pass empty
        # placeholders for them, as the other user-based calls do.
        predicted = predict_rating(user_id, movie_id, train_ratings,
                                   user_neighbors, [], user_similarity, [],
                                   kind="user")
        # Skip predictions that came back as NaN.
        if not np.isnan(predicted):
            squared_errors.append((predicted - actual_rating) ** 2)

    if not squared_errors:
        return 0.0

    return np.sqrt(np.mean(squared_errors))

In [None]:
def calculate_precision_recall(test_data, train_ratings, user_neighbors, user_similarity, N, relevance_threshold):
    """Average precision and recall of user-based top-N recommendations.

    Args:
        test_data: Iterable of (user_id, movie_id, rating) held-out tuples with
            0-based positions into `train_ratings`.
        train_ratings: Ratings DataFrame (movies x users); entries equal to 0
            are the candidates for recommendation.
        user_neighbors: Per-user arrays of neighbour indices.
        user_similarity: 2-D array of user-user similarities.
        N: Number of recommendations per user.
        relevance_threshold: A held-out rating at or above this value makes
            the movie relevant.

    Returns:
        (mean_precision, mean_recall) over the users that have at least one
        relevant held-out movie; (0.0, 0.0) when there is no such user.
    """
    # Group the held-out ratings by user.
    test_data_by_user = {}
    for user_id, movie_id, rating in test_data:
        test_data_by_user.setdefault(user_id, []).append((movie_id, rating))

    user_precisions, user_recalls = [], []
    num_users = train_ratings.shape[1]

    for user_id in range(num_users):
        if user_id not in test_data_by_user:
            continue

        relevant_items = {movie_id for movie_id, rating in test_data_by_user[user_id] if rating >= relevance_threshold}
        if not relevant_items:
            continue

        unrated_movies = np.where(train_ratings.iloc[:, user_id].values == 0)[0]
        # predict_rating also expects the movie-based arguments; pass empty
        # placeholders for them, as the other user-based calls do.
        preds = [
            (movie_id, predict_rating(user_id, movie_id, train_ratings,
                                      user_neighbors, [], user_similarity, [],
                                      kind="user"))
            for movie_id in unrated_movies
        ]

        # NaN predictions cannot be ranked; drop them before sorting.
        valid_preds = [p for p in preds if not np.isnan(p[1])]
        top_n_recs = {movie_id for movie_id, _ in sorted(valid_preds, key=lambda x: x[1], reverse=True)[:N]}

        true_positives = len(top_n_recs.intersection(relevant_items))

        precision = true_positives / len(top_n_recs) if top_n_recs else 0.0
        recall = true_positives / len(relevant_items) if relevant_items else 0.0

        user_precisions.append(precision)
        user_recalls.append(recall)

    return np.mean(user_precisions) if user_precisions else 0.0, np.mean(user_recalls) if user_recalls else 0.0