## Clustering

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read both review tables from disk
df_movie = pd.read_csv('/content/movie_df_comm.csv')
df_book = pd.read_csv('/content/book_df_comm.csv')

# Stack the book rows on top of the movie rows under a fresh row index
df = pd.concat([df_book, df_movie], ignore_index=True)

# Keep one row per (reviewer, product) pair, then discard rows that lack
# a reviewer, a product, or a rating
df = (
    df.drop_duplicates(subset=['reviewerID', 'asin'])
    .dropna(subset=['reviewerID', 'asin', 'overall'])
)

# Encode reviewers and products as integer category codes
df = df.assign(
    user_id=df['reviewerID'].astype('category').cat.codes,
    item_id=df['asin'].astype('category').cat.codes,
)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits so later steps can reload them
train_df.to_pickle("train_df.pkl")
test_df.to_pickle("test_df.pkl")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np

# Load data
df_movie = pd.read_csv('/content/movie_df_comm.csv')
df_book = pd.read_csv('/content/book_df_comm.csv')

# Concatenate dataframes
df = pd.concat([df_book, df_movie], ignore_index=True)

# Drop duplicate (reviewer, product) pairs and rows missing a key or a rating.
# The pivot below needs every (user_id, item_id) pair to be unique.
df = df.drop_duplicates(subset=['reviewerID', 'asin'])
df = df.dropna(subset=['reviewerID', 'asin', 'overall'])

# Generate integer user and item IDs from the category codes
df['user_id'] = df['reviewerID'].astype('category').cat.codes
df['item_id'] = df['asin'].astype('category').cat.codes

# Split data into train and test sets. Each part is copied so that the
# cluster columns added further down are written to an independent frame
# rather than to a slice of `df` (avoids SettingWithCopyWarning).
train_df, test_df = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

# Save the dataframes for later use.
# NOTE: these pickles are written before the cluster columns are added.
train_df.to_pickle("train_df.pkl")
test_df.to_pickle("test_df.pkl")

# Create user-item interaction matrix (rows: users, columns: items);
# pairs with no rating are filled with 0
interaction_matrix = train_df.pivot(index='user_id', columns='item_id', values='overall').fillna(0)
n_users, n_items = interaction_matrix.shape

# Cluster users based on their interactions
num_clusters = 50  # Adjust based on your dataset size
# KMeans rejects n_clusters > n_samples, so cap it at the number of users
kmeans = KMeans(n_clusters=min(num_clusters, n_users), random_state=42)
user_clusters = kmeans.fit_predict(interaction_matrix)

# Create a mapping from user_id to cluster
user_cluster_mapping = dict(zip(interaction_matrix.index, user_clusters))

# Map user clusters to train_df
train_df['user_cluster'] = train_df['user_id'].map(user_cluster_mapping)

# Cluster items based on their interactions (transpose the interaction matrix).
# A fresh estimator is fitted here, so from this point on `kmeans` holds the
# item model. Item cluster labels are unrelated to the user cluster labels.
kmeans = KMeans(n_clusters=min(num_clusters, n_items), random_state=42)
item_clusters = kmeans.fit_predict(interaction_matrix.T)

# Create a mapping from item_id to cluster
item_cluster_mapping = dict(zip(interaction_matrix.columns, item_clusters))

# Map item clusters to train_df
train_df['item_cluster'] = train_df['item_id'].map(item_cluster_mapping)

# Function to recommend items
def recommend_items(user_id, train_df, top_n=10):
    """Recommend up to ``top_n`` items liked by the user's cluster peers.

    Candidates are the items rated by users that share ``user_id``'s
    ``user_cluster``, minus the items ``user_id`` has already rated in
    ``train_df``. They are ranked by their mean ``overall`` rating among
    those peers, highest first.

    The ``item_cluster`` column is deliberately not compared with the user's
    cluster: user and item clusters come from separate clusterings, so equal
    label numbers do not denote related groups.

    Args:
        user_id: Encoded user identifier, as stored in ``train_df['user_id']``.
        train_df: Training interactions with the columns ``user_id``,
            ``item_id``, ``overall`` and ``user_cluster``.
        top_n: Maximum number of items to return.

    Returns:
        An empty list when the user has no rows in ``train_df``; otherwise a
        pandas Index of at most ``top_n`` item ids (possibly empty when the
        peers rated nothing the user has not already rated).
    """
    user_rows = train_df[train_df['user_id'] == user_id]

    # Unknown user: nothing to base a recommendation on
    if user_rows.empty:
        return []

    # Every row of a user carries the same cluster label; take the first
    user_cluster = user_rows['user_cluster'].iloc[0]

    # Ratings given by all users in the same user cluster
    peer_rows = train_df[train_df['user_cluster'] == user_cluster]

    # Do not recommend what the user has already rated
    unseen_rows = peer_rows[~peer_rows['item_id'].isin(user_rows['item_id'])]

    # Rank the remaining items by their average rating within the cluster;
    # a stable sort keeps the order of tied items deterministic
    item_ratings = unseen_rows.groupby('item_id')['overall'].mean()
    top_items = item_ratings.sort_values(ascending=False, kind='mergesort').index[:top_n]

    return top_items

# Function to evaluate recommendations
def evaluate_recommendations(test_df, train_df, top_n=10):
    """Score ``recommend_items`` against held-out interactions.

    For every user in ``test_df`` the recommended items are compared with the
    items that user has in ``test_df``; four top-N metrics are accumulated
    and averaged.

    Users for whom no recommendation is produced contribute zero to every
    metric but still count in the denominator.

    Args:
        test_df: Held-out interactions with the columns ``user_id`` and
            ``item_id``.
        train_df: Training interactions, passed through to
            ``recommend_items``.
        top_n: Length of the recommendation list being evaluated.

    Returns:
        A tuple ``(hit_rate, ndcg, precision, recall)`` of floats, each
        averaged over the distinct users in ``test_df``. All zeros when
        ``test_df`` contains no users.
    """
    # One pass over test_df instead of a boolean-mask scan per user
    truth_by_user = test_df.groupby('user_id')['item_id'].unique()
    total_users = len(truth_by_user)
    if total_users == 0:
        return 0.0, 0.0, 0.0, 0.0

    # Rank discounts 1 / log2(rank + 1) for ranks 1..top_n
    discounts = 1.0 / np.log2(np.arange(2, top_n + 2))

    hit_rate = 0
    ndcg = 0.0
    precision = 0.0
    recall = 0.0

    for user_id, ground_truth in truth_by_user.items():
        # Generate recommendations for the user
        recommended_items = np.asarray(recommend_items(user_id, train_df, top_n))[:top_n]

        # Skip evaluation if no recommendations are generated
        if recommended_items.size == 0:
            continue

        relevance = np.isin(recommended_items, ground_truth)
        true_positives = len(np.intersect1d(recommended_items, ground_truth))

        if true_positives > 0:
            # Hit Rate: at least one held-out item was recommended
            hit_rate += 1

            # NDCG with binary relevance: discounted gain of the hits at
            # their actual ranks, normalised by the best achievable ordering
            dcg = discounts[:recommended_items.size][relevance].sum()
            idcg = discounts[:min(len(ground_truth), top_n)].sum()
            ndcg += dcg / idcg

        # Precision@N and Recall@N
        precision += true_positives / top_n
        recall += true_positives / len(ground_truth)

    # Average metrics across all users
    return (
        hit_rate / total_users,
        ndcg / total_users,
        precision / total_users,
        recall / total_users,
    )

# Score the recommender on the held-out split and report each metric,
# one per line, rounded to four decimals
hit_rate, ndcg, precision, recall = evaluate_recommendations(test_df, train_df, top_n=10)
print(
    f"Hit Rate: {hit_rate:.4f}",
    f"NDCG: {ndcg:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)

Hit Rate: 0.0009
NDCG: 0.0007
Precision: 0.0001
Recall: 0.0009
