# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Read the ratings CSV into the working DataFrame. Later steps read the
# 'user_id', 'item_id' and 'rating' columns from it.
file_path = '/mnt/data/random_user_movie_ratings.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows of the frame.
print("Dataset Head:", df.head(), sep="\n")

# 1. Adjust ratings to 1-5 scale
# NOTE: clip() caps values below 1 at 1 and values above 5 at 5; it does not
# linearly rescale the ratings.
print("\nRescaling ratings to 1-5 scale...")
df['rating'] = df['rating'].clip(1, 5)

# 2. Total number of users and items (distinct ids in each column)
tnu, tni = (df[column].nunique() for column in ('user_id', 'item_id'))
print(f"\nTotal number of users (tnu): {tnu}")
print(f"Total number of items (tni): {tni}")

# 3. Count ratings for each product
# value_counts() returns one row count per item id, most frequent first.
item_rating_counts = df['item_id'].value_counts()
print("\nNumber of ratings per item:", item_rating_counts, sep="\n")

# 4. Select 3 active users (U1, U2, U3)
# The users are taken from the top of value_counts(), i.e. the three user ids
# that appear in the most rows; missing ratings are not inspected here.
print("\nSelecting active users (U1, U2, U3)...")
ratings_per_user = df['user_id'].value_counts()
active_users = {
    label: ratings_per_user.index[position]
    for label, position in (('U1', 0), ('U2', 1), ('U3', 2))
}

# 5. Select 2 target items (I1, I2)
print("\nSelecting target items (I1, I2)...")
# Ascending value_counts() lists the item ids with the fewest rows first, so
# I1 and I2 are the two least-rated items. No missing-rating percentage is
# computed or checked at this point.
item_missing = df['item_id'].value_counts(ascending=True)
target_items = {
    'I1': item_missing.index[0],
    'I2': item_missing.index[1],
}

print(f"Active Users: {active_users}")
print(f"Target Items: {target_items}")

# 6. Count co-rated items and common users
print("\nCounting co-rated items and common users...")
# Users as rows, items as columns; pivot_table averages duplicate
# (user, item) ratings and leaves NaN where a user has not rated an item.
user_item_matrix = df.pivot_table(index='user_id', columns='item_id', values='rating')

# Boolean mask of "user has rated item", computed once and reused below.
rated_mask = user_item_matrix.notna()

# For every user: how many items they have rated that U1 has also rated.
# (U1's own row is included and equals U1's number of rated items.)
co_rated_items = (rated_mask & rated_mask.loc[active_users['U1']]).sum(axis=1)

# For every active user: how many OTHER users share at least one co-rated
# item with them. The previous version ignored the loop variable and stored
# the same number (users with any rating at all) for every active user.
common_users = {}
for user in active_users.values():
    overlap = (rated_mask & rated_mask.loc[user]).sum(axis=1)
    common_users[user] = (overlap.drop(user) >= 1).sum()

# 7. Create a 2D array for descending No_common_users and No_coRated_items
print("\nCreating a 2D array for No_common_users and No_coRated_items...")
# Per-user count of items co-rated with U1, largest first.
no_common_users = co_rated_items.sort_values(ascending=False)
# Single number: how many items U1 has rated.
no_cRated_items = user_item_matrix.loc[active_users['U1']].notna().sum()

# np.column_stack needs columns of equal length; stacking the length-n series
# with the bare scalar raises ValueError for n > 1. Repeat the scalar once per
# row so the result is an (n, 2) array.
# NOTE(review): the second column is therefore constant; confirm whether a
# per-user value was intended here.
array_2D = np.column_stack(
    (
        no_common_users.to_numpy(),
        np.full(len(no_common_users), no_cRated_items),
    )
)
print(array_2D)

# 8. Visualization: Ratings per item
# One point per item id (x axis, ascending id order) against its rating count.
print("\nDrawing curve for quantity of ratings per item...")
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(item_rating_counts.sort_index(), marker='o')
ax.set_xlabel('Item ID')
ax.set_ylabel('Number of Ratings')
ax.set_title('Quantity of Ratings for Each Item')
ax.grid()
plt.show()

# 9. Threshold Beta for active users
# For each active user, Beta is the number of rows in the user-item matrix
# (the active user's own row included) whose count of items co-rated with that
# user is at least 30% of the total item count.
print("\nDetermining threshold Beta for active users...")
has_rating = user_item_matrix.notna()
min_shared_items = 0.3 * tni
thresholds = {
    user: ((has_rating & has_rating.loc[user]).sum(axis=1) >= min_shared_items).sum()
    for user in active_users.values()
}

print(f"Thresholds (Beta) for Active Users: {thresholds}")

# 10. Save results
# Gather the summary values; with orient='index' the dictionary keys become
# the row labels of the frame that is written to disk.
summary_rows = (
    ('Total Users', tnu),
    ('Total Items', tni),
    ('Active Users', active_users),
    ('Target Items', target_items),
    ('Thresholds', thresholds),
)
results = dict(summary_rows)
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df.to_csv('results_summary.csv')  # the index is written by default
print("\nResults saved to 'results_summary.csv'")


# In [None]:
# Import required libraries
from scipy.spatial.distance import pdist, squareform
from scipy.stats import pearsonr

# Helper function for mean-centering
def mean_centering(matrix):
    """Return ``matrix`` with each row's mean subtracted from that row.

    Row means are computed with pandas' default NaN handling (missing entries
    are skipped), and missing entries remain NaN in the result.
    """
    row_means = matrix.mean(axis=1)
    return matrix.sub(row_means, axis='index')

# Item-based CF works on a copy of the ratings frame, pivoted so that items
# are rows and users are columns (NaN where a user has not rated an item).
print("\nLoading dataset for Item-Based Collaborative Filtering...")
item_based_df = df.copy()
item_user_matrix = pd.pivot_table(
    item_based_df,
    index='item_id',
    columns='user_id',
    values='rating',
)

# ------------------- CASE STUDY 2.2: ITEM-BASED CF USING COSINE SIMILARITY -------------------

print("\n--- Case Study 2.2: Cosine Similarity with Mean-Centering ---")

# 2.2.1 Apply item-based CF using Cosine similarity with mean-centering
# Each item row has its own mean rating subtracted before comparison.
print("Applying mean-centering and computing Cosine similarity...")
mean_centered_item_matrix = mean_centering(item_user_matrix)

# Compute Cosine similarity between items
# Missing ratings are filled with 0 after centering, so they contribute
# nothing to the dot products. The result is labelled by item id on both axes.
item_labels = mean_centered_item_matrix.index
cosine_sim_matrix = cosine_similarity(mean_centered_item_matrix.fillna(0))
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=item_labels, columns=item_labels)

print("\nCosine Similarity Matrix (Sample):", cosine_sim_df.head(), sep="\n")

# 2.2.2 Determine top 20% closest items for each target item
# Maps each target item id to a Series of its most similar items
# (index = neighbour item id, value = cosine similarity, best first).
top_20_percent_items = {}
for item in target_items.values():
    # Remove the target by label rather than slicing off the first sorted
    # entry: the target is not guaranteed to sort first (ties at the top, or a
    # self-similarity of 0 when the item's centred ratings are all zero).
    closest_items = cosine_sim_df[item].drop(item).sort_values(ascending=False)
    # Keep the top 20% of the remaining items (rounded down).
    top_20_percent_items[item] = closest_items.head(int(0.2 * len(closest_items)))

print("\nTop 20% Closest Items:")
for item, closest in top_20_percent_items.items():
    print(f"Target Item {item}:\n{closest}")

# 2.2.3 Predict missing ratings for each target item
# For every target item, the prediction for each user is the plain
# (unweighted) mean of that user's ratings on the selected neighbour items;
# ratings the user has not given are skipped by mean().
predicted_ratings = {
    item: item_user_matrix.loc[top_20_percent_items[item].index].mean(axis=0)
    for item in target_items.values()
}

print("\nPredicted Ratings for Target Items (Case 2.2):", predicted_ratings, sep="\n")

# 2.2.4 Compute Discount Factor (DF) and Discounted Similarity (DS)
print("\nComputing Discount Factor and Discounted Similarity...")
# The factor is the logistic function of U1's Beta value computed earlier.
# It does not depend on the item, so it is computed once, and it is stored
# under its own name so the ratings DataFrame `df` is no longer overwritten
# by a float.
# NOTE(review): because this is one positive scalar applied to every
# similarity, the discounted ranking is the same as the plain cosine ranking;
# confirm whether a per-item-pair discount was intended.
discount_factor = 1 / (1 + np.exp(-thresholds[active_users['U1']]))

# Target item id -> that item's cosine-similarity column scaled by the factor.
discounted_similarity = {
    item: cosine_sim_df[item] * discount_factor
    for item in target_items.values()
}

# 2.2.5 Determine top 20% closest items using discounted similarity
# Maps each target item id to a Series of its most similar items
# (index = neighbour item id, value = discounted similarity, best first).
top_20_percent_ds_items = {}
for item in target_items.values():
    # Remove the target by label rather than slicing off the first sorted
    # entry: the target is not guaranteed to sort first (ties at the top, or a
    # self-similarity of 0).
    closest_ds_items = discounted_similarity[item].drop(item).sort_values(ascending=False)
    # Keep the top 20% of the remaining items (rounded down).
    top_20_percent_ds_items[item] = closest_ds_items.head(int(0.2 * len(closest_ds_items)))

print("\nTop 20% Closest Items using Discounted Similarity:")
for item, closest in top_20_percent_ds_items.items():
    print(f"Target Item {item}:\n{closest}")

# 2.2.6 Predict missing ratings using discounted similarity
# Per-user unweighted mean of the ratings on the neighbour items chosen from
# the discounted-similarity ranking; unrated entries are skipped by mean().
predicted_ds_ratings = {
    item: item_user_matrix.loc[top_20_percent_ds_items[item].index].mean(axis=0)
    for item in target_items.values()
}

print("\nPredicted Ratings using Discounted Similarity (Case 2.2):", predicted_ds_ratings, sep="\n")

# ------------------- CASE STUDY 2.3: ITEM-BASED CF USING PCC -------------------

print("\n--- Case Study 2.3: Pearson Correlation Coefficient (PCC) ---")

# 2.3.1 Apply item-based CF using PCC
def compute_pcc(matrix):
    """Return the pairwise Pearson correlation between the rows of ``matrix``.

    Args:
        matrix: DataFrame whose rows are the entities to compare (items, in
            this script) and whose columns are observations; NaN marks a
            missing observation. Row labels are expected to be unique.

    Returns:
        A square float DataFrame labelled by ``matrix.index`` on both axes.
        Each off-diagonal entry is the Pearson correlation computed over the
        columns where both rows have a value. It is 0.0 when fewer than two
        such columns exist or when the correlation is undefined (for example
        a constant row). The diagonal is 1.0, so a row is always at least as
        similar to itself as to any other row.
    """
    items = matrix.index
    pcc_matrix = pd.DataFrame(0.0, index=items, columns=items, dtype=float)
    labels = list(items)
    for pos, i in enumerate(labels):
        pcc_matrix.loc[i, i] = 1.0
        # Pearson correlation is symmetric: visit each unordered pair once
        # and mirror the value instead of computing (i, j) and (j, i).
        for j in labels[pos + 1:]:
            valid_ratings = matrix.loc[[i, j]].dropna(axis=1)
            if valid_ratings.shape[1] > 1:
                corr = pearsonr(
                    valid_ratings.loc[i].to_numpy(dtype=float),
                    valid_ratings.loc[j].to_numpy(dtype=float),
                )[0]
                # pearsonr yields NaN for constant input; keep 0.0 then.
                if pd.notna(corr):
                    pcc_matrix.loc[i, j] = corr
                    pcc_matrix.loc[j, i] = corr
    return pcc_matrix

print("Computing Pearson Correlation Coefficient (PCC)...")
# Item-by-item similarity table, labelled by item id on both axes.
pcc_sim_matrix = compute_pcc(item_user_matrix)

print("\nPCC Similarity Matrix (Sample):")
print(pcc_sim_matrix.head())

# 2.3.2 Determine top 20% closest items for each target item
# Maps each target item id to a Series of its most similar items
# (index = neighbour item id, value = PCC, best first).
top_20_percent_pcc_items = {}
for item in target_items.values():
    # Remove the target by label rather than slicing off the first sorted
    # entry: the target is not guaranteed to sort first, in which case the
    # slice would discard the best neighbour and keep the target itself.
    closest_items = pcc_sim_matrix[item].drop(item).sort_values(ascending=False)
    # Keep the top 20% of the remaining items (rounded down).
    top_20_percent_pcc_items[item] = closest_items.head(int(0.2 * len(closest_items)))

print("\nTop 20% Closest Items (PCC):")
for item, closest in top_20_percent_pcc_items.items():
    print(f"Target Item {item}:\n{closest}")

# 2.3.3 Predict missing ratings for target items using PCC
# Per-user unweighted mean of the ratings on the neighbour items chosen from
# the PCC ranking; unrated entries are skipped by mean().
predicted_pcc_ratings = {
    item: item_user_matrix.loc[top_20_percent_pcc_items[item].index].mean(axis=0)
    for item in target_items.values()
}

print("\nPredicted Ratings for Target Items (PCC):", predicted_pcc_ratings, sep="\n")

# ------------------ COMPARISON OF RESULTS ------------------

# Compare top 20% closest items
# For each target item, print the neighbour lists chosen by the three
# similarity measures one after another.
print("\nComparison of Top 20% Closest Items:")
for item in target_items.values():
    print(
        f"\nTarget Item {item}:",
        f"Cosine Similarity:\n{top_20_percent_items[item]}",
        f"Discounted Cosine Similarity:\n{top_20_percent_ds_items[item]}",
        f"PCC Similarity:\n{top_20_percent_pcc_items[item]}",
        sep="\n",
    )

# Compare predictions
# Same layout for the predicted ratings produced from each neighbour list.
print("\nComparison of Predicted Ratings:")
for item in target_items.values():
    print(
        f"\nTarget Item {item}:",
        f"Cosine Similarity Prediction:\n{predicted_ratings[item]}",
        f"Discounted Cosine Similarity Prediction:\n{predicted_ds_ratings[item]}",
        f"PCC Prediction:\n{predicted_pcc_ratings[item]}",
        sep="\n",
    )
