In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the data file
data = pd.read_csv('/content/random_user_movie_ratings.csv')

# Extract the ratings matrix (user ratings only) and label its rows with the
# user IDs held in the first column. Leaving the default 0..n-1 RangeIndex
# would make every later `ratings_matrix.loc[<user ids>]` lookup select the
# wrong rows (or raise KeyError) unless the IDs happen to be exactly 0..n-1.
ratings_matrix = data.set_index(data.columns[0])

# Compute cosine similarity between users
cosine_sim = cosine_similarity(ratings_matrix.fillna(0))  # Handle NaN with zeros for similarity computation
cosine_sim_df = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

# Define the top percentage of users to consider for each user
# NOTE: this evaluates to 0 when there are fewer than 5 users, which leaves
# every neighbour list empty.
top_percent = 0.2
num_closest_users = int(len(cosine_sim_df) * top_percent)

# Find the closest users for each user
top_users_dict = {}
for user in cosine_sim_df.index:
    # Remove the user's own entry by label instead of assuming it sorts first:
    # ties, floating-point rounding, or an all-zero rating row (self-similarity
    # of 0) can all push another user ahead of it.
    similar_users = cosine_sim_df[user].drop(user).sort_values(ascending=False)
    top_users = similar_users.iloc[:num_closest_users].index.tolist()
    top_users_dict[user] = top_users


# Function to calculate predictions based on top user ratings
def calculate_prediction(user, top_users, ratings_matrix):
    """Predict ratings as the plain average over a set of neighbour rows.

    Args:
        user: Identifier of the user being predicted for. It is accepted for
            interface symmetry but is not used in the computation.
        top_users: Row labels of ``ratings_matrix`` to average over.
        ratings_matrix: DataFrame whose rows are selected by label via ``.loc``.

    Returns:
        A Series with one entry per column of ``ratings_matrix``: the
        column-wise mean of the selected rows (missing values are skipped,
        which is the pandas default for ``mean``).
    """
    neighbour_rows = ratings_matrix.loc[top_users]
    return neighbour_rows.mean(axis=0)


# Baseline predictions: for every user, the mean rating given by that user's
# nearest neighbours under plain cosine similarity
predictions = {
    user: calculate_prediction(user, top_users, ratings_matrix)
    for user, top_users in top_users_dict.items()
}


# Discounted similarity adjustment based on co-rated items
def calculate_discounted_similarity(user, top_users, cosine_sim_df, ratings_matrix, threshold=0.3):
    """
    Apply a discount factor to user similarity scores based on co-rated items.

    A neighbour's similarity to ``user`` is kept as-is when the share of items
    rated by BOTH of them (relative to all item columns) is at least
    ``threshold``; otherwise it is halved.

    Args:
        user: Row label in ``ratings_matrix`` / ``cosine_sim_df`` of the target user.
        top_users: Labels of the neighbours whose similarity should be discounted.
        cosine_sim_df: Square DataFrame of similarities, looked up as
            ``cosine_sim_df.loc[user, top_users]``.
        ratings_matrix: DataFrame with one row per user and one column per
            item; NaN marks an item the user has not rated.
        threshold: Minimum co-rated share (0..1) needed to avoid the discount.

    Returns:
        Series of discounted similarities, in the order of ``top_users``.
    """
    # Boolean mask over the item columns: True where the target user has a rating.
    rated_by_user = ratings_matrix.loc[user].notnull().to_numpy()

    # Counting each neighbour's ratings only within those columns gives the
    # number of items rated by both parties. The previous version counted all
    # of the neighbour's ratings and never looked at `user`, so it was not a
    # co-rated count at all.
    neighbour_rated = ratings_matrix.loc[top_users].notnull()
    co_rated_items = neighbour_rated.loc[:, rated_by_user].sum(axis=1)
    total_items = len(ratings_matrix.columns)
    co_rated_percentage = co_rated_items / total_items

    # Apply a discount: 1 for co-rated >= threshold, 0.5 otherwise
    discounts = np.where(co_rated_percentage >= threshold, 1.0, 0.5)

    # `discounts` is positional and lines up with `top_users`, the same order
    # used for the similarity lookup.
    discounted_similarity = cosine_sim_df.loc[user, top_users] * discounts
    return discounted_similarity


# Generate predictions using discounted similarity
# Each neighbour's ratings are weighted by its discounted similarity to the
# target user. Previously this loop called calculate_prediction with the same
# neighbours as the baseline, so the "discounted" output was identical to the
# plain one and calculate_discounted_similarity was never used.
discounted_predictions = {}
for user, top_users in top_users_dict.items():
    valid_top_users = [u for u in top_users if u in cosine_sim_df.index]  # Ensure users exist in similarity matrix
    # Positional weights, one per neighbour, in the order of valid_top_users
    weights = calculate_discounted_similarity(user, valid_top_users, cosine_sim_df, ratings_matrix).to_numpy()
    neighbour_ratings = ratings_matrix.loc[valid_top_users]
    # Numerator: sum of weight * rating per item (missing ratings are skipped by sum)
    weighted_sum = neighbour_ratings.mul(weights, axis=0).sum(axis=0)
    # Denominator: total weight of the neighbours that actually rated the item,
    # so unrated items do not drag the average down. 0 / 0 yields NaN, i.e.
    # "no prediction", matching what a plain mean gives for an unrated item.
    weight_total = neighbour_ratings.notnull().mul(weights, axis=0).sum(axis=0)
    discounted_predictions[user] = weighted_sum / weight_total


# Create dataframes for predictions (one column per user, one row per item)
predictions_df = pd.DataFrame(predictions)
discounted_predictions_df = pd.DataFrame(discounted_predictions)

# Display sample predictions
print("Predictions using Cosine Similarity:")
print(predictions_df.head())
print("\nPredictions using Discounted Similarity:")
print(discounted_predictions_df.head())

# Save predictions to CSV files
# NOTE(review): index=False drops the row labels (the item names) from the
# files; kept as-is so the output format does not change - confirm it is intended.
predictions_df.to_csv('/content/predictions_cosine.csv', index=False)
discounted_predictions_df.to_csv('/content/predictions_discounted.csv', index=False)


FileNotFoundError: [Errno 2] No such file or directory: '/content/random_user_movie_ratings.csv'

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run draws the same synthetic ratings
np.random.seed(42)

# Dataset dimensions
num_users = 1000        # distinct user IDs available
num_items = 100         # distinct item IDs available
num_ratings = 100000    # rows drawn BEFORE duplicate (user, item) pairs are dropped

# Candidate IDs are contiguous and 1-based
user_ids = np.arange(1, num_users + 1)
item_ids = np.arange(1, num_items + 1)

# Draw the three columns independently. The draws happen in this order
# (users, items, ratings), which fixes how the seeded random stream is consumed.
ratings_data = dict(
    user_id=np.random.choice(user_ids, size=num_ratings),
    item_id=np.random.choice(item_ids, size=num_ratings),
    rating=np.random.randint(1, 6, size=num_ratings),  # upper bound is exclusive -> 1..5
)

# Build the frame, keep only the first rating for each (user, item) pair,
# then renumber the rows from 0
ratings_df = (
    pd.DataFrame(ratings_data)
    .drop_duplicates(subset=["user_id", "item_id"])
    .reset_index(drop=True)
)

# Show a few rows as a sanity check
print("Sample of the generated dataset:")
print(ratings_df.head())

# Persist the dataset next to the notebook
ratings_df.to_csv("generated_ratings_dataset.csv", index=False)

print("\nDataset generated successfully and saved as 'generated_ratings_dataset.csv'.")


In [None]:
f