In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the user x item ratings table. The first CSV column ('Unnamed: 0')
# carries the user labels; every remaining column is treated as one item.
data = pd.read_csv('_movie_ratings.csv')

# Index the ratings by user label so that label-based lookups
# (ratings_matrix.loc[...]) address the same users as cosine_sim_df does.
# With the default RangeIndex, the labels stored in top_users_dict would be
# looked up as row positions instead.
ratings_matrix = data.iloc[:, 1:].set_index(data['Unnamed: 0'])

# cosine_similarity rejects NaN input, so missing ratings are replaced by 0
# for the similarity computation only; ratings_matrix itself keeps its NaNs.
cosine_sim = cosine_similarity(ratings_matrix.fillna(0))
cosine_sim_df = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

# Each user's neighbourhood is the top 20% most similar other users.
top_percent = 0.2
num_closest_users = int(len(cosine_sim_df) * top_percent)

top_users_dict = {}
for user in cosine_sim_df.index:
    # Remove the user's own entry by label instead of assuming it sorts first:
    # a tie at the maximum similarity (another user with identical ratings) or
    # an all-zero row could otherwise keep the user as their own neighbour.
    similar_users = cosine_sim_df[user].drop(user).sort_values(ascending=False)
    top_users = similar_users.iloc[:num_closest_users].index.tolist()
    top_users_dict[user] = top_users


def calculate_prediction(user, top_users, ratings_matrix):
    """Predict item ratings as the plain average over a set of neighbours.

    Args:
        user: The target user. Not used in the computation; kept so the
            signature stays the same for callers.
        top_users: Row labels of ``ratings_matrix`` to average over.
        ratings_matrix: DataFrame with one row per user and one column per item.

    Returns:
        A Series indexed by item (the columns of ``ratings_matrix``) holding the
        column-wise mean of the selected rows. Missing values are skipped, which
        is the pandas default for ``mean``.
    """
    return ratings_matrix.loc[top_users].mean(axis=0)

# Baseline predictions: for every user, average the ratings of the neighbours
# chosen by plain cosine similarity.
predictions = {
    user: calculate_prediction(user, top_users, ratings_matrix)
    for user, top_users in top_users_dict.items()
}


def calculate_discounted_similarity(user, top_users, cosine_sim_df, ratings_matrix, threshold=0.3):
    """Scale a user's similarities by a discount based on co-rated items.

    For each neighbour, the share of items rated by BOTH ``user`` and that
    neighbour is computed over all item columns. Neighbours whose share is at
    least ``threshold`` keep their similarity; the others have it halved.

    Args:
        user: Label of the target user (a row of ``ratings_matrix`` and of
            ``cosine_sim_df``).
        top_users: Labels of the neighbours to evaluate.
        cosine_sim_df: User x user similarity DataFrame.
        ratings_matrix: User x item ratings DataFrame; NaN marks "not rated".
        threshold: Minimum co-rated share (0..1) needed to avoid the discount.

    Returns:
        A Series indexed by ``top_users`` with the discounted similarities.
    """
    # An item is co-rated only when the target user AND the neighbour both
    # rated it. Counting the neighbour's ratings alone would measure how active
    # the neighbour is, not how much evidence the similarity is based on.
    user_rated = ratings_matrix.loc[user].notnull().to_numpy()
    neighbours_rated = ratings_matrix.loc[top_users].notnull().to_numpy()
    co_rated_items = (neighbours_rated & user_rated).sum(axis=1)

    total_items = len(ratings_matrix.columns)
    if total_items:
        co_rated_percentage = co_rated_items / total_items
    else:
        # No item columns: nothing can be co-rated; avoid a 0/0 division.
        co_rated_percentage = np.zeros(len(co_rated_items))

    # The discount array is positional and lines up with top_users order.
    discount_factor = np.where(co_rated_percentage >= threshold, 1, 0.5)
    discounted_similarity = cosine_sim_df.loc[user, top_users] * discount_factor
    return discounted_similarity

# Discounted-similarity predictions.
#
# calculate_prediction is an unweighted mean over the chosen neighbours, so a
# discounted similarity can only change the result through WHICH neighbours are
# chosen. Each user's neighbours are therefore re-selected by discounted
# similarity. Reusing top_users_dict here would reproduce `predictions` exactly.
discounted_sim_df = cosine_sim_df.copy()
discounted_predictions = {}
for user in cosine_sim_df.index:
    other_users = cosine_sim_df.index.drop(user).tolist()
    discounted = calculate_discounted_similarity(user, other_users, cosine_sim_df, ratings_matrix)

    # Record the discounted row so discounted_sim_df really holds discounted
    # values (the diagonal keeps the original self-similarity).
    discounted_sim_df.loc[user, other_users] = discounted.to_numpy()

    top_users = discounted.sort_values(ascending=False).iloc[:num_closest_users].index.tolist()
    discounted_predictions[user] = calculate_prediction(user, top_users, ratings_matrix)

# One column per user, one row per item.
predictions_df = pd.DataFrame(predictions)
discounted_predictions_df = pd.DataFrame(discounted_predictions)


print("Predictions using Cosine Similarity:")
print(predictions_df.head())
print("\nPredictions using Discounted Similarity:")
print(discounted_predictions_df.head())
# NOTE(review): '/path/to/save' looks like a placeholder directory - point it at
# a real location before running. index=False also omits the item labels (the
# row index) from the CSV; confirm that is intended.
predictions_df.to_csv('/path/to/save/predictions_cosine.csv', index=False)
discounted_predictions_df.to_csv('/path/to/save/predictions_discounted.csv', index=False)



In [None]:


tnu = ratings_df['user_id'].nunique()
print(f"Total Number of Users (tnu): {tnu}")


tni = ratings_df['item_id'].nunique()
print(f"Total Number of Items (tni): {tni}")


ratings_per_item = ratings_df['item_id'].value_counts().reset_index()
ratings_per_item.columns = ['item_id', 'num_ratings']
print("\nNumber of Ratings per Item:")
print(ratings_per_item.head())

ratings_per_item.to_csv("ratings_per_item.csv", index=False)


In [None]:

import pandas as pd
import numpy as np
import random

np.random.seed(42)
num_users = 1000
num_items = 100
num_ratings = 100000

user_ids = np.arange(1, num_users + 1)
item_ids = np.arange(1, num_items + 1)

ratings_data = {
    "user_id": np.random.choice(user_ids, size=num_ratings),
    "item_id": np.random.choice(item_ids, size=num_ratings),
    "rating": np.random.randint(1, 6, size=num_ratings)
}

ratings_df = pd.DataFrame(ratings_data)
ratings_df.drop_duplicates(subset=["user_id", "item_id"], inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

print("Sample of the generated dataset:")
print(ratings_df.head())


ratings_df.to_csv("generated_ratings_dataset.csv", index=False)

print("\nDataset generated successfully and saved as 'generated_ratings_dataset.csv'.")
