In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, save_npz, coo_matrix

In [None]:
# Load the user and game tables that define the id spaces used below.
users = pd.read_csv('./data/users.csv')
games = pd.read_csv('./data/games.csv')

# Distinct raw ids, in order of first appearance in each table.
unique_user_ids = users['user_id'].unique()
unique_game_ids = games['app_id'].unique()

# Forward lookups: raw id -> position in the corresponding unique-id array.
user_index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
app_index = dict(zip(unique_game_ids, range(len(unique_game_ids))))

# Reverse lookups: position -> raw id (the forward maps with keys and values swapped).
reverse_user_index = dict(zip(user_index.values(), user_index.keys()))
reverse_app_index = dict(zip(app_index.values(), app_index.keys()))

In [None]:
# load_npz returns the matrix in whatever format it was saved with; the
# .row / .col attributes read below exist only on COO, so convert explicitly
# (a no-copy pass-through when the file is already COO).
rating_matrix_sparse = load_npz('rating_matrix_sparse.npz').tocoo()
row_indices = rating_matrix_sparse.row
col_indices = rating_matrix_sparse.col
data = rating_matrix_sparse.data

# One (row, col, value) triple per stored entry. vstack promotes the three
# arrays to a common dtype, so the index columns become floating point
# whenever `data` is floating point.
entries = np.vstack((row_indices, col_indices, data)).T

# Number of strictly positive stored entries per matrix row. The mask itself
# is the weight, so each positive entry contributes exactly 1 regardless of
# the magnitude of the stored value (weighting by the value would yield a sum,
# not a count).
positive_interactions = data > 0
user_row_counts_positive = np.bincount(
    row_indices,
    weights=positive_interactions.astype(np.float64),
    minlength=rating_matrix_sparse.shape[0],
)

In [None]:
# Bucket matrix rows by their positive-interaction tally.
at_least_15 = user_row_counts_positive >= 15
between_5_and_14 = (user_row_counts_positive < 15) & (user_row_counts_positive >= 5)

users_15 = np.flatnonzero(at_least_15)
users_5_14 = np.flatnonzero(between_5_and_14)

# Rows that have at least one stored entry but fall in neither bucket above.
already_bucketed = np.union1d(users_15, users_5_14)
users_diff = np.setdiff1d(row_indices, already_bucketed)

In [None]:
# Column 0 of `entries` holds the row (user) index of every stored triple.
entry_user_column = entries[:, 0]

# Slice the triples into the three user groups defined in the previous cell.
in_group_15 = np.isin(entry_user_column, users_15)
in_group_5_14 = np.isin(entry_user_column, users_5_14)
in_group_diff = np.isin(entry_user_column, users_diff)

filtered_entries_15 = entries[in_group_15]
filtered_entries_5_14 = entries[in_group_5_14]
filtered_entries_diff = entries[in_group_diff]

In [None]:
# Distinct users present in the >=15 group.
unique_users_15 = np.unique(filtered_entries_15[:, 0])

# Draw the held-out users from a locally seeded generator so the split that is
# later written to disk is identical after every Restart & Run All.
# The sample size is capped at the group size because choice(replace=False)
# raises ValueError when asked for more items than the population holds.
sampler_15 = np.random.default_rng(42)
selected_users_15 = sampler_15.choice(
    unique_users_15,
    size=min(1000, unique_users_15.size),
    replace=False,
)

# Every stored entry of each selected user goes to the test pool.
test_entries_15 = filtered_entries_15[np.isin(filtered_entries_15[:, 0], selected_users_15)]

In [None]:
# Distinct users present in the 5-14 group.
unique_users_5_14 = np.unique(filtered_entries_5_14[:, 0])

# Draw the held-out users from a locally seeded generator so the split that is
# later written to disk is identical after every Restart & Run All.
# The sample size is capped at the group size because choice(replace=False)
# raises ValueError when asked for more items than the population holds.
sampler_5_14 = np.random.default_rng(42)
selected_users_5_14 = sampler_5_14.choice(
    unique_users_5_14,
    size=min(500, unique_users_5_14.size),
    replace=False,
)

# Every stored entry of each selected user goes to the test pool.
test_entries_5_14 = filtered_entries_5_14[np.isin(filtered_entries_5_14[:, 0], selected_users_5_14)]

In [None]:
# Distinct users present in the remaining (neither >=15 nor 5-14) group.
unique_users_diff = np.unique(filtered_entries_diff[:, 0])

# Draw the held-out users from a locally seeded generator so the split that is
# later written to disk is identical after every Restart & Run All.
# The sample size is capped at the group size because choice(replace=False)
# raises ValueError when asked for more items than the population holds.
sampler_diff = np.random.default_rng(42)
selected_users_diff = sampler_diff.choice(
    unique_users_diff,
    size=min(500, unique_users_diff.size),
    replace=False,
)

# Every stored entry of each selected user goes to the test pool.
test_entries_diff = filtered_entries_diff[np.isin(filtered_entries_diff[:, 0], selected_users_diff)]

In [None]:
# Pool the held-out triples of the three user groups.
test_entries = np.vstack((test_entries_15, test_entries_5_14, test_entries_diff))

# The sampling cells move *all* stored entries of each selected user into the
# test pool, so "entry is not in the test pool" is the same as "entry's user is
# not a test user". Testing membership on the user column is vectorised and
# avoids hashing every (row, col, value) triple in a Python loop.
test_user_ids = np.unique(test_entries[:, 0])
train_mask = ~np.isin(entries[:, 0], test_user_ids)
train_entries = entries[train_mask]

num_users, num_games = rating_matrix_sparse.shape

# The index columns of `entries` share the array's (possibly floating-point)
# dtype, so cast them back to integers before handing them to coo_matrix.
train_matrix = coo_matrix(
    (
        train_entries[:, 2],
        (train_entries[:, 0].astype(np.int64), train_entries[:, 1].astype(np.int64)),
    ),
    shape=(num_users, num_games),
)
test_matrix = coo_matrix(
    (
        test_entries[:, 2],
        (test_entries[:, 0].astype(np.int64), test_entries[:, 1].astype(np.int64)),
    ),
    shape=(num_users, num_games),
)

# Persist both halves; the next cell reloads them from these files.
save_npz('train_matrix.npz', train_matrix)
save_npz('test_matrix.npz', test_matrix)

In [None]:
# Reload the persisted split. Both matrices are read through .row/.col/.data
# below, which only exist on the COO format, so coerce both (not just the test
# matrix) to coo_matrix.
train_matrix = coo_matrix(load_npz('train_matrix.npz'))
test_matrix = coo_matrix(load_npz('test_matrix.npz'))

test_row = test_matrix.row
test_col = test_matrix.col
test_data = test_matrix.data

# Group the test triples per user as a list of (column, value) pairs.
# Loop variables deliberately avoid the names `entries` and `data`, which the
# loading cell uses for notebook-level arrays that earlier cells still read.
test_entries_by_user = {}
for user_row, game_col, rating in zip(test_row, test_col, test_data):
    test_entries_by_user.setdefault(user_row, []).append((game_col, rating))

# Part of each test user's entries that is merged back into the training data.
train_and_test_rows = []
train_and_test_cols = []
train_and_test_data = []

# Remaining part of each test user's entries, kept for evaluation.
rest_test_rows = []
rest_test_cols = []
rest_test_data = []

np.random.seed(42)

for user, user_pairs in test_entries_by_user.items():
    # Shuffle this user's pairs (rows of a 2-column array) in place.
    shuffled_pairs = np.array(user_pairs)
    np.random.shuffle(shuffled_pairs)

    # First 60% (rounded down) is revealed to training; a user with a single
    # entry therefore contributes it to the held-out part only.
    split_idx = int(len(shuffled_pairs) * 0.6)
    revealed = shuffled_pairs[:split_idx]
    held_out = shuffled_pairs[split_idx:]

    # np.array() above promotes (column, value) pairs to one common dtype, so
    # the column indices are cast back to integers here.
    train_and_test_rows.extend([user] * len(revealed))
    train_and_test_cols.extend(revealed[:, 0].astype(np.int64))
    train_and_test_data.extend(revealed[:, 1])

    rest_test_rows.extend([user] * len(held_out))
    rest_test_cols.extend(held_out[:, 0].astype(np.int64))
    rest_test_data.extend(held_out[:, 1])

# Training matrix extended with the revealed part of every test user.
# Explicit integer dtypes keep the index arrays integral even when a list is empty.
train_and_test_matrix = coo_matrix((
    np.hstack([train_matrix.data, train_and_test_data]),
    (
        np.hstack([train_matrix.row, np.asarray(train_and_test_rows, dtype=np.int64)]),
        np.hstack([train_matrix.col, np.asarray(train_and_test_cols, dtype=np.int64)])
    )
), shape=train_matrix.shape)

# Held-out remainder of the test users.
rest_test_matrix = coo_matrix((
    np.asarray(rest_test_data, dtype=test_data.dtype),
    (
        np.asarray(rest_test_rows, dtype=np.int64),
        np.asarray(rest_test_cols, dtype=np.int64)
    )
), shape=test_matrix.shape)

save_npz('train_and_test.npz', train_and_test_matrix)
save_npz('rest_test.npz', rest_test_matrix)

In [None]:
import matplotlib.pyplot as plt

recommendations = pd.read_csv('./recommendations.csv')

# Number of non-null review ids per user and per game.
review_ids_by_user = recommendations.groupby('user_id')['review_id']
review_ids_by_app = recommendations.groupby('app_id')['review_id']
user_recommendations = review_ids_by_user.count()
app_recommendations = review_ids_by_app.count()

avg_user_recommendations = user_recommendations.mean()
avg_app_recommendations = app_recommendations.mean()

# One histogram per grouping: (values, bin edges, bar colour, title, y-axis label).
# The titles/labels are Polish: "Histogram of the number of recommendations per
# user / per game", "Number of recommendations", "Number of users / games".
histogram_specs = [
    (user_recommendations, range(1, 20, 1), 'skyblue',
     'Histogram liczby rekomendacji na użytkownika', 'Liczba użytkowników'),
    (app_recommendations, range(100, 20000, 100), 'lightgreen',
     'Histogram liczby rekomendacji na grę', 'Liczba gier'),
]

for hist_values, hist_bins, bar_color, hist_title, y_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(hist_values, bins=hist_bins, color=bar_color, edgecolor='black')
    ax.set_title(hist_title)
    ax.set_xlabel('Liczba rekomendacji')
    ax.set_ylabel(y_label)
    ax.grid(axis='y')
    plt.show()

# Keep only the rows whose game has at least 15 rows in this file.
df = pd.read_csv('cleora/recommendations.csv')
app_id_counts = df['app_id'].value_counts()
app_recommendations_count = app_id_counts.to_dict()

# map() looks up each row's per-game count in one vectorised pass instead of a
# Python-level lambda per row. An app_id absent from the counts (e.g. a missing
# value, which value_counts skips) maps to NaN and is filtered out rather than
# raising KeyError.
rows_per_game = df['app_id'].map(app_id_counts)
filtered_df = df[rows_per_game >= 15]
filtered_df.to_csv('filtered_recommendations.csv', index=False)