# Data Reduction

Reduce the original rating data so that it fits into the model and can be used for benchmarking.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def print_data_count(ratings: pd.DataFrame):
    """Print the range of rating counts per user and per movie.

    For each of the ``UserID`` and ``MovieID`` columns, the non-null
    ``Rating`` entries are counted per distinct ID, and the smallest and
    largest of those counts are printed on one line.

    Args:
        ratings: Frame with ``UserID``, ``MovieID`` and ``Rating`` columns.
    """
    # (grouping column, noun for the grouped side, noun for the other side)
    labelled_keys = (("UserID", "user", "movie"), ("MovieID", "movie", "user"))
    for key_column, rater, rated in labelled_keys:
        per_key_counts = ratings[[key_column, "Rating"]].groupby([key_column]).count().values
        lowest, highest = per_key_counts.min(), per_key_counts.max()
        print(f"Each {rater} rated from {lowest}-{highest} {rated}s.")

### Full Rating Dataset

In [3]:
# Relative path of the full ratings CSV.
SOURCE_FILE = "../database/merged/full/ratings.csv"
# Load the full ratings table; print_data_count reads its UserID, MovieID and Rating columns.
ratings = pd.read_csv(SOURCE_FILE)
# Report the min-max number of ratings per user and per movie before any reduction.
print_data_count(ratings)

Each user rated from 20-33332 movies.
Each movie rated from 1-102929 users.


### Reduced Rating Dataset

In [9]:
import os, sys
# Add the parent directory to the import path; `utility` (imported next) presumably lives there.
sys.path.append(os.path.abspath(".."))
from utility import Mapper, prune_bigraph

# Destination CSV for the reduced ratings.
TARGET_FILE = "../database/merged/small/ratings.csv"
# Mapper for the full dataset: loaded from here if the file exists, otherwise created and saved here.
# (.pkl suggests a pickle — confirm in utility.Mapper.)
SOURCE_MAPPER_FILE = "../database/merged/full/pydata/mapper.pkl"
# Where the Mapper rebuilt from the reduced ratings is saved.
TARGET_MAPPER_FILE = "../database/merged/small/pydata/mapper.pkl"

In [10]:
# Keep every column except "Rating" (presumably the UserID/MovieID pairs) and build a Mapper from them.
# NOTE(review): `ui_matrix` and `ui_mapper` are not referenced by any other visible cell, and the
# next cell creates or loads `mapper` from the same data — confirm whether this cell is still needed.
user_items = ratings.drop(columns="Rating")
ui_matrix = user_items.to_numpy()
ui_mapper = Mapper(ui_matrix)

In [13]:
# Array of every ratings column except "Rating"; later cells read column 0 as users and column 1 as items.
user_items = ratings.drop(columns="Rating").to_numpy()

# Reuse the saved full-dataset mapper when it exists; otherwise build it once and save it.
if os.path.isfile(SOURCE_MAPPER_FILE):
    print("Loading mapper...")
    mapper = Mapper.load(SOURCE_MAPPER_FILE)
else:
    print("Creating mapper...")
    mapper = Mapper(user_items)
    mapper.save(SOURCE_MAPPER_FILE)
print("Done")

Loading mapper...
Done


In [None]:
# Prune the user-movie bipartite graph, keep only the ratings whose user AND
# movie both survive, then save the reduced ratings and a mapper built from them.

# Thresholds handed to prune_bigraph; presumably (per-user, per-movie) minimums,
# matching the (a, b) argument order — confirm against utility.prune_bigraph.
PRUNE_THRESHOLDS = (100, 100)

# Raw ID columns of the full dataset.
users = user_items[:, 0]
items = user_items[:, 1]
num_user, num_item = len(np.unique(users)), len(np.unique(items))

print("Mapping...")
# Translate raw IDs through the full-dataset mapper. The mapped sequences get
# their own names so `users`/`items` keep holding the raw ID columns.
user_indices = iter([mapper.user_fwd_map[int(u)] for u in users])
item_indices = iter([mapper.item_fwd_map[int(i)] for i in items])

print("Start bigraph pruning...")
valid_users, valid_items = prune_bigraph(
    a=user_indices, b=item_indices, counts=(num_user, num_item), thresholds=PRUNE_THRESHOLDS
)

print("Inverse Mapping...")
# Convert the surviving entries back to the original UserID / MovieID values.
valid_users = [mapper.user_inv_map[u] for u in valid_users]
valid_items = [mapper.item_inv_map[i] for i in valid_items]

print("Reducing...")
reduced_ratings = ratings[
    ratings["UserID"].isin(valid_users) & ratings["MovieID"].isin(valid_items)
]

print("Saving...")
# DataFrame.to_csv does not create missing parent directories, so make sure
# both output folders exist before writing.
for out_path in (TARGET_FILE, TARGET_MAPPER_FILE):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
reduced_ratings.to_csv(TARGET_FILE, index=False)
# Use a separate name for the reduced mapper: rebinding `mapper` here would make
# a re-run of this cell look up full-dataset IDs in the mapper built from the
# reduced data.
reduced_mapper = Mapper(reduced_ratings.drop(columns="Rating").to_numpy())
reduced_mapper.save(TARGET_MAPPER_FILE)
print("Done")

Mapping...
Start bigraph pruning...


In [None]:
# Same per-user / per-movie count summary, now on the reduced ratings.
# NOTE(review): the saved output for this cell starts with a "Dataset contains ..." summary that
# the print_data_count defined in this notebook does not print — the output looks stale; re-run.
print_data_count(reduced_ratings)

Dataset contains 251894 ratings from 
-- 36 users
-- 51947 movies
Each user rated from 5004-33332 movies.
Each movie rated from 1-36 users.


In [None]:
# User x movie rating matrix. Cells with no rating are filled with 0.0; if a
# (user, movie) pair occurs more than once, pivot_table's default aggregation
# (mean) is applied.
reduced_ratings.pivot_table(
    index="UserID",
    columns="MovieID",
    values="Rating",
    fill_value=0.0,
)

MovieID,1,2,3,4,5,6,7,8,9,10,...,291815,291857,291883,292021,292031,292139,292175,292313,292349,292395
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7858,5.0,0.5,0.5,0.5,1.0,4.0,1.0,0.5,0.5,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
8359,4.0,3.0,3.0,0.0,3.0,4.0,0.0,3.0,3.5,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8588,4.0,3.5,3.0,0.0,3.5,5.0,0.0,0.0,2.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9085,4.0,3.0,2.0,0.0,2.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10202,5.0,4.0,4.0,0.0,5.0,4.0,4.0,0.0,0.0,4.0,...,0.0,0.0,2.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0
14674,3.0,1.5,3.0,0.0,2.5,3.5,2.5,0.0,2.5,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15875,2.5,3.5,3.0,2.0,2.5,4.0,2.5,0.0,2.0,3.0,...,0.0,4.0,0.0,0.0,3.0,3.5,0.0,0.0,0.0,0.0
17035,3.5,1.5,3.0,1.0,2.5,4.0,3.0,0.0,0.0,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22744,5.0,4.0,3.0,0.5,4.0,5.0,3.0,2.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37677,2.0,5.0,3.0,2.0,0.0,3.0,4.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
