# Data Reduction

In [1]:
import pandas as pd
import numpy as np

In [2]:
def print_data_count(ratings: pd.DataFrame):
    """Print the smallest and largest number of ratings per user and per movie.

    The frame is grouped once by ``UserID`` and once by ``MovieID``; for each
    grouping the non-null ``Rating`` entries are counted per group and the
    minimum and maximum of those counts are printed on one line.

    Args:
        ratings: Frame containing at least the ``UserID``, ``MovieID`` and
            ``Rating`` columns.
    """
    # (grouping column, noun for the grouped entity, noun for what is counted)
    layouts = (
        ("UserID", "user", "movie"),
        ("MovieID", "movie", "user"),
    )
    for key_column, subject, counted in layouts:
        per_key = ratings[[key_column, "Rating"]].groupby([key_column]).count()
        sizes = per_key.values
        lowest, highest = sizes.min(), sizes.max()
        print(f"Each {subject} rated from {lowest}-{highest} {counted}s.")

### Full Rating Dataset

In [3]:
# Location of the full ratings table on disk.
SOURCE_FILE = "../database/merged/full/ratings.csv"
# Load the whole table; print_data_count reads its UserID, MovieID and Rating columns.
ratings = pd.read_csv(SOURCE_FILE)
# Report the min/max number of ratings per user and per movie before any reduction.
print_data_count(ratings)

Each user rated from 20-33332 movies.
Each movie rated from 1-102929 users.


### Reduced Rating Dataset

In [4]:
import os, sys
sys.path.append(os.path.abspath(".."))
from utility import Mapper, prune_bigraph

# Destination of the reduced ratings table.
TARGET_FILE = "../database/merged/small/ratings.csv"
# Cache file for the Mapper built over the full dataset.
SOURCE_MAPPER_FILE = "../database/merged/full/pydata/mapper.pkl"
# Cache file for the Mapper built over the reduced dataset.
TARGET_MAPPER_FILE = "../database/merged/small/pydata/mapper.pkl"

# NOTE: this cell deliberately builds no Mapper. The (UserID, MovieID) array and
# the full-dataset Mapper are produced by the following cell, which creates the
# mapper once and afterwards loads it from SOURCE_MAPPER_FILE. Building a second,
# unused Mapper here would redo that work on every run.

In [5]:
# Every column except "Rating", as a plain NumPy array (one row per rating).
user_items = ratings.drop(columns="Rating").to_numpy()

# Reuse the mapper saved by an earlier run when its file exists; otherwise
# build it from the array above and save it for next time.
mapper: Mapper
mapper_is_cached = os.path.isfile(SOURCE_MAPPER_FILE)
if mapper_is_cached:
    print("Loading mapper...")
    mapper = Mapper.load(SOURCE_MAPPER_FILE)
else:
    print("Creating mapper...")
    mapper = Mapper(user_items)
    mapper.save(SOURCE_MAPPER_FILE)
print("Done")

Loading mapper...
Done


In [6]:
# Minimum degrees requested from the pruning step: each user should have rated
# at least `user_threshold` movies, and each movie should have been rated by at
# least `item_threshold` users.
user_threshold, item_threshold = 20, 20

In [None]:
# Reduce the full ratings table to the users/movies that survive bigraph
# pruning, then save the reduced table and a mapper built from it.

# Column 0 holds the user IDs and column 1 the movie IDs of each rating.
raw_users = user_items[:, 0]
raw_items = user_items[:, 1]
num_user, num_item = len(np.unique(raw_users)), len(np.unique(raw_items))

print("Mapping...")
# Translate raw IDs through the full-dataset mapper's forward maps. The raw
# columns keep their own names so `users` / `items` are only ever iterators.
users = iter([mapper.user_fwd_map[int(u)] for u in raw_users])
items = iter([mapper.item_fwd_map[int(i)] for i in raw_items])

print("Start bigraph pruning...")
valid_users, valid_items = prune_bigraph(
    a=users,
    b=items,
    counts=(num_user, num_item),
    thresholds=(user_threshold, item_threshold),
)

print("Inverse Mapping...")
# Back from mapped indices to the IDs used in the ratings table.
valid_users = [mapper.user_inv_map[u] for u in valid_users]
valid_items = [mapper.item_inv_map[i] for i in valid_items]

print("Reducing...")
# Keep a rating only when both its user and its movie survived pruning.
reduced_ratings = ratings[
    (ratings["UserID"].isin(valid_users)) & (ratings["MovieID"].isin(valid_items))
]

print("Saving...")
# Create the output folders up front so a missing directory cannot discard the
# result of the pruning run at the very last step.
for output_path in (TARGET_FILE, TARGET_MAPPER_FILE):
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
reduced_ratings.to_csv(TARGET_FILE, index=False)
# Bound to its own name so `mapper` keeps referring to the full-dataset mapper;
# rebinding `mapper` here would make a re-run of this cell look up full-dataset
# IDs in the reduced mapper.
reduced_mapper = Mapper(reduced_ratings.drop(columns="Rating").to_numpy())
reduced_mapper.save(TARGET_MAPPER_FILE)
print("Done")

Mapping...


In [None]:
# Report the min/max number of ratings per user and per movie for the reduced table.
print_data_count(reduced_ratings)