In [49]:
import os
import re
import zipfile

import numpy as np
import pandas as pd
import plotly.express as px
import requests
from scipy.sparse import csr_matrix
from tqdm.auto import tqdm

# Folder that holds the MovieLens-1M files. Set the ML1M_DIR environment
# variable to point the notebook at a different location; when it is unset the
# original hard-coded location is used, so existing setups keep working.
BASE_FOLDER = os.environ.get(
    "ML1M_DIR",
    "/Users/alapofis/ease-optimization/notebooks/movielens_1m/ml-1m",
)

# Paths of the individual dataset files inside BASE_FOLDER.
MOVIES_PATH = os.path.join(BASE_FOLDER, "movies.dat")
USERS_PATH = os.path.join(BASE_FOLDER, "users.dat")
RATINGS_PATH = os.path.join(BASE_FOLDER, "ratings.dat")
README_PATH = os.path.join(BASE_FOLDER, "README")

In [None]:
# Fetch the MovieLens-1M archive (once) and unpack its files into BASE_FOLDER.
URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
ZIP_PATH = os.path.join(BASE_FOLDER, "ml-1m.zip")

os.makedirs(BASE_FOLDER, exist_ok=True)

if not os.path.exists(ZIP_PATH):
    # Stream into a temporary ".part" file and rename it only after the whole
    # body has been written. Otherwise an interrupted download would leave a
    # truncated archive at ZIP_PATH, and the existence check above would skip
    # the download on every later run.
    partial_path = ZIP_PATH + ".part"
    try:
        # The timeout keeps a stalled connection from hanging the cell forever.
        with requests.get(URL, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, ZIP_PATH)
    finally:
        # After a successful rename the partial file is gone; this only cleans
        # up after a failed or interrupted download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download completed")

if not os.path.exists(MOVIES_PATH):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        # Write every file directly into BASE_FOLDER, ignoring any folder
        # prefix stored in the archive. The GroupLens archive is expected to
        # wrap its files in a top-level "ml-1m/" folder (TODO confirm), in which
        # case extractall() would put them in BASE_FOLDER/ml-1m/ and
        # MOVIES_PATH would never exist. Using only the basename also keeps
        # archive member names from escaping BASE_FOLDER.
        for member in zip_ref.infolist():
            if member.is_dir():
                continue
            target_path = os.path.join(BASE_FOLDER, os.path.basename(member.filename))
            with zip_ref.open(member) as src, open(target_path, 'wb') as dst:
                dst.write(src.read())
    print(f"Files extracted to {BASE_FOLDER}")

In [None]:
# Both files are header-less and use '::' between fields. A separator longer
# than one character is only handled by pandas' python parsing engine.
movies_df = pd.read_csv(
    MOVIES_PATH,
    sep='::',
    engine='python',
    header=None,
    names=['movie_id', 'movie_name', 'genre'],
    encoding='latin1',
)

# Replace the pipe-separated genre string of each movie with a list of genres.
movies_df['genre'] = movies_df['genre'].map(lambda genres: genres.split('|'))

ratings_df = pd.read_csv(
    RATINGS_PATH,
    sep='::',
    engine='python',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    encoding='latin1',
)

In [52]:
# Keep only the user, movie and timestamp columns (the rating value is
# dropped) and label every remaining row with y = 1.
ratings_df = ratings_df.loc[:, ['user_id', 'movie_id', 'timestamp']].assign(y=1)

In [53]:
# Seeded generator so the shuffles below are reproducible.
rng = np.random.default_rng(52)

train_parts, val_parts, test_parts = [], [], []

# Split each user's rows at random: the first 80% (rounded down) of the
# shuffled rows go to train, the next 10% (rounded down) to validation, and
# whatever is left to test.
for _, user_rows in ratings_df.groupby("user_id"):
    n_rows = len(user_rows)

    # Users with fewer than 3 rows are left out of every split.
    if n_rows >= 3:
        n_train = int(0.8 * n_rows)
        n_val = int(0.1 * n_rows)

        shuffled = rng.permutation(n_rows)
        train_pos, val_pos, test_pos = np.split(shuffled, [n_train, n_train + n_val])

        train_parts.append(user_rows.iloc[train_pos])
        val_parts.append(user_rows.iloc[val_pos])
        test_parts.append(user_rows.iloc[test_pos])

train_df = pd.concat(train_parts)
val_df = pd.concat(val_parts)
test_df = pd.concat(test_parts)

In [54]:
# Distinct ids seen in the training split, in order of first appearance.
user_ids = train_df['user_id'].unique()
movie_ids = train_df['movie_id'].unique()

# Map each raw id to a contiguous 0-based position (its index in the arrays above).
user2idx = dict(zip(user_ids, range(len(user_ids))))
item2idx = dict(zip(movie_ids, range(len(movie_ids))))

In [55]:
# Build the sparse user x item training matrix: one row per training user, one
# column per training movie, with a float32 1.0 stored for every row of
# train_df. (csr_matrix is imported in the notebook's import cell.)
rows = train_df.user_id.map(user2idx).to_numpy()
cols = train_df.movie_id.map(item2idx).to_numpy()
data = np.ones(len(train_df), dtype=np.float32)

# NOTE: csr_matrix sums entries that share the same (row, col) pair, so the
# matrix is strictly 0/1 only if train_df has no repeated (user, movie) rows.
X_train = csr_matrix(
    (data, (rows, cols)),
    shape=(len(user2idx), len(item2idx))
)