In [1]:
import kagglehub
import os, gc, json, math
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

# Download the MovieLens 25M dataset through kagglehub and report where the files are.
path = kagglehub.dataset_download("garymk/movielens-25m-dataset")
print("Path to dataset files:", path)

# Folder for generated outputs; exist_ok=True makes re-running this cell safe.
OUT_DIR  = "artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

# Input files are read relative to the download location.
DATA_DIR = path


Downloading from https://www.kaggle.com/api/v1/datasets/download/garymk/movielens-25m-dataset?dataset_version_number=1...


100%|██████████| 258M/258M [00:02<00:00, 126MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/garymk/movielens-25m-dataset/versions/1


In [9]:

# All CSV files are read from the "ml-25m" subfolder of the download path.
DATA_DIR = os.path.join(path, "ml-25m")

# Ratings: only the four listed columns are read; the two ID columns are stored
# as int32, while rating and timestamp keep whatever dtype pandas infers.
ratings = pd.read_csv(
    f"{DATA_DIR}/ratings.csv",
    usecols=["userId", "movieId", "rating", "timestamp"],
    dtype=dict(userId="int32", movieId="int32"),
)

# Movies: movieId plus title and genres as pandas nullable strings.
movies = pd.read_csv(
    f"{DATA_DIR}/movies.csv",
    dtype=dict(movieId="int32", title="string", genres="string"),
)

# User-applied tags: (userId, movieId, tag, timestamp).
tags = pd.read_csv(
    f"{DATA_DIR}/tags.csv",
    dtype=dict(userId="int32", movieId="int32", tag="string", timestamp="int64"),
)

# Tag genome scores: a float32 relevance value per (movieId, tagId) row.
genome_scores = pd.read_csv(
    f"{DATA_DIR}/genome-scores.csv",
    dtype=dict(movieId="int32", tagId="int32", relevance="float32"),
)

# Tag genome lookup: tagId and its tag text.
genome_tags = pd.read_csv(
    f"{DATA_DIR}/genome-tags.csv",
    dtype=dict(tagId="int32", tag="string"),
)

# External identifiers are read as strings rather than numbers.
links = pd.read_csv(
    f"{DATA_DIR}/links.csv",
    dtype=dict(movieId="int32", imdbId="string", tmdbId="string"),
)



In [10]:

# Basic cleaning of `ratings` and `movies`.
#
# Every step rebinds the name to the returned frame instead of using
# inplace=True. The previous version called drop_duplicates(inplace=True) on a
# frame produced by boolean-mask filtering, which can raise
# SettingWithCopyWarning on pandas versions without copy-on-write. Reassigning
# yields the same rows without mutating a filtered frame.

# Drop rows that are missing any of the fields the later cells rely on.
ratings = ratings.dropna(subset=["userId","movieId","rating"])
movies = movies.dropna(subset=["movieId","title"])

# Enforce rating bounds: keep values in the closed interval [0.5, 5.0].
ratings = ratings[ratings["rating"].between(0.5, 5.0)]

# Remove duplicate rating rows. Rows count as duplicates only when user, movie
# AND timestamp all match; the first occurrence is kept.
ratings = ratings.drop_duplicates(subset=["userId","movieId","timestamp"])

# Ensure unique movie rows: keep the first row seen for each movieId.
movies = movies.drop_duplicates(subset=["movieId"])

ratings.shape, movies.shape


((25000095, 4), (62423, 3))

In [11]:
# Restrict `ratings` to users and movies that each have at least 5 rating rows.
# NOTE(review): both counts are taken once, on the frame as it is before this
# filter runs, so the threshold is not re-checked on the filtered result.
user_counts = ratings.groupby("userId").size()
movie_counts = ratings.groupby("movieId").size()

active_user_ids = user_counts[user_counts >= 5].index
active_movie_ids = movie_counts[movie_counts >= 5].index

# A row survives only when both its user and its movie pass the threshold.
is_active_row = ratings["userId"].isin(active_user_ids) & ratings["movieId"].isin(active_movie_ids)
ratings = ratings[is_active_row].copy()  # explicit copy so later column assignments act on an independent frame

# Summary of what is left after filtering.
print("Users:", ratings['userId'].nunique())
print("Movies:", ratings['movieId'].nunique())
print("Ratings:", len(ratings))


Users: 162541
Movies: 32720
Ratings: 24945870


In [12]:
# Map raw IDs to contiguous zero-based positions. IDs are ranked in ascending
# order, so the smallest userId / movieId gets index 0.
sorted_user_ids = sorted(ratings["userId"].unique())
sorted_movie_ids = sorted(ratings["movieId"].unique())

uid_map = dict(zip(sorted_user_ids, range(len(sorted_user_ids))))
mid_map = dict(zip(sorted_movie_ids, range(len(sorted_movie_ids))))

# Attach the positional indices next to the original ID columns.
ratings["uid"] = ratings["userId"].map(uid_map)
ratings["mid"] = ratings["movieId"].map(mid_map)

print(ratings.head())


   userId  movieId  rating   timestamp  uid  mid
0       1      296     5.0  1147880044    0  292
1       1      306     3.5  1147868817    0  302
2       1      307     5.0  1147868828    0  303
3       1      665     5.0  1147878820    0  654
4       1      899     3.5  1147868510    0  878
