### 02 - Preprocessing
Prepare cleaned data, encode genres, and create train/valid/test splits.

In [14]:
import os, sys
import pandas as pd
import numpy as np
from pathlib import Path
BASE = Path.cwd().parent
sys.path.append(str(BASE / 'src'))
from data_loader import (
    load_ratings, load_movies, merge_ratings_movies,
    filter_cold_start, encode_genres_multihot,
    train_valid_test_split_by_time, save_dataframe
)

RAW = BASE / 'data' / 'raw'
PROC = BASE / 'data' / 'processed'
print('Base:', BASE)
print('RAW exists:', RAW.exists(), 'PROC:', PROC.exists())

Base: /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS
RAW exists: True PROC: True


### Load raw csv

In [10]:
# Define data file paths
ratings_path = RAW / 'ratings.csv'
movies_path  = RAW / 'movies.csv'
assert ratings_path.exists() and movies_path.exists(), 'Place rating.csv and movie.csv in data/raw/'

ratings = load_ratings(str(ratings_path))
movies  = load_movies(str(movies_path))
ratings.head(), movies.head(), ratings.shape, movies.shape

2025-10-18 02:32:14,668 | INFO | data_loader | Loading ratings from /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/raw/ratings.csv
2025-10-18 02:32:14,705 | INFO | data_loader | Loading movies from /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/raw/movies.csv


(   userId  movieId  rating  timestamp            datetime
 0       1        1     4.0  964982703 2000-07-30 18:45:03
 1       1        3     4.0  964981247 2000-07-30 18:20:47
 2       1        6     4.0  964982224 2000-07-30 18:37:04
 3       1       47     5.0  964983815 2000-07-30 19:03:35
 4       1       50     5.0  964982931 2000-07-30 18:48:51,
    movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
 (100836, 5),
 (9742, 3))

### Merge & Basic Filtering

In [12]:
# Merge & Basic Filtering
df = merge_ratings_movies(ratings, movies)
print('Before filter:', df['userId'].nunique(), 'users,', df['movieId'].nunique(), 'movies,', len(df), 'rows')

MIN_USER = 20
MIN_MOVIE = 50
df_f = filter_cold_start(df, min_user_ratings=MIN_USER, min_movie_ratings=MIN_MOVIE)
df_f.head(), df_f.shape


2025-10-18 02:37:24,881 | INFO | data_loader | Merging ratings with movies
2025-10-18 02:37:24,898 | INFO | data_loader | Filtering cold-start users (<20) and movies (<50)
2025-10-18 02:37:24,918 | INFO | data_loader | Remaining: 479 users, 426 movies, 38654 rows


Before filter: 610 users, 9724 movies, 100836 rows


(   userId  movieId  rating  timestamp            datetime  \
 0       1        1     4.0  964982703 2000-07-30 18:45:03   
 2       1        6     4.0  964982224 2000-07-30 18:37:04   
 3       1       47     5.0  964983815 2000-07-30 19:03:35   
 4       1       50     5.0  964982931 2000-07-30 18:48:51   
 5       1       70     3.0  964982400 2000-07-30 18:40:00   
 
                          title                                       genres  
 0             Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy  
 2                  Heat (1995)                        Action|Crime|Thriller  
 3  Seven (a.k.a. Se7en) (1995)                             Mystery|Thriller  
 4   Usual Suspects, The (1995)                       Crime|Mystery|Thriller  
 5   From Dusk Till Dawn (1996)                Action|Comedy|Horror|Thriller  ,
 (38654, 7))

### Encode Genres

In [15]:
movies_enriched, genre_cols = encode_genres_multihot(movies)
print('Genre columns:', len(genre_cols))
movies_enriched.head()

Genre columns: 19


Unnamed: 0,movieId,title,genres,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Documentary,...,genre_Film-Noir,genre_Horror,genre_IMAX,genre_Musical,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_Thriller,genre_War,genre_Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train/Valid/Test Split

In [None]:
# Train/Valid/Test Split
train, valid, test = train_valid_test_by_user = train_valid_test_split_by_time(
    df_f, valid_ratio=0.1, test_ratio=0.1, by_user=True
)
# Summary of splits
for name, part in [('train',train), ('valid',valid), ('test',test)]:
    print(name, part.shape, 'users:', part['userId'].nunique(), 'movies:', part['movieId'].nunique())

2025-10-18 02:48:05,973 | INFO | data_loader | Performing per-user chronological split
2025-10-18 02:48:06,078 | INFO | data_loader | Split sizes -> train:31316 valid:3669 test:3669


train (31316, 7) users: 479 movies: 426
valid (3669, 7) users: 479 movies: 426
test (3669, 7) users: 479 movies: 426


### Save Processed Files

In [17]:
# Save Processed Files
save_dataframe(train, str(PROC / 'ratings_train.csv'))
save_dataframe(valid, str(PROC / 'ratings_valid.csv'))
save_dataframe(test,  str(PROC / 'ratings_test.csv'))
save_dataframe(movies_enriched, str(PROC / 'movies_enriched.csv'))
print('Saved processed datasets in', PROC)

2025-10-18 02:49:01,324 | INFO | data_loader | Saved: /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/processed/ratings_train.csv
2025-10-18 02:49:01,336 | INFO | data_loader | Saved: /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/processed/ratings_valid.csv
2025-10-18 02:49:01,346 | INFO | data_loader | Saved: /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/processed/ratings_test.csv
2025-10-18 02:49:01,367 | INFO | data_loader | Saved: /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/processed/movies_enriched.csv


Saved processed datasets in /Users/alanyu/Documents/IIT/ITM/ITMD-524-Applied AI and Deep Learning/finalproject/MovieLens-MCRS/data/processed
