## Import Libraries & Load Cleaned Data

In [1]:
import os

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, save_npz
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# --- Configuration -----------------------------------------------------------
# Root of the project's data directory. Set the MOVIE_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used.
BASE_PATH = os.environ.get(
    "MOVIE_DATA_DIR",
    r"C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data",
)
PROCESSED_PATH = os.path.join(BASE_PATH, "processed")
DATASETS_PATH = os.path.join(PROCESSED_PATH, "datasets")
CLEANED_PATH = os.path.join(PROCESSED_PATH, "cleaned")

# Every artefact written by this notebook goes here.
PREPROCESS_PATH = os.path.join(PROCESSED_PATH, "preprocess")

os.makedirs(PREPROCESS_PATH, exist_ok=True)

files = [
    "movies_cleaned_f.csv",
    "ratings_cleaned_f.csv",
    "tags_cleaned_f.csv",
    "links_cleaned.csv"
]

# --- Locate each input file --------------------------------------------------
# Each file is looked up in both folders and loaded from the first place it is
# found, so the log line always names the location that is actually read.
# links_cleaned.csv keeps its DATASETS_PATH-first preference; all other files
# are looked up in CLEANED_PATH first.
default_search_order = [("CLEANED_PATH", CLEANED_PATH), ("DATASETS_PATH", DATASETS_PATH)]
search_order = {
    "links_cleaned.csv": [("DATASETS_PATH", DATASETS_PATH), ("CLEANED_PATH", CLEANED_PATH)],
}

print("[RUN] Loading cleaned datasets in multiple locations...\n")
missing_files = []
resolved_paths = {}

for file in files:
    for label, folder in search_order.get(file, default_search_order):
        candidate = os.path.join(folder, file)
        if os.path.exists(candidate):
            print(f"[INFO] Load {file} in {label}")
            resolved_paths[file] = candidate
            break
    else:
        # Not found in any search location.
        print(f"[ERROR] Missing: {file}")
        missing_files.append(file)

# Fail fast with one clear error instead of continuing into a read_csv failure.
if missing_files:
    raise FileNotFoundError(f"Missing files: {', '.join(missing_files)}")

# --- Load --------------------------------------------------------------------
movies = pd.read_csv(resolved_paths["movies_cleaned_f.csv"])
ratings = pd.read_csv(resolved_paths["ratings_cleaned_f.csv"])
tags = pd.read_csv(resolved_paths["tags_cleaned_f.csv"])
links_path = resolved_paths["links_cleaned.csv"]
links = pd.read_csv(links_path)

# Reported only after every file has really been read.
print("\n[DONE] All files Loaded successfully!")

display(movies.head())
display(ratings.head())
display(tags.head())
display(links.head())

[RUN] Checking cleaned datasets in multiple locations...

[INFO] Load movies_cleaned_f.csv in CLEANED_PATH
[INFO] Load ratings_cleaned_f.csv in CLEANED_PATH
[INFO] Load tags_cleaned_f.csv in CLEANED_PATH
[INFO] Load links_cleaned.csv in DATASETS_PATH

[DONE] All files found successfully!


Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995.0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995.0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995.0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,userId,movieId,rating,timestamp,datetime
0,1,17,4.0,944249077,1999-12-03 19:24:37
1,1,25,1.0,944250228,1999-12-03 19:43:48
2,1,29,2.0,943230976,1999-11-22 00:36:16
3,1,30,5.0,944249077,1999-12-03 19:24:37
4,1,32,5.0,943228858,1999-11-22 00:00:58


Unnamed: 0,userId,movieId,tag,timestamp,datetime
0,22,26479,Kevin Kline,1583038886,2020-03-01 05:01:26
1,22,79592,misogyny,1581476297,2020-02-12 02:58:17
2,22,247150,acrophobia,1622483469,2021-05-31 17:51:09
3,34,2174,music,1249808064,2009-08-09 08:54:24
4,34,2174,weird,1249808102,2009-08-09 08:55:02


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## Merge Datasets & Time Features

In [2]:
# Build one row per rating, enriched with movie metadata, external ids and
# time-derived features.
print("[RUN] Merging datasets...\n")

# validate="many_to_one" makes pandas raise MergeError if movieId is not unique
# in the right-hand table, instead of silently duplicating rating rows.
movie_data = pd.merge(ratings, movies, on="movieId", how="left", validate="many_to_one")
movie_data = pd.merge(movie_data, links, on="movieId", how="left", validate="many_to_one")

print(f"[INFO] movie_data shape: {movie_data.shape}")

# Rebuild `datetime` from the epoch-seconds `timestamp` column so it has a
# real datetime dtype; unparseable values become NaT (errors='coerce').
movie_data['datetime'] = pd.to_datetime(movie_data['timestamp'], unit='s', errors='coerce')
movie_data['year_rated']  = movie_data['datetime'].dt.year
movie_data['month_rated'] = movie_data['datetime'].dt.month
movie_data['day_rated']   = movie_data['datetime'].dt.day

# Release year: force numeric, then fill gaps with the median of the merged
# (per-rating) rows.
movie_data['year'] = pd.to_numeric(movie_data['year'], errors='coerce')
movie_data['year'] = movie_data['year'].fillna(movie_data['year'].median())

# MinMaxScaler is imported in the notebook's import cell. fit_transform
# returns an (n, 1) array; ravel() turns it into the 1-D vector a column holds.
scaler = MinMaxScaler()
movie_data['year_norm'] = scaler.fit_transform(movie_data[['year']]).ravel()

print("\n[INFO] Added time-based features: year_rated, month_rated, day_rated, year_norm")
print("[DONE] Merge and time feature generation complete.")

display(movie_data.sample(5))

[RUN] Merging datasets...

[INFO] movie_data shape: (32000204, 29)

[INFO] Added time-based features: year_rated, month_rated, day_rated, year_norm
[DONE] Merge and time feature generation complete.


Unnamed: 0,userId,movieId,rating,timestamp,datetime,title,year,(no genres listed),Action,Adventure,...,Sci-Fi,Thriller,War,Western,imdbId,tmdbId,year_rated,month_rated,day_rated,year_norm
13573847,84879,60072,4.5,1499689088,2017-07-10 12:18:08,Wanted,2008.0,0,1,0,...,0,1,0,0,493464,8909.0,2017,7,10,0.899329
13654128,85360,199958,3.0,1602360132,2020-10-10 20:02:12,"Yes, God, Yes",2017.0,0,0,0,...,0,0,0,0,6089002,421465.0,2020,10,10,0.959732
1340398,8588,2159,0.5,1466985876,2016-06-27 00:04:36,Henry: Portrait of a Serial Killer,1986.0,0,0,0,...,0,1,0,0,99763,10692.0,2016,6,27,0.751678
14217513,88923,117529,3.0,1691178532,2023-08-04 19:48:52,Jurassic World,2015.0,0,1,1,...,1,1,0,0,369610,135397.0,2023,8,4,0.946309
26707982,167887,1799,3.0,945241863,1999-12-15 07:11:03,Suicide Kings,1997.0,0,0,0,...,0,1,0,0,120241,10668.0,1999,12,15,0.825503


## TF-IDF from Tags

In [3]:
# Turn each movie's user tags into a TF-IDF vector and save it next to the
# movie's id, title and year.
print("[RUN] Building TF-IDF matrix from tags...\n")

# One text "document" per movie: all of its tags joined by spaces.
# dropna/astype(str) keep ' '.join from raising on tags that read_csv parsed
# as NaN or as numbers.
tags_text = (
    tags.dropna(subset=['tag'])
        .assign(tag=lambda df: df['tag'].astype(str))
        .groupby('movieId')['tag']
        .agg(' '.join)
        .reset_index()
)
# Left merge keeps every movie; movies without tags get an empty document.
movie_tags = pd.merge(movies, tags_text, on='movieId', how='left')
movie_tags['tag'] = movie_tags['tag'].fillna('')

tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_tags['tag'])

# A vocabulary term can have the same name as a metadata column (the sample
# output shows a second `year` column). Only the colliding terms are renamed,
# with a "tag_" prefix, so the saved table has unique column names.
meta_cols = ['movieId', 'title', 'year']
term_cols = [
    f"tag_{term}" if term in meta_cols else term
    for term in tfidf.get_feature_names_out()
]

# toarray() densifies the matrix so it can be written as CSV columns.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=term_cols,
                        index=movie_tags.index)

movie_features = pd.concat([movie_tags[meta_cols], tfidf_df], axis=1)
assert movie_features.columns.is_unique, "duplicate column names in movie_features"

tfidf_path = os.path.join(PREPROCESS_PATH, "movies_tfidf_features.csv")
movie_features.to_csv(tfidf_path, index=False)

print(f"[DONE] TF-IDF features saved to: {tfidf_path}")
print("[INFO] Shape:", movie_features.shape)

display(movie_features.sample(5))

[RUN] Building TF-IDF matrix from tags...

[DONE] TF-IDF features saved to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\preprocess\movies_tfidf_features.csv
[INFO] Shape: (87585, 1003)


Unnamed: 0,movieId,title,year,10,100,11,1930s,1950s,1960s,1970s,...,world,writer,writing,written,year.1,york,young,younger,zombie,zombies
86714,289577,The Collective,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18317,95875,Total Recall,2012.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2070,2160,Rosemary's Baby,1968.0,0.0,0.014701,0.0,0.0,0.0,0.029265,0.0,...,0.0,0.012756,0.0,0.014217,0.012885,0.046831,0.0,0.0,0.0,0.0
87234,291118,Farmer Seeking Love,2022.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8508,25994,Salt of the Earth,1954.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create Sparse User–Movie Matrix for Collaborative Filtering

In [4]:
# Build the user x movie rating matrix in sparse COO form and save it together
# with the index -> id lookup tables. coo_matrix / save_npz come from the
# notebook's import cell.
print("[RUN] Creating Sparse User–Movie Matrix for CF...\n")

# Encode each id column as a categorical exactly once: `.cat.codes` are the
# 0-based row/column positions and `.cat.categories` maps positions back to ids.
user_cat = movie_data['userId'].astype('category')
movie_cat = movie_data['movieId'].astype('category')

n_users = len(user_cat.cat.categories)
n_movies = len(movie_cat.cat.categories)
n_ratings = movie_data.shape[0]

print(f"[INFO] Unique users: {n_users}")
print(f"[INFO] Unique movies: {n_movies}")
print(f"[INFO] Total ratings: {n_ratings}")

user_codes = user_cat.cat.codes
movie_codes = movie_cat.cat.codes
rating_values = movie_data['rating'].astype(np.float32)

# State the shape explicitly instead of letting it be inferred from the
# largest index present.
user_movie_sparse = coo_matrix(
    (rating_values.to_numpy(), (user_codes.to_numpy(), movie_codes.to_numpy())),
    shape=(n_users, n_movies),
)

# Cell count with Python ints: np.prod(shape) can wrap around where NumPy's
# default integer is 32-bit (e.g. NumPy 1.x on Windows), and users x movies
# here exceeds 2**31.
n_cells = int(user_movie_sparse.shape[0]) * int(user_movie_sparse.shape[1])
density = user_movie_sparse.nnz / n_cells if n_cells else 0.0

print("\n[INFO] Sparse matrix created successfully!")
print("[INFO] Shape:", user_movie_sparse.shape)
print("[INFO] Density (non-zero ratio):", round(density, 6))

sparse_path = os.path.join(PREPROCESS_PATH, "user_movie_matrix_sparse.npz")
save_npz(sparse_path, user_movie_sparse)

# Row/column position -> original id, for translating model output back.
user_mapping = dict(enumerate(user_cat.cat.categories))
movie_mapping = dict(enumerate(movie_cat.cat.categories))

pd.DataFrame(list(user_mapping.items()), columns=["user_index", "userId"])\
    .to_csv(os.path.join(PREPROCESS_PATH, "user_mapping.csv"), index=False)
pd.DataFrame(list(movie_mapping.items()), columns=["movie_index", "movieId"])\
    .to_csv(os.path.join(PREPROCESS_PATH, "movie_mapping.csv"), index=False)

print(f"[DONE] Saved sparse matrix to: {sparse_path}")
print("[DONE] Saved user_mapping.csv & movie_mapping.csv for model reference")

print(f"\n[SUMMARY]")
print(f"[INFO] Sparse matrix size: {user_movie_sparse.shape[0]} users × {user_movie_sparse.shape[1]} movies")
print(f"[INFO] Density: {density:.6f}")
# Rough estimates only: 8 bytes per dense cell vs. 8 bytes per stored rating.
print(f"[INFO] File size reduced from ~{(n_users*n_movies*8)/1e9:.1f} GB → < {(user_movie_sparse.nnz*8)/1e9:.2f} GB")
print("\n[DONE] Sparse User–Movie Matrix ready for Modeling")

[RUN] Creating Sparse User–Movie Matrix for CF...

[INFO] Unique users: 200948
[INFO] Unique movies: 84432
[INFO] Total ratings: 32000204

[INFO] Sparse matrix created successfully!
[INFO] Shape: (200948, 84432)
[INFO] Density (non-zero ratio): 0.001886
[DONE] Saved sparse matrix to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\preprocess\user_movie_matrix_sparse.npz
[DONE] Saved user_mapping.csv & movie_mapping.csv for model reference

[SUMMARY]
[INFO] Sparse matrix size: 200948 users × 84432 movies
[INFO] Density: 0.001886
[INFO] File size reduced from ~135.7 GB → < 0.26 GB

[DONE] Sparse User–Movie Matrix ready for Modeling


## Dimensionality Reduction (SVD for Tag Features)

In [5]:
# Compress the TF-IDF tag matrix to a small dense feature set per movie.
# TruncatedSVD is imported in the notebook's import cell.
print("[RUN] Performing SVD for dimensionality reduction on tag features...\n")

# Single source of truth for the number of latent components.
N_SVD_COMPONENTS = 50

svd = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=42)
movie_tfidf_reduced = svd.fit_transform(tfidf_matrix)

# Column names follow the actual output width, so they cannot drift from
# n_components.
movie_tfidf_reduced_df = pd.DataFrame(
    movie_tfidf_reduced,
    columns=[f"feature_{i}" for i in range(movie_tfidf_reduced.shape[1])]
)
# Rows of tfidf_matrix are in movie_tags row order, so attach movieId by
# position rather than by index label.
movie_tfidf_reduced_df["movieId"] = movie_tags["movieId"].to_numpy()

svd_path = os.path.join(PREPROCESS_PATH, "movies_tfidf_reduced.csv")
movie_tfidf_reduced_df.to_csv(svd_path, index=False)

print(f"[DONE] Reduced TF-IDF saved to: {svd_path}")
print("[INFO] Explained variance ratio sum:",
      np.sum(svd.explained_variance_ratio_))
print("[INFO] Shape:", movie_tfidf_reduced_df.shape)
display(movie_tfidf_reduced_df.head())

[RUN] Performing SVD for dimensionality reduction on tag features...

[DONE] Reduced TF-IDF saved to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\preprocess\movies_tfidf_reduced.csv
[INFO] Explained variance ratio sum: 0.3229336991591538
[INFO] Shape: (87585, 51)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,movieId
0,0.024369,0.113343,-0.066826,0.042824,-0.00073,0.014523,-0.010473,0.058073,0.019716,-0.06619,...,0.043217,0.014264,-0.099437,-0.064526,-0.079561,0.033173,0.045353,-0.023632,-0.03335,1
1,0.043496,0.135102,-0.077875,-0.023515,-0.058605,-0.00174,-0.026413,0.021826,-0.002712,-0.0884,...,0.0302,-0.034351,-0.070721,-0.004674,0.031803,-0.023982,-0.017174,0.026543,-0.003107,2
2,0.019877,0.072069,-0.042825,-0.005309,-0.017132,-0.031757,0.002561,0.01681,0.010842,-0.046801,...,-0.025924,-0.002673,-0.020403,0.000701,0.014199,-0.000693,-0.017936,0.07576,-0.034206,3
3,0.023366,0.102719,-0.073262,-0.065091,-0.071397,0.068855,-0.096441,-0.023896,-0.047241,-0.023622,...,-0.052068,-0.030263,-0.009901,0.014144,-0.02177,0.035409,-0.003188,-0.005026,0.012138,4
4,0.034848,0.127369,-0.100255,0.030209,-0.029277,-0.047389,-0.002588,0.025224,0.022226,-0.137525,...,0.022234,-0.022321,0.045999,-0.009795,0.025372,0.007069,-0.008183,0.035627,0.001264,5


## Build Hybrid Dataset (Content + Collaborative)

In [6]:
# Write the user-movie interaction table, restricted to movies that also have
# a row in the reduced TF-IDF feature table.
print("[RUN] Building lightweight hybrid dataset (Content + Collaborative)...\n")

print("[INFO] movie_data shape:", movie_data.shape)
print("[INFO] tfidf_reduced shape:", movie_tfidf_reduced_df.shape)

# Output locations, resolved up front.
hybrid_info_path = os.path.join(PREPROCESS_PATH, "hybrid_user_movie_info.csv")
tfidf_reduced_path = os.path.join(PREPROCESS_PATH, "movies_tfidf_reduced.csv")

# Keep only ratings whose movie appears in the content-feature table, and only
# the interaction columns.
valid_ids = movie_tfidf_reduced_df['movieId'].unique()
has_content_features = movie_data['movieId'].isin(valid_ids)
hybrid_info = movie_data.loc[
    has_content_features,
    ['userId', 'movieId', 'rating', 'year_norm', 'year_rated', 'month_rated', 'day_rated'],
]

hybrid_info.to_csv(hybrid_info_path, index=False)

print(f"[DONE] Saved hybrid_user_movie_info.csv to: {hybrid_info_path}")
print(f"[INFO] Shape: {hybrid_info.shape}")
print(f"[INFO] Columns: {list(hybrid_info.columns)}")

# Closing notes on which file feeds which model.
for note_line in (
    "\n[INFO] Note:",
    "  - movies_tfidf_reduced.csv = content features of each movie (used in Content Model)",
    "  - hybrid_user_movie_info.csv = user-movie interactions (used in Collaborative Model)",
    "\n[DONE] Lightweight hybrid dataset ready for modeling",
):
    print(note_line)

display(hybrid_info.sample(5))

[RUN] Building lightweight hybrid dataset (Content + Collaborative)...

[INFO] movie_data shape: (32000204, 33)
[INFO] tfidf_reduced shape: (87585, 51)
[DONE] Saved hybrid_user_movie_info.csv to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\preprocess\hybrid_user_movie_info.csv
[INFO] Shape: (32000204, 7)
[INFO] Columns: ['userId', 'movieId', 'rating', 'year_norm', 'year_rated', 'month_rated', 'day_rated']

[INFO] Note:
  - movies_tfidf_reduced.csv = content features of each movie (used in Content Model)
  - hybrid_user_movie_info.csv = user-movie interactions (used in Collaborative Model)

[DONE] Lightweight hybrid dataset ready for modeling


Unnamed: 0,userId,movieId,rating,year_norm,year_rated,month_rated,day_rated
9758768,60974,88744,3.5,0.919463,2017,4,17
22801473,142765,2858,5.0,0.838926,1999,11,30
24492786,153599,71156,3.0,0.90604,2016,12,2
12981045,81241,6,3.0,0.812081,2005,3,23
20829879,130465,1639,5.0,0.825503,2000,11,9


## Summary and Validation Output

In [7]:
# Final recap: shapes of the loaded and derived tables, then the list of files
# this notebook wrote.
print("\n[SUMMARY] Data Pre-processing Complete\n")

shape_report = [
    f"[INFO] Movies: {movies.shape}",
    f"[INFO] Ratings: {ratings.shape}",
    f"[INFO] Tags: {tags.shape}",
    f"[INFO] Links: {links.shape}",
    f"[INFO] Hybrid dataset: {hybrid_info.shape}",
    f"[INFO] Sparse matrix: {user_movie_sparse.shape}",
]
print("\n".join(shape_report))

generated_files = [
    " ├─ movies_tfidf_features.csv",
    " ├─ movies_tfidf_reduced.csv",
    " ├─ user_movie_matrix_sparse.npz",
    " ├─ user_mapping.csv",
    " ├─ movie_mapping.csv",
    " └─ hybrid_user_movie_info.csv",
]
print("\n[FILES GENERATED]")
print("\n".join(generated_files))

print("\n[DONE] Pre-processing ready for EDA & Modeling stage")


[SUMMARY] Data Pre-processing Complete

[INFO] Movies: (87585, 23)
[INFO] Ratings: (32000204, 5)
[INFO] Tags: (2000055, 5)
[INFO] Links: (87585, 3)
[INFO] Hybrid dataset: (32000204, 7)
[INFO] Sparse matrix: (200948, 84432)

[FILES GENERATED]
 ├─ movies_tfidf_features.csv
 ├─ movies_tfidf_reduced.csv
 ├─ user_movie_matrix_sparse.npz
 ├─ user_mapping.csv
 ├─ movie_mapping.csv
 └─ hybrid_user_movie_info.csv

[DONE] Pre-processing ready for EDA & Modeling stage
