## Import Library & Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import re

BASE_PATH = r"C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data"
RAW_PATH = os.path.join(BASE_PATH, "raw")

PROCESSED_PATH = os.path.join(BASE_PATH, "processed")
DATASETS_PATH = os.path.join(PROCESSED_PATH, "datasets")
CLEANED_PATH = os.path.join(PROCESSED_PATH, "cleaned")

os.makedirs(PROCESSED_PATH, exist_ok=True)
os.makedirs(DATASETS_PATH, exist_ok=True)
os.makedirs(CLEANED_PATH, exist_ok=True)

paths_to_check = {
    "data": BASE_PATH,
    "raw": RAW_PATH,
    "processed": PROCESSED_PATH,
    "datasets": DATASETS_PATH,
    "cleaned": CLEANED_PATH
}

print("[RUN] Checking directories...\n")
for path_name, path in paths_to_check.items():
    if os.path.exists(path):
        print(f"[INFO] {path_name}")
    else:
        print(f"[ERROR] {path_name}")

missing = [name for name, path in paths_to_check.items() if not os.path.exists(path)]
if missing:
    print("\n[ERROR] Missing directories:", ", ".join(missing))
else:
    print("\n[DONE] All directories found successfully!")

[RUN] Checking directories...

[INFO] data
[INFO] raw
[INFO] processed
[INFO] datasets
[INFO] cleaned

[DONE] All directories found successfully!


## Check Dataset

In [2]:
files = ["movies.csv", "ratings.csv", "tags.csv", "links.csv"]

print("[RUN] Loading datasets files...\n")
missing_files = []

for file_name in files:
    file_path = os.path.join(RAW_PATH, file_name)
    if os.path.exists(file_path):
        print(f"[INFO] Found: {file_name}")
    else:
        print(f"[ERROR] Not found: {file_name}")
        missing_files.append(file_name)

if missing_files:
    print("\n[ERROR] Missing files:", ", ".join(missing_files))
else:
    print("\n[DONE] All files loaded successfully!")

movies = pd.read_csv(os.path.join(RAW_PATH, "movies.csv")) if os.path.exists(os.path.join(RAW_PATH, "movies.csv")) else None
ratings = pd.read_csv(os.path.join(RAW_PATH, "ratings.csv")) if os.path.exists(os.path.join(RAW_PATH, "ratings.csv")) else None
tags = pd.read_csv(os.path.join(RAW_PATH, "tags.csv")) if os.path.exists(os.path.join(RAW_PATH, "tags.csv")) else None
links = pd.read_csv(os.path.join(RAW_PATH, "links.csv")) if os.path.exists(os.path.join(RAW_PATH, "links.csv")) else None

display(movies.head())
display(ratings.head())
display(tags.head())
display(links.head())

[RUN] Checking datasets files...

[INFO] Found: movies.csv
[INFO] Found: ratings.csv
[INFO] Found: tags.csv
[INFO] Found: links.csv

[DONE] All dataset files found successfully!


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


## Data Info & Quality Check

In [3]:
for name, df in zip(["movies", "ratings", "tags", "links"], [movies, ratings, tags, links]):
    print(f"\n[DEBUG] {name.upper()}")
    if df is not None:
        df.info()
        print("\n[WARNING] Missing values per column:")
        print(df.isnull().sum())
        print("\n[WARNING] Duplicated rows:", df.duplicated().sum())
    else:
        print(f"[WARNING] {name} DataFrame is missing or not loaded.")


[DEBUG] MOVIES
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB

movieId    0
title      0
genres     0
dtype: int64


[DEBUG] RATINGS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 MB

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


[DEBUG] TAGS
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000072 entries, 0 to 2000071
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId  

## Data Cleaning

In [4]:
movies.drop_duplicates(inplace=True)
ratings.drop_duplicates(inplace=True)
tags.drop_duplicates(inplace=True)
links.drop_duplicates(inplace=True)

movies.dropna(subset=['movieId', 'title'], inplace=True)
ratings.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)
tags.dropna(subset=['userId', 'movieId'], inplace=True)
links.dropna(subset=['movieId'], inplace=True)

links[['movieId', 'imdbId', 'tmdbId']] = links[['movieId', 'imdbId', 'tmdbId']].astype('Int64')
ratings[['userId', 'movieId']] = ratings[['userId', 'movieId']].astype('Int64')

for name, df in zip(["movies", "ratings", "tags", "links"], [movies, ratings, tags, links]):
    print(f"\n[INFO] {name.upper()} AFTER CLEANING")
    print("Shape:", df.shape)
    print("Missing values:\n", df.isnull().sum())
    print("Duplicated rows:", df.duplicated().sum())


for name, df in zip(["movies", "ratings", "tags", "links"], [movies, ratings, tags, links]):
    if df is not None:
        output_path = os.path.join(DATASETS_PATH, f"{name}_cleaned.csv")
        df.to_csv(output_path, index=False)
        print(f"[DONE] Exported {name}_cleaned.csv")


[INFO] MOVIES AFTER CLEANING
Shape: (87585, 3)
Missing values:
 movieId    0
title      0
genres     0
dtype: int64
Duplicated rows: 0

[INFO] RATINGS AFTER CLEANING
Shape: (32000204, 4)
Missing values:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Duplicated rows: 0

[INFO] TAGS AFTER CLEANING
Shape: (2000072, 4)
Missing values:
 userId        0
movieId       0
tag          17
timestamp     0
dtype: int64
Duplicated rows: 0

[INFO] LINKS AFTER CLEANING
Shape: (87585, 3)
Missing values:
 movieId      0
imdbId       0
tmdbId     124
dtype: int64
Duplicated rows: 0
[DONE] Exported movies_cleaned.csv
[DONE] Exported ratings_cleaned.csv
[DONE] Exported tags_cleaned.csv
[DONE] Exported links_cleaned.csv


## Extract Movie Year

In [5]:
movies_clean = pd.read_csv(os.path.join(DATASETS_PATH, "movies_cleaned.csv"))

print("\n[RUN] MOVIES_CLEAN BEFORE YEAR EXTRACTION")
print("[INFO] Shape:", movies_clean.shape)
print("[WARNING] Missing values per column:\n", movies_clean.isnull().sum())
print("[WARNING] Duplicated rows:", movies_clean.duplicated().sum())

movies_clean['year'] = movies_clean['title'].str.extract(r'\((\d{4})\)$')
movies_clean['title'] = movies_clean['title'].str.replace(r'\s*\(\d{4}\)$', '', regex=True)
movies_clean['year'] = pd.to_numeric(movies_clean['year'], errors='coerce')
movies_clean['year'] = movies_clean['year'].fillna("Unknown")

print("\n[RUN] MOVIES_CLEAN AFTER YEAR EXTRACTION")
print("[INFO] Shape:", movies_clean.shape)
print("[WARNING] Missing values per column:\n", movies_clean.isnull().sum())
print("[WARNING] Duplicated rows:", movies_clean.duplicated().sum())

output_path_y = os.path.join(DATASETS_PATH, "movies_cleaned_y.csv")
movies_clean.to_csv(output_path_y, index=False)
print(f"\n[DONE] Saved movies_clean y to: {output_path_y}")

display(movies_clean.sample(5))


[RUN] MOVIES_CLEAN BEFORE YEAR EXTRACTION
[INFO] Shape: (87585, 3)
 movieId    0
title      0
genres     0
dtype: int64

[RUN] MOVIES_CLEAN AFTER YEAR EXTRACTION
[INFO] Shape: (87585, 4)
 movieId    0
title      0
genres     0
year       0
dtype: int64

[DONE] Saved movies_clean y to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\datasets\movies_cleaned_y.csv


Unnamed: 0,movieId,title,genres,year
70808,228053,Home for Harvest,Romance,2019.0
74451,243650,Fabulous,Comedy,2019.0
30647,136377,Tigers in Lipstick,Comedy,1979.0
61047,202345,Cyborg 2087,Drama|Sci-Fi,1966.0
4418,4522,Masquerade,Mystery|Romance|Thriller,1988.0


## Transform Genres from String to List and One-Hot Encoding

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

movies_clean_y = pd.read_csv(os.path.join(DATASETS_PATH, "movies_cleaned_y.csv"))

print("\n[RUN] MOVIES_CLEAN_Y")
print("[INFO] Shape:", movies_clean_y.shape)
print("[WARNING] Missing values per column:\n", movies_clean_y.isnull().sum())
print("[WARNING] Duplicated rows:", movies_clean_y.duplicated().sum())
print("")

movies_cleaned_g = movies_clean_y.copy()
movies_cleaned_g['genres'] = movies_cleaned_g['genres'].apply(
    lambda x: x.split('|') if isinstance(x, str) else []
)

output_path_g = os.path.join(DATASETS_PATH, "movies_cleaned_g.csv")
movies_cleaned_g.to_csv(output_path_g, index=False)
print(f"[DONE] Saved movies_clean g to: {output_path_g}")

mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(
    mlb.fit_transform(movies_cleaned_g['genres']),
    columns=mlb.classes_,
    index=movies_cleaned_g.index
)

movies_cleaned_f = pd.concat([movies_cleaned_g.drop(columns=['genres']), genres_encoded], axis=1)

output_path_f = os.path.join(CLEANED_PATH, "movies_cleaned_f.csv")
movies_cleaned_f.to_csv(output_path_f, index=False)
print(f"[DONE] Saved movies_clean f to: {output_path_f}")

display(movies_cleaned_g.sample(5))
display(movies_cleaned_f.sample(5))


[RUN] MOVIES_CLEAN_Y
[INFO] Shape: (87585, 4)
 movieId    0
title      1
genres     0
year       0
dtype: int64

[DONE] Saved movies_clean g to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\datasets\movies_cleaned_g.csv
[DONE] Saved movies_clean f to: C:\Users\Irifoxet\Documents\Jupyter\Predict Movie\data\processed\cleaned\movies_cleaned_f.csv


Unnamed: 0,movieId,title,genres,year
37233,151479,The Flash 2 - Revenge of the Trickster,"[Action, Fantasy, Sci-Fi]",1991.0
48773,176235,Pooh's Heffalump Halloween Movie,"[Animation, Children]",2005.0
69220,222993,Bheeshma,"[Drama, Romance]",2020.0
72426,234562,Young Lady Chatterley II,[Drama],1985.0
16327,86399,Movie Days (Bíódagar),"[Comedy, Drama]",1994.0


Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
7569,7979,Monterey Pop,1968.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4066,4169,Blow Dry (a.k.a. Never Better),2001.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
33398,142519,Hunter × Hunter: The Last Mission,2013.0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3257,3350,"Raisin in the Sun, A",1961.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20650,106734,Marfa Girl,2012.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Clean Ratings and Tags Data

In [7]:
ratings = pd.read_csv(os.path.join(DATASETS_PATH, "ratings_cleaned.csv"))
tags = pd.read_csv(os.path.join(DATASETS_PATH, "tags_cleaned.csv"))

for name, df in zip(["ratings", "tags"], [ratings, tags]):
    print(f"\n[RUN] {name.upper()} BEFORE CLEANING")
    print("[INFO] Shape:", df.shape)
    print("[WARNING] Missing values:\n", df.isnull().sum())
    print("[WARNING] Duplicated rows:", df.duplicated().sum())

ratings = ratings[(ratings['rating'] >= 0) & (ratings['rating'] <= 5)]
ratings['rating'] = (ratings['rating'] * 2).round() / 2.0
ratings['rating'] = ratings['rating'].astype(float)

ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s', errors='coerce')
tags['datetime'] = pd.to_datetime(tags['timestamp'], unit='s', errors='coerce')

tags.dropna(subset=['tag'], inplace=True)

for name, df in zip(["ratings", "tags"], [ratings, tags]):
    print(f"\n[RUN] {name.upper()} AFTER CLEANING")
    print("[INFO] Shape:", df.shape)
    print("[WARNING] Missing values:\n", df.isnull().sum())
    print("[WARNING] Duplicated rows:", df.duplicated().sum())

ratings.to_csv(os.path.join(CLEANED_PATH, "ratings_cleaned_f.csv"), index=False)
tags.to_csv(os.path.join(CLEANED_PATH, "tags_cleaned_f.csv"), index=False)
print("[DONE] Saved ratings_cleaned_f.csv & tags_cleaned_f.csv")

print("Unique rating values:", sorted(ratings['rating'].unique()))


[RUN] RATINGS BEFORE CLEANING
[INFO] Shape: (32000204, 4)
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

[RUN] TAGS BEFORE CLEANING
[INFO] Shape: (2000072, 4)
 userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

[RUN] RATINGS AFTER CLEANING
[INFO] Shape: (32000204, 5)
 userId       0
movieId      0
rating       0
timestamp    0
datetime     0
dtype: int64

[RUN] TAGS AFTER CLEANING
[INFO] Shape: (2000055, 5)
 userId       0
movieId      0
tag          0
timestamp    0
datetime     0
dtype: int64
[DONE] Saved ratings_cleaned_f.csv & tags_cleaned_f.csv
Unique rating values: [np.float64(0.5), np.float64(1.0), np.float64(1.5), np.float64(2.0), np.float64(2.5), np.float64(3.0), np.float64(3.5), np.float64(4.0), np.float64(4.5), np.float64(5.0)]


## Validate movieId Consistency and Explore Ratings

In [8]:
movies = pd.read_csv(os.path.join(CLEANED_PATH, "movies_cleaned_f.csv"))
ratings = pd.read_csv(os.path.join(CLEANED_PATH, "ratings_cleaned_f.csv"))
tags = pd.read_csv(os.path.join(CLEANED_PATH, "tags_cleaned_f.csv"))
links = pd.read_csv(os.path.join(DATASETS_PATH, "links_cleaned.csv"))

valid_movie_ids = set(movies['movieId'])
ratings = ratings[ratings['movieId'].isin(valid_movie_ids)]
tags = tags[tags['movieId'].isin(valid_movie_ids)]
links = links[links['movieId'].isin(valid_movie_ids)]

print(f"[INFO] Valid movieId count: {len(valid_movie_ids)}")
print("[DONE] Filtered all datasets for valid movieId")

num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()
avg_ratings_per_user = ratings.groupby('userId')['rating'].count().mean()
avg_ratings_per_movie = ratings.groupby('movieId')['rating'].count().mean()

print("\n[STATISTICS]")
print(f"[INFO] Unique users: {num_users}")
print(f"[INFO] Unique movies: {num_movies}")
print(f"[INFO] Average ratings per user: {avg_ratings_per_user:.2f}")
print(f"[INFO] Average ratings per movie: {avg_ratings_per_movie:.2f}")

[INFO] Valid movieId count: 87585
[DONE] Filtered all datasets for valid movieId

[STATISTICS]
[INFO] Unique users: 200948
[INFO] Unique movies: 84432
[INFO] Average ratings per user: 159.25
[INFO] Average ratings per movie: 379.01
