In [1]:
import pandas as pd       # data manipulation
import numpy as np        # numerical operations
import re                 # regular expressions
from pathlib import Path  # handle filesystem paths

# Read an Excel workbook into a DataFrame and report its dimensions
def load_excel(path: Path, filename: str):
    """Load ``filename`` located under ``path`` and return it as a DataFrame.

    Args:
        path: Directory that contains the workbook.
        filename: Name of the Excel file inside ``path``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` using the openpyxl engine.

    Raises:
        FileNotFoundError: If ``path / filename`` does not exist.
    """
    workbook = path / filename
    if workbook.exists():
        frame = pd.read_excel(workbook, engine="openpyxl")
        n_rows, n_cols = frame.shape
        # One-line summary so the notebook shows what was loaded
        print(f"Loaded {filename}: {n_rows} rows, {n_cols} cols")
        return frame
    raise FileNotFoundError(f"File not found: {workbook}")

# Turn a comma-separated cell value into a list of trimmed, non-empty tokens
def split_and_strip(s):
    """Split ``s`` on commas and return the stripped, non-empty pieces.

    Missing values (as detected by ``pd.isna``) and values whose string form
    is blank yield an empty list. Non-string values are converted with
    ``str`` before splitting.
    """
    if pd.isna(s):
        return []
    trimmed = (chunk.strip() for chunk in str(s).split(','))
    # Blank or comma-only input leaves only empty pieces, which are dropped
    return [piece for piece in trimmed if piece]

# Folder that holds the project workbook
base_path = Path(r"C:\Users\dbust\OneDrive\Documentos\Amsterdam_2025\DDBM\Database_Management\Project_DBM")

# Load the movies workbook
df = load_excel(base_path, "metaClean43Brightspace.xlsx")

# Keep only the columns relevant to the genre table (absent ones are skipped by filter)
df_genre = df.filter(items=['title', 'genre'])

# Replace each comma-separated genre string with a list of genre names
if 'genre' in df_genre.columns:
    df_genre['genre'] = df_genre['genre'].apply(split_and_strip)

# ---- Create normalized genre table ----

# One row per (movie, genre) -> drop missing entries (empty lists explode to NaN)
# -> keep the first occurrence of each genre -> renumber rows from 0
norm_genre = (
    df_genre[['genre']]
    .explode('genre')
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Attach a 1-based surrogate key and put it in front of the genre name
norm_genre = norm_genre.assign(
    genre_id=lambda table: range(1, len(table) + 1)
)[['genre_id', 'genre']]

# Sanity check: number of distinct genres and a preview of the table
print("Unique genres:", len(norm_genre))
print(norm_genre.head(20))


Loaded metaClean43Brightspace.xlsx: 11364 rows, 13 cols
Unique genres: 27
    genre_id        genre
0          1  Documentary
1          2       Action
2          3       Sci-Fi
3          4        Drama
4          5      Mystery
5          6     Thriller
6          7       Horror
7          8       Comedy
8          9      Romance
9         10    Adventure
10        11      Fantasy
11        12       Family
12        13        Crime
13        14          War
14        15      History
15        16        Sport
16        17    Biography
17        18        Music
18        19         News
19        20      Musical
