**Imports and Utility Functions**

In [None]:
import pandas as pd       # data manipulation
import numpy as np        # numerical operations
import re                 # regular expressions
from pathlib import Path  # handle filesystem paths

def load_excel(path: Path, filename: str):
    """Read the Excel workbook ``path / filename`` into a DataFrame.

    After loading, a one-line summary with the row and column counts is
    printed.

    Parameters
    ----------
    path : Path
        Directory that contains the workbook.
    filename : str
        Name of the workbook inside ``path``.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_excel`` returns for the file (openpyxl engine).

    Raises
    ------
    FileNotFoundError
        If ``path / filename`` does not exist.
    """
    workbook = path / filename
    if workbook.exists():
        frame = pd.read_excel(workbook, engine="openpyxl")
        n_rows, n_cols = frame.shape
        print(f"Loaded {filename}: {n_rows} rows, {n_cols} cols")
        return frame
    # Fail early with the full path so the missing file is easy to locate.
    raise FileNotFoundError(f"File not found: {workbook}")

def _apply_text_step(s: pd.Series, step, col) -> pd.Series:
    """Apply one transformation step to a string Series and return the result."""
    if isinstance(step, str):
        # "strip", "lower", "upper", "title" and every other zero-argument
        # Series.str method are resolved through the same attribute lookup.
        if not hasattr(s.str, step):
            raise ValueError(f"Unknown step '{step}' for column {col}")
        return getattr(s.str, step)()

    if isinstance(step, (tuple, list)):
        # An empty sequence used to raise IndexError on step[0]; report it
        # like any other unsupported step instead.
        if not step or step[0] != "replace":
            raise ValueError("Unsupported transformation step: " + repr(step))
        if len(step) < 3:
            raise ValueError(
                "Replace step must be ('replace', pattern, replacement[, regex]); got "
                + repr(step)
            )
        _, pat, repl, *rest = step
        regex = rest[0] if rest else False   # literal replacement unless asked otherwise
        return s.str.replace(pat, repl, regex=regex)

    if callable(step):
        # Best effort: try the callable on the whole Series first. If it
        # raises or does not return something Series-like, treat it as an
        # element-wise function.
        try:
            res = step(s)
            if isinstance(res, (pd.Series, np.ndarray, list)):
                return pd.Series(res, index=s.index)
        except Exception:
            pass
        return s.apply(step)

    raise ValueError("Unsupported transformation step: " + repr(step))


def normalize_columns(df: pd.DataFrame, transformations: dict):
    """Run an ordered list of text transformations over selected columns.

    Every column named in ``transformations`` that exists in ``df`` is cast to
    the pandas ``"string"`` dtype, passed through its steps in order and
    written back. ``df`` is modified in place and also returned. Columns that
    are not present in ``df`` are skipped silently.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are normalized.
    transformations : dict
        Maps column name -> list of steps. A step is one of:

        * a ``str`` naming a zero-argument ``Series.str`` method
          (``"strip"``, ``"lower"``, ``"upper"``, ``"title"``, ...);
        * a tuple/list ``("replace", pattern, replacement[, regex])`` where
          ``regex`` defaults to ``False``;
        * a callable, tried on the whole Series first and applied
          element-wise if that fails or returns a scalar.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with the listed columns replaced.

    Raises
    ------
    ValueError
        For an unknown string step, an empty or malformed tuple/list step, or
        a step of an unsupported type.
    """
    for col, steps in transformations.items():
        if col not in df.columns:
            continue
        s = df[col].astype("string")
        for step in steps:
            s = _apply_text_step(s, step, col)
        df[col] = s
    return df

def split_and_strip(s):
    """Turn a comma-separated value into a list of trimmed, non-empty strings.

    Parameters
    ----------
    s : object
        Usually a string such as ``"a, b ,c"``. Missing values (``None``,
        ``NaN``, ``pd.NA``) and blank strings give ``[]``. A list or tuple is
        accepted as already split, so running the function a second time on a
        processed column does not fail; its items are trimmed and empty or
        ``None`` items are dropped.

    Returns
    -------
    list[str]
        The cleaned items in their original order.
    """
    if isinstance(s, (list, tuple)):
        # pd.isna() on a sequence returns an array, which cannot be used in a
        # boolean test, so already-split values are handled before that call.
        candidates = [str(item) for item in s if item is not None]
    elif pd.isna(s):
        return []
    else:
        candidates = str(s).split(',')
    trimmed = (part.strip() for part in candidates)
    return [part for part in trimmed if part]

def make_slug(series: pd.Series) -> pd.Series:
    """Derive a comparable title slug from the last path segment of each URL.

    The same function is used for the movies and the sales tables so that the
    resulting slugs can be compared with each other.

    Steps, in order: fill missing values with ``''``, trim and lower-case the
    URL, drop any ``?query`` or ``#fragment`` suffix, take the last path
    segment, turn dashes into spaces, remove parenthesised parts such as
    ``(2000)``, collapse runs of whitespace, trim and title-case.

    Parameters
    ----------
    series : pd.Series
        URLs; missing values are allowed.

    Returns
    -------
    pd.Series
        Slugs aligned with ``series``. Entries with no usable path segment
        (missing URL, no ``/``, or nothing left after cleaning) are ``NaN``.
    """
    urls = series.fillna('').astype(str).str.strip().str.lower()
    # Strip the query string and the fragment; otherwise "#tab=summary"
    # style suffixes would end up inside the slug.
    urls = urls.str.replace(r'[?#].*$', '', regex=True)
    last_segment = urls.str.extract(r'/([^/]+)/?$')[0]   # NaN where nothing matches
    slug = (
        last_segment.str.replace('-', ' ', regex=False)   # dashes -> spaces
        .str.replace(r'\(.*?\)', '', regex=True)          # drop "(2000)"-style parts
        .str.replace(r'\s+', ' ', regex=True)             # no double spaces left behind
        .str.strip()
        .str.title()
    )
    return slug.replace({'': np.nan})


**Process Movies**

In [20]:
# NOTE(review): absolute, machine-specific data directory. The cell only runs
# as-is on a machine that has this folder; point base_path elsewhere if needed.
base_path = Path(r"C:\Users\dbust\OneDrive\Documentos\Amsterdam_2025\DDBM\Database_Management\Project_DBM")

df = load_excel(base_path, "metaClean43Brightspace.xlsx")  # load movies data

# Work on a copy so the raw frame `df` stays untouched; 'summary' is dropped
# when present and ignored otherwise.
df_clean = df.drop(columns=['summary'], errors='ignore').copy()

# normalize text columns (columns missing from the frame are skipped)
df_clean = normalize_columns(df_clean, {
    "title": ["strip", "title"],
    "studio": ["strip", "title"],
    "rating": ["strip", "upper"]   # the leading "| " is removed in the next step
})

# remove leading "| " in rating
if 'rating' in df_clean.columns:
    df_clean['rating'] = df_clean['rating'].str.replace(r'^\|\s*', '', regex=True)

# create movie_id: 1..n in release-date order when 'RelDate' exists,
# otherwise in the current row order
if 'RelDate' in df_clean.columns:
    df_clean = df_clean.sort_values('RelDate').reset_index(drop=True)
else:
    df_clean = df_clean.reset_index(drop=True)
df_clean['movie_id'] = range(1, len(df_clean) + 1)

# convert comma-separated cast and genre columns into lists
for list_col in ('cast', 'genre'):
    if list_col in df_clean.columns:
        df_clean[list_col] = df_clean[list_col].apply(split_and_strip)

# apply unified slug function
if 'url' in df_clean.columns:
    df_clean['slug'] = make_slug(df_clean['url'])
    print("\nMovies - URLs before and after slug extraction:")
    print(df_clean[['url', 'slug']].head(10))

# prints for verification
print("Movies: rows,cols", df_clean.shape)
print("Unique slugs:", df_clean['slug'].nunique(dropna=True) if 'slug' in df_clean else 0)
print("Example titles and ratings:")
# 'title' and 'rating' are treated as optional everywhere above, so only
# preview the ones that exist instead of raising KeyError here.
preview_cols = [c for c in ('title', 'rating') if c in df_clean.columns]
print(df_clean[preview_cols].head(10))

Loaded metaClean43Brightspace.xlsx: 11364 rows, 13 cols

Movies - URLs before and after slug extraction:
                                                 url  \
0     https://www.metacritic.com/movie/fantasia-2000   
1  https://www.metacritic.com/movie/lupin-iii-the...   
2       https://www.metacritic.com/movie/next-friday   
3       https://www.metacritic.com/movie/my-dog-skip   
4         https://www.metacritic.com/movie/supernova   
5       https://www.metacritic.com/movie/down-to-you   
6  https://www.metacritic.com/movie/things-you-ca...   
7     https://www.metacritic.com/movie/the-big-tease   
8           https://www.metacritic.com/movie/the-cup   
9          https://www.metacritic.com/movie/santitos   

                                         slug  
0                               Fantasia 2000  
1          Lupin Iii The Castle Of Cagliostro  
2                                 Next Friday  
3                                 My Dog Skip  
4                                   Su

**Sales Process**

In [None]:

df_sales = load_excel(base_path, "sales_movies.xlsx")  # load sales data

# Clean the header names BEFORE dropping: a header with stray whitespace
# (e.g. "theatre_count ") would otherwise not match the drop list and survive.
df_sales_clean = df_sales.copy()
df_sales_clean.columns = df_sales_clean.columns.str.strip()

# drop unnecessary columns if they exist
to_drop = ['Unnamed: 8','opening_weekend', 'theatre_count','avg run per theatre', 'creative_type']
df_sales_clean = df_sales_clean.drop(columns=to_drop, errors='ignore')

# create sales_id: 1..n in ascending 'year' order
df_sales_clean = df_sales_clean.sort_values('year').reset_index(drop=True)
df_sales_clean['sales_id'] = range(1, len(df_sales_clean) + 1)

# apply unified slug function
if 'url' in df_sales_clean.columns:
    df_sales_clean['slug'] = make_slug(df_sales_clean['url'])
    print("\nSales - URLs before and after slug extraction:")
    print(df_sales_clean[['url', 'slug']].head(10))

# --- Build a one-row-per-slug movie lookup ---
movie_cols = ['movie_id', 'title', 'slug'] if 'title' in df_clean.columns else ['movie_id', 'slug']
# pandas matches null keys with each other in a merge, so movies without a
# slug are excluded; otherwise every slug-less sale would join to them.
movie_lookup = df_clean.loc[df_clean['slug'].notna(), movie_cols]
# Slugs shared by several movies would multiply sales rows in the merge.
# Remember them for the report below and keep the first occurrence (in
# df_clean row order) as the match.
ambiguous_movies = movie_lookup[movie_lookup.duplicated(subset=['slug'], keep=False)]
movie_lookup = movie_lookup.drop_duplicates(subset=['slug'], keep='first')

# --- Merge sales with movies using slug ---
df_sales_merged = df_sales_clean.merge(
    movie_lookup,
    on='slug',
    how='left',              # keep all sales, even if no match
    validate='many_to_one'   # fail loudly if the lookup is not unique per slug
)

# --- Check 1: Non-matches (sales that didn't find a movie_id) ---
non_matches = df_sales_merged[df_sales_merged['movie_id'].isna()]
print(f"\nNumber of non-matching slugs: {len(non_matches)}")
if len(non_matches) > 0:
    print("First 10 non-matching slugs:")
    print(non_matches['slug'].head(10).tolist())
    print(f"Total non-matching slugs: {len(non_matches)}")

# --- Check 2: Duplicates (sales slugs that correspond to multiple movie_ids) ---
dup_sales = ambiguous_movies[ambiguous_movies['slug'].isin(df_sales_clean['slug'])]
print(f"\nNumber of duplicated slugs in merge: {dup_sales['slug'].nunique()}")
if len(dup_sales) > 0:
    print("Duplicated slugs list:")
    print(dup_sales[['slug', 'movie_id']].drop_duplicates())

# assign back movie_id to clean sales table
# The many-to-one left merge keeps exactly one output row per sales row, in
# the same order, so a positional assignment attaches each id to its own row.
assert len(df_sales_merged) == len(df_sales_clean)
df_sales_clean['movie_id'] = df_sales_merged['movie_id'].to_numpy()


Loaded sales_movies.xlsx: 30612 rows, 16 cols

Sales - URLs before and after slug extraction:
                                                 url  \
0  https://www.the-numbers.com/movie/Bakha-Satang...   
1  https://www.the-numbers.com/movie/Looking-for-...   
2      https://www.the-numbers.com/movie/Kurukshetra   
3  https://www.the-numbers.com/movie/Little-Nicky...   
4     https://www.the-numbers.com/movie/Suzhou-River   
5  https://www.the-numbers.com/movie/Possible-Worlds   
6  https://www.the-numbers.com/movie/Me-and-Isaac...   
7    https://www.the-numbers.com/movie/Angels-Ladies   
8  https://www.the-numbers.com/movie/Charlies-Angels   
9  https://www.the-numbers.com/movie/Legend-of-Ba...   

                         slug  
0                Bakha Satang  
1         Looking For An Echo  
2                 Kurukshetra  
3                Little Nicky  
4                Suzhou River  
5             Possible Worlds  
6         Me And Isaac Newton  
7               Angels Ladies  
8