<a href="https://colab.research.google.com/github/DvAzevedo/Hybrid_Recommendation_System/blob/main/recsys_datacleaning_eduknow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Runtime & Paths

In [None]:

# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

!pip -q install rdflib SPARQLWrapper diskcache tqdm

# Project folders on the mounted Drive; DATA_DIR and CACHE_DIR are plain strings here.
PROJ_ROOT = "/content/drive/MyDrive/SemanticRec"
DATA_DIR  = f"{PROJ_ROOT}/Data"
CACHE_DIR = f"{PROJ_ROOT}/Cache"

# Create both folders (and any missing parents); no error if they already exist.
from pathlib import Path
for p in (DATA_DIR, CACHE_DIR):
    Path(p).mkdir(parents=True, exist_ok=True)

print("DATA_DIR  =", DATA_DIR)
print("CACHE_DIR =", CACHE_DIR)


# Download MovieLens "ml-latest-small"

In [None]:
import urllib.request, zipfile, tempfile, shutil, os, pathlib

# Source archive and the two CSV files that download_movielens() places in DATA_DIR.
ML_URL   = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
ratings_csv = pathlib.Path(DATA_DIR) / "ratings.csv"
movies_csv  = pathlib.Path(DATA_DIR) / "movies.csv"

def download_movielens(url: str = ML_URL):
    """Download the MovieLens archive and copy its CSV files into DATA_DIR.

    When both ``ratings_csv`` and ``movies_csv`` already exist, only a notice
    is printed and nothing is downloaded. Otherwise the zip at ``url`` is
    fetched into a temporary directory, extracted there, and ``ratings.csv``
    and ``movies.csv`` from its ``ml-latest-small`` folder are copied to
    their target paths.
    """
    already_present = ratings_csv.exists() and movies_csv.exists()
    if already_present:
        print("✔ MovieLens já presente — pulando download")
        return

    print("⬇Baixando MovieLens…")
    with tempfile.TemporaryDirectory() as scratch:
        archive = f"{scratch}/ml.zip"
        urllib.request.urlretrieve(url, archive)

        with zipfile.ZipFile(archive) as bundle:
            bundle.extractall(scratch)

        extracted = pathlib.Path(scratch) / "ml-latest-small"
        # Copy in the same order as before: ratings first, then movies.
        for file_name, target in (("ratings.csv", ratings_csv),
                                  ("movies.csv", movies_csv)):
            shutil.copy(extracted / file_name, target)

    print("Arquivos copiados para", DATA_DIR)

# Fetch the dataset now (prints a notice and returns early if both CSV files already exist).
download_movielens()

# Utility Functions

In [None]:
import unicodedata, re
import pandas as pd

def slugify(text: str) -> str:
    """Turn free text into an ASCII, lowercase, hyphen-separated slug.

    Example: 'Amélie Poulain' -> 'amelie-poulain'.
    Accents are removed via NFKD decomposition, characters with no ASCII
    form are dropped, and every run of non-alphanumeric characters becomes
    a single hyphen (none at the start or end).
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_lower = decomposed.encode("ascii", "ignore").decode().lower()
    # Joining the alphanumeric runs yields exactly one hyphen between words
    # and never a leading or trailing one.
    words = re.findall(r"[a-z0-9]+", ascii_lower)
    return "-".join(words)

def clean_year(raw) -> str:
    """Normalize a year given as a string or float into an integer string.

    The value is parsed as a float and truncated to an int, so '1999.0'
    becomes '1999'.

    Returns:
        The year as a string, or "" when ``raw`` cannot be converted
        (None, non-numeric text, NaN, or an infinite value).
    """
    try:
        return str(int(float(raw)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: int() of +/-inf ("inf", "1e999").
        return ""

# Quick sanity check: print one sample result of each helper.
print("slugify('Wall-E (2008)')  →", slugify("Wall-E (2008)"))
print("clean_year('1999.0')      →", clean_year("1999.0"))

# Process movies.csv

In [None]:

import pandas as pd, re

# Namespace prefix prepended to every movie URI generated in build_movies_df().
BASE_NS = "http://semantics.id/ns/movies#"
# Raw input file and the cleaned output file written next to it.
RAW_MOVIES  = movies_csv
CLEAN_MOVIES = RAW_MOVIES.with_name("movies_clean.csv")

# Matches titles that end in ", <article>" for the most common articles
# (case-insensitive).
_TRAILING_ARTICLE_RE = re.compile(
    r"^(?P<base>.+),\s*(?P<art>The|A|An|La|Le|El|Los|Las|O|Os|As)$",
    flags=re.IGNORECASE,
)


def _reorder_article(title: str) -> str:
    """Move a trailing article back to the front of a title.

    Example: 'Shawshank Redemption, The' -> 'The Shawshank Redemption'.
    Titles that do not end in ', <article>' are returned unchanged.
    """
    found = _TRAILING_ARTICLE_RE.match(title)
    if found is None:
        return title
    article, base = found.group("art", "base")
    return f"{article} {base}"

def build_movies_df(src_path=RAW_MOVIES, dst_path=CLEAN_MOVIES) -> pd.DataFrame:
    """Read the raw movies CSV, derive Titulo / Ano / movie_uri, and save the result.

    Steps:
      * read ``src_path`` with every column as string;
      * ``Ano``: first four-digit group in parentheses found in ``title``,
        passed through ``clean_year`` ("" when there is none);
      * ``Titulo``: ``title`` without a trailing "(YYYY)", stripped, with a
        trailing article moved to the front by ``_reorder_article``;
      * ``movie_uri``: ``BASE_NS`` + slug of ``Titulo``, plus "-<Ano>" when
        a year is present;
      * write the selected columns to ``dst_path`` and return them.

    NOTE: the URI is built only from title and year, so two rows sharing
    both get the same URI.
    """
    movies = pd.read_csv(src_path, dtype=str)
    raw_titles = movies["title"]

    # Split the year from the title.
    year_groups = raw_titles.str.extract(r"\((\d{4})\)")
    movies["Ano"] = year_groups.iloc[:, 0].apply(clean_year)

    titles_without_year = raw_titles.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
    movies["Titulo"] = titles_without_year.str.strip().apply(_reorder_article)

    def _make_uri(row) -> str:
        # Internal URI: slug of the title, with the year appended when known.
        slug = slugify(row.Titulo)
        if row.Ano:
            return f"{BASE_NS}{slug}-{row.Ano}"
        return f"{BASE_NS}{slug}"

    movies["movie_uri"] = movies.apply(_make_uri, axis=1)

    # Save only the columns of interest, in this order.
    selected_cols = ["movieId", "Titulo", "Ano", "movie_uri", "genres"]
    cleaned = movies[selected_cols]
    cleaned.to_csv(dst_path, index=False)
    print(f"movies_clean.csv salvo em {dst_path}  ({len(movies):,} linhas)")
    return cleaned

# Run: build the cleaned movies table and show its first rows.
movies_df = build_movies_df()
movies_df.head()

# Process ratings.csv

In [None]:
from pathlib import Path
import pandas as pd
import unicodedata, re

# NOTE: DATA_DIR is rebound here as a Path with the folder hard-coded
# (it was defined earlier as a string built from PROJ_ROOT).
DATA_DIR = Path("/content/drive/MyDrive/SemanticRec/Data")
RAW_RATINGS = DATA_DIR / "ratings.csv"
MOVIES_CLEAN = DATA_DIR / "movies_clean.csv"
RATINGS_CLEAN = DATA_DIR / "ratings_clean.csv"

# Load the raw ratings and the cleaned movies table.
rat = pd.read_csv(RAW_RATINGS, dtype={"userId":int,"movieId":int,"rating":float})
mov = pd.read_csv(MOVIES_CLEAN, dtype={"movieId":int,"Titulo":str})

# Join on movieId (a safe key); the inner join drops ratings whose movieId
# is not present in movies_clean.csv.
df = rat.merge(mov[["movieId","Titulo"]], on="movieId", how="inner")

# Keep only the columns the CF notebook expects.
df = df[["userId","Titulo","rating"]]

# Row counts before/after the join, plus distinct users and distinct titles.
print("Linhas antes :", len(rat))
print("Linhas depois:", len(df))
print("Usuários     :", df["userId"].nunique())
print("Filmes       :", df["Titulo"].nunique())

# Save.
df.to_csv(RATINGS_CLEAN, index=False)
print("ratings_clean.csv gravado em", RATINGS_CLEAN)

# Get Directors


In [None]:
import requests, time, json, random, pathlib, pickle
from typing import List, Tuple
import pandas as pd

# OMDb API keys; enrich_with_director() rotates through them.
OMDB_KEYS: List[str] = [
    "", "", "", ""  # Put your API keys here
]

# Pickle file where the director lookups are persisted.
CACHE_FILE = pathlib.Path(CACHE_DIR) / "director_cache.pkl"


# cache
def _load_cache() -> dict[Tuple[str, str], str]:
    """Return the cache stored in CACHE_FILE, or an empty dict if the file is missing.

    SECURITY NOTE: ``pickle.load`` can execute arbitrary code, so CACHE_FILE
    must only ever be a file written by this notebook.
    """
    if not CACHE_FILE.exists():
        return {}
    with CACHE_FILE.open("rb") as cache_fh:
        return pickle.load(cache_fh)

def _save_cache(cache: dict):
    """Persist ``cache`` to CACHE_FILE as a pickle.

    The data is first written to a sibling ``.tmp`` file, which is then
    renamed over CACHE_FILE. An interruption while writing therefore leaves
    the previous cache file untouched instead of a truncated pickle that
    would fail to load on the next run.
    """
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with tmp_file.open("wb") as fh:
        pickle.dump(cache, fh)
    # Swap the fully written file into place.
    tmp_file.replace(CACHE_FILE)


# consulta
def fetch_director_omdb(title: str, year: str, api_key: str) -> str:
    """
    Query OMDb by title (and year, when given) and return the 'Director' field.

    Args:
        title: movie title, sent as the ``t`` parameter.
        year: release year, sent as the ``y`` parameter; omitted when empty.
        api_key: OMDb API key, sent as the ``apikey`` parameter.

    Returns:
        The director string, or "" on any failure: network/JSON error,
        a response whose 'Response' is not "True", or a missing / "N/A"
        director.
    """
    # HTTPS so the API key is not sent in clear text.
    url = "https://www.omdbapi.com/"
    params = {"t": title, "apikey": api_key}
    if year:
        # Only constrain by year when one is known (avoids sending "y=").
        params["y"] = year

    try:
        r = requests.get(url, params=params, timeout=5)
        data = r.json()
    except Exception:
        # Deliberate best-effort lookup: any request or decoding error
        # counts as "no director found".
        return ""

    if not isinstance(data, dict) or data.get("Response") != "True":
        return ""

    director = data.get("Director", "")
    if not isinstance(director, str) or director == "N/A":
        return ""
    return director


#  loop que pega o diretor da omdb
def enrich_with_director(df_movies: pd.DataFrame,
                         api_keys: List[str] = OMDB_KEYS,
                         sleep_sec: float = 1.0) -> pd.DataFrame:
    """Return a copy of ``df_movies`` with a 'Diretor' column filled from OMDb.

    For every row whose 'Diretor' is still empty, the director is taken from
    the persistent cache when available; otherwise OMDb is queried. If the
    query returns nothing, the next API key in the rotation is tried once.
    The result is stored in the DataFrame and in the cache, the cache is
    written to disk, and the loop sleeps ``sleep_sec`` seconds.

    Args:
        df_movies: table with 'Titulo' and 'Ano' columns (not modified).
        api_keys: OMDb keys, used in rotation; must not be empty.
        sleep_sec: pause after each row that required an OMDb query.

    Returns:
        The enriched copy of ``df_movies``.

    Raises:
        ValueError: if ``api_keys`` is empty.

    NOTE: empty results are cached as well, so a title whose lookup failed
    is not queried again on later runs while its cache entry exists.
    """
    from itertools import cycle  # local import keeps this cell self-contained

    df = df_movies.copy()
    if "Diretor" not in df.columns:
        df["Diretor"] = ""

    cache = _load_cache()

    # Endless rotation over the keys, starting with the first one.
    keys_cycle = cycle(api_keys)
    try:
        key = next(keys_cycle)
    except StopIteration:
        raise ValueError("api_keys must contain at least one OMDb API key") from None

    for idx, row in df.iterrows():
        if df.at[idx, "Diretor"]:                     # already filled
            continue

        title, year = row["Titulo"], row["Ano"]
        cache_key = (title.lower(), year)

        if cache_key in cache:                        # cache hit
            df.at[idx, "Diretor"] = cache[cache_key]
            continue

        # Query OMDb; on an empty answer switch to the next key and retry once.
        director = fetch_director_omdb(title, year, key)
        if not director:
            key = next(keys_cycle)
            director = fetch_director_omdb(title, year, key)

        # Store the result in the cache and the DataFrame.
        cache[cache_key] = director
        df.at[idx, "Diretor"] = director
        print(f"{idx:>5}: {title} ({year}) → {director}")

        _save_cache(cache)                            # flush on every step
        time.sleep(sleep_sec)

    return df


# ---------- run and save ---------------------------------------------
DIRECTOR_CSV = pathlib.Path(DATA_DIR) / "movies_diretor.csv"

movies_with_dir = enrich_with_director(movies_df)
movies_with_dir.to_csv(DIRECTOR_CSV, index=False)
# Print the actual file name (the old message said "movies_director.csv",
# which does not match the file that is written).
print(f" {DIRECTOR_CSV.name} salvo em {DIRECTOR_CSV}")