<a href="https://colab.research.google.com/github/DvAzevedo/Hybrid_Recommendation_System/blob/main/recsys_datacleaning_eduknow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Runtime & Paths

In [None]:

# Mount Google Drive so the project folders on Drive are reachable from this Colab runtime.
from google.colab import drive
# force_remount=False: do not force a remount when Drive is already mounted.
drive.mount('/content/drive', force_remount=False)

!pip -q install rdflib SPARQLWrapper diskcache tqdm

# Project layout on Drive: one root folder with a Data and a Cache subfolder.
# DATA_DIR and CACHE_DIR are plain strings here (not Path objects).
PROJ_ROOT = "/content/drive/MyDrive/SemanticRec"
DATA_DIR  = f"{PROJ_ROOT}/Data"
CACHE_DIR = f"{PROJ_ROOT}/Cache"

# Create both folders (including missing parents); existing folders are left alone.
from pathlib import Path
for p in (DATA_DIR, CACHE_DIR):
    Path(p).mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the cell output documents where files will go.
print("üìÅ DATA_DIR  =", DATA_DIR)
print("üìÅ CACHE_DIR =", CACHE_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üìÅ DATA_DIR  = /content/drive/MyDrive/SemanticRec/Data
üìÅ CACHE_DIR = /content/drive/MyDrive/SemanticRec/Cache


# Download MovieLens "ml-latest-small"

In [None]:
import urllib.request, zipfile, tempfile, shutil, os, pathlib

# Source archive for the MovieLens "ml-latest-small" dataset.
ML_URL   = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# Destination paths inside DATA_DIR for the two CSV files this notebook uses.
ratings_csv = pathlib.Path(DATA_DIR) / "ratings.csv"
movies_csv  = pathlib.Path(DATA_DIR) / "movies.csv"

def download_movielens(url: str = ML_URL):
    """Download the MovieLens archive and copy its two CSV files into DATA_DIR.

    When both ``ratings_csv`` and ``movies_csv`` already exist, nothing is
    downloaded and only a notice is printed.

    Args:
        url: Address of the zip archive. The archive is expected to unpack
            into a top-level ``ml-latest-small`` folder that contains
            ``ratings.csv`` and ``movies.csv``.
    """
    already_present = ratings_csv.exists() and movies_csv.exists()

    if not already_present:
        print("‚¨áÔ∏è  Baixando MovieLens‚Ä¶")

        # Everything is unpacked in a throw-away directory; only the two
        # CSV files of interest are kept.
        with tempfile.TemporaryDirectory() as workdir:
            archive = f"{workdir}/ml.zip"
            urllib.request.urlretrieve(url, archive)

            with zipfile.ZipFile(archive) as bundle:
                bundle.extractall(workdir)

            extracted = pathlib.Path(workdir) / "ml-latest-small"
            for filename, target in (("ratings.csv", ratings_csv),
                                     ("movies.csv", movies_csv)):
                shutil.copy(extracted / filename, target)

        print("‚úÖ Arquivos copiados para", DATA_DIR)
        return

    print("‚úî MovieLens j√° presente ‚Äî pulando download")

# Fetch the dataset now; the function skips the download when both CSVs already exist.
download_movielens()

‚úî MovieLens j√° presente ‚Äî pulando download


# Utility Functions

In [None]:
import unicodedata, re
import pandas as pd

def slugify(text: str) -> str:
    """Turn free text into a lowercase ASCII slug.

    Example: 'Amélie Poulain' -> 'amelie-poulain'.

    The result:
      * contains only ``a-z``, ``0-9`` and hyphens;
      * has accents removed (characters without an ASCII form are dropped);
      * uses a single hyphen for every run of other characters;
      * never starts or ends with a hyphen.
    """
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII encode step below keeps the base letter and drops the mark.
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode()

    hyphenated = re.sub(r"[^a-z0-9]+", "-", ascii_only.lower())
    trimmed = hyphenated.strip("-")
    return re.sub(r"-{2,}", "-", trimmed)

def clean_year(raw) -> str:
    """Normalise a year given as a string, float or int to a plain integer string.

    Examples: '1999.0' -> '1999'; NaN -> ''; None -> ''; 'inf' -> ''.

    Args:
        raw: Any value that ``float()`` might accept.

    Returns:
        The integer part of ``raw`` as a string, or ``""`` when ``raw``
        cannot be converted to a finite number.
    """
    try:
        return str(int(float(raw)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text, or NaN (int(nan) is rejected).
        # TypeError: None and other non-number objects.
        # OverflowError: +/-infinity (e.g. 'inf' or '1e400') cannot become an int.
        return ""

# Quick visual check of both helpers on known inputs.
print("slugify('Wall-E (2008)')  ‚Üí", slugify("Wall-E (2008)"))
print("clean_year('1999.0')      ‚Üí", clean_year("1999.0"))

slugify('Wall-E (2008)')  ‚Üí wall-e-2008
clean_year('1999.0')      ‚Üí 1999


# Process movies.csv

In [None]:

import pandas as pd, re

# --- helpers already defined in another cell -------------------------
# from utils import slugify, clean_year       #  <- alternative: import them
# (this cell assumes slugify() and clean_year() are already in the namespace)

# Namespace prefix prepended to every generated movie URI.
BASE_NS = "http://semantics.id/ns/movies#"
RAW_MOVIES  = movies_csv               # path of the original movies.csv
CLEAN_MOVIES = RAW_MOVIES.with_name("movies_clean.csv")   # sibling file, so the raw CSV is not overwritten

def _reorder_article(title: str) -> str:
    """
    'Shawshank Redemption, The' ‚Üí 'The Shawshank Redemption'
    cobre artigos mais comuns.
    """
    m = re.match(r"^(?P<base>.+),\s*(?P<art>The|A|An|La|Le|El|Los|Las)$",
                 title, flags=re.IGNORECASE)
    return f"{m.group('art')} {m.group('base')}" if m else title

def build_movies_df(src_path=RAW_MOVIES, dst_path=CLEAN_MOVIES) -> pd.DataFrame:
    """Build the cleaned movie table from the raw movies CSV.

    Steps:
      1. read ``src_path`` with every column as text;
      2. derive ``Ano`` (first four-digit number in parentheses found in the
         title), ``Titulo`` (title without the trailing "(year)", with a
         trailing article moved to the front) and ``movie_uri`` (``BASE_NS``
         + slug of ``Titulo``, followed by ``-<Ano>`` when a year was found);
      3. write the columns of interest to ``dst_path`` and return them.

    Args:
        src_path: Raw CSV with at least ``movieId``, ``title`` and ``genres``.
        dst_path: Where the cleaned CSV is written.

    Returns:
        DataFrame with columns movieId, Titulo, Ano, movie_uri, genres.
    """
    df = pd.read_csv(src_path, dtype=str)
    raw_title = df["title"]

    # Year: captured from "(dddd)"; rows without a match become "".
    year_groups = raw_title.str.extract(r"\((\d{4})\)")
    df["Ano"] = year_groups.iloc[:, 0].apply(clean_year)

    # Title: drop the trailing "(dddd)", trim, then fix "Name, The" ordering.
    without_year = raw_title.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
    df["Titulo"] = without_year.str.strip().apply(_reorder_article)

    def _uri_for(row) -> str:
        # The year suffix is only appended when a year is available.
        slug = slugify(row.Titulo)
        if row.Ano:
            return f"{BASE_NS}{slug}-{row.Ano}"
        return f"{BASE_NS}{slug}"

    df["movie_uri"] = df.apply(_uri_for, axis=1)

    out_cols = ["movieId", "Titulo", "Ano", "movie_uri", "genres"]
    cleaned = df[out_cols]
    cleaned.to_csv(dst_path, index=False)
    print(f"‚úÖ movies_clean.csv salvo em {dst_path}  ({len(df):,} linhas)")
    return cleaned

# ---- run ------------------------------------------------------------
# Builds movies_clean.csv with the default paths and keeps the result as
# movies_df, which a later cell passes to enrich_with_director().
movies_df = build_movies_df()
movies_df.head()

‚úÖ movies_clean.csv salvo em /content/drive/MyDrive/SemanticRec/Data/movies_clean.csv  (9,742 linhas)


Unnamed: 0,movieId,Titulo,Ano,movie_uri,genres
0,1,Toy Story,1995,http://semantics.id/ns/movies#toy-story-1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,1995,http://semantics.id/ns/movies#jumanji-1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,1995,http://semantics.id/ns/movies#grumpier-old-men...,Comedy|Romance
3,4,Waiting to Exhale,1995,http://semantics.id/ns/movies#waiting-to-exhal...,Comedy|Drama|Romance
4,5,Father of the Bride Part II,1995,http://semantics.id/ns/movies#father-of-the-br...,Comedy


# Process ratings.csv

In [None]:
from pathlib import Path
import pandas as pd
import unicodedata, re

# NOTE(review): this rebinds DATA_DIR as a pathlib.Path and spells the folder
# out again as a literal; keep it in sync with the project-root definition
# used elsewhere in the notebook.
DATA_DIR = Path("/content/drive/MyDrive/SemanticRec/Data")
RAW_RATINGS = DATA_DIR / "ratings.csv"        # original MovieLens ratings
MOVIES_CLEAN = DATA_DIR / "movies_clean.csv"  # already contains the Titulo column
RATINGS_CLEAN = DATA_DIR / "ratings_clean.csv"

# 1.  Load
rat = pd.read_csv(RAW_RATINGS, dtype={"userId":int,"movieId":int,"rating":float})
mov = pd.read_csv(MOVIES_CLEAN, dtype={"movieId":int,"Titulo":str})

# 2.  Merge on movieId (safe key); the inner join drops ratings whose movieId
#     is missing from movies_clean.csv
df = rat.merge(mov[["movieId","Titulo"]], on="movieId", how="inner")

# 3.  Keep only the columns the CF notebook expects
# NOTE(review): the item key becomes Titulo (without year), so distinct
# movieIds that share a title collapse into one item -- confirm this is intended.
df = df[["userId","Titulo","rating"]]

# Row counts before/after the merge, plus distinct users and titles.
print("Linhas antes :", len(rat))
print("Linhas depois:", len(df))
print("Usu√°rios     :", df["userId"].nunique())
print("Filmes       :", df["Titulo"].nunique())

# 4.  Save
df.to_csv(RATINGS_CLEAN, index=False)
print("‚úÖ ratings_clean.csv gravado em", RATINGS_CLEAN)

Linhas antes : 100836
Linhas depois: 100836
Usu√°rios     : 610
Filmes       : 9433
‚úÖ ratings_clean.csv gravado em /content/drive/MyDrive/SemanticRec/Data/ratings_clean.csv


# Get Directors


In [None]:
import requests, time, json, random, pathlib, pickle
from typing import List, Tuple

import os

# OMDb API keys used in rotation by the director lookup.
#
# SECURITY: credentials should not live in the notebook. Keys are read from
# the OMDB_API_KEYS environment variable (comma-separated) when it is set.
# The literal list is kept only as a backward-compatible fallback.
# NOTE(review): these fallback values are committed in plain text -- rotate
# them and delete the fallback once the environment variable is in use.
OMDB_KEYS: List[str] = [
    k.strip() for k in os.environ.get("OMDB_API_KEYS", "").split(",") if k.strip()
] or [
    "3392d4d5", "a2b5a87e", "e0f5193a", "b0eb8c26"
]

# Pickle file inside CACHE_DIR that persists director lookups between runs.
CACHE_FILE = pathlib.Path(CACHE_DIR) / "director_cache.pkl"


# ---------- on-disk cache --------------------------------------------
def _load_cache() -> dict[Tuple[str, str], str]:
    """Load the director cache from CACHE_FILE.

    Returns:
        The cached mapping, keyed by tuples as written by the enrichment
        loop. An empty dict is returned when the file does not exist, is
        truncated/corrupt (e.g. a write was interrupted), or does not
        contain a dict.
    """
    if not CACHE_FILE.exists():
        return {}

    try:
        with CACHE_FILE.open("rb") as fh:
            # SECURITY: unpickling can execute code embedded in the file;
            # only load cache files produced by this notebook.
            cache = pickle.load(fh)
    except (EOFError, pickle.UnpicklingError) as exc:
        # A damaged cache must not block the run: start over with an empty one.
        print(f"WARNING: ignoring unreadable cache file {CACHE_FILE}: {exc!r}")
        return {}

    return cache if isinstance(cache, dict) else {}

def _save_cache(cache: dict):
    """Persist the director cache to CACHE_FILE.

    The data is first written to a temporary sibling file and then renamed
    over CACHE_FILE, so an interruption in the middle of the write (for
    example a KeyboardInterrupt) leaves the previous cache file intact
    instead of a truncated pickle.

    Args:
        cache: Mapping to store.
    """
    tmp_path = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with tmp_path.open("wb") as fh:
        pickle.dump(cache, fh)
    # Path.replace overwrites an existing target in a single rename step.
    tmp_path.replace(CACHE_FILE)


# ---------- single lookup --------------------------------------------
def fetch_director_omdb(title: str, year: str, api_key: str) -> str:
    """Query OMDb by title (and year, when given) and return its 'Director' field.

    This is a best-effort lookup: it never raises for network or parsing
    problems.

    Args:
        title: Movie title sent as the ``t`` parameter.
        year: Release year sent as the ``y`` parameter; omitted when empty.
        api_key: OMDb API key.

    Returns:
        The director string, or ``""`` when the request fails, the response
        is not valid JSON, OMDb reports no match, or the field is missing
        or "N/A".
    """
    # HTTPS so the API key is not sent in clear text.
    url = "https://www.omdbapi.com/"
    params = {"t": title, "apikey": api_key}
    if year:
        params["y"] = year

    try:
        r = requests.get(url, params=params, timeout=5)
        data = r.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout problems.
        # ValueError: body is not JSON (e.g. an HTML error page).
        return ""

    if not isinstance(data, dict) or data.get("Response") != "True":
        return ""

    director = data.get("Director") or ""
    return "" if director == "N/A" else director


# ---------- enrichment loop ------------------------------------------
def enrich_with_director(df_movies: pd.DataFrame,
                         api_keys: List[str] = OMDB_KEYS,
                         sleep_sec: float = 1.0) -> pd.DataFrame:
    """Fill the 'Diretor' column of a movie table using OMDb.

    Features:
      * persistent cache (CACHE_FILE), flushed after every lookup;
      * API-key rotation: when a lookup comes back empty, the next key is
        tried once for the same movie;
      * pause of ``sleep_sec`` seconds after each looked-up movie.

    Args:
        df_movies: Table with 'Titulo' and 'Ano' columns; an existing
            'Diretor' column is respected. The input is not modified.
        api_keys: Keys to rotate through; must not be empty.
        sleep_sec: Pause after each movie that required a lookup.

    Returns:
        A copy of ``df_movies`` with 'Diretor' filled where available.

    Raises:
        ValueError: if ``api_keys`` is empty.
    """
    from itertools import cycle

    if not api_keys:
        raise ValueError("enrich_with_director needs at least one OMDb API key")

    df = df_movies.copy()
    if "Diretor" not in df.columns:
        df["Diretor"] = ""

    cache = _load_cache()
    keys_cycle = cycle(api_keys)          # endless round-robin over the keys
    key = next(keys_cycle)

    for idx, row in df.iterrows():
        title, year = row["Titulo"], row["Ano"]
        cache_key = (title.lower(), year)

        # Already filled? Only a non-blank string counts: a NaN (what an
        # empty CSV cell becomes after read_csv) is truthy and would
        # otherwise be mistaken for a value.
        current = row["Diretor"]
        if isinstance(current, str) and current.strip():
            continue
        if cache_key in cache:                        # cache hit
            df.at[idx, "Diretor"] = cache[cache_key]
            continue

        # ---------------- OMDb lookup --------------------------
        director = fetch_director_omdb(title, year, key)
        if not director:                              # empty result: retry once with the next key
            key = next(keys_cycle)
            director = fetch_director_omdb(title, year, key)

        # Store the result (even when empty) in cache and dataframe.
        # NOTE(review): fetch_director_omdb returns "" for request failures
        # as well as for "no match", so a transient failure is cached as
        # empty too -- confirm this is acceptable.
        cache[cache_key] = director
        df.at[idx, "Diretor"] = director
        print(f"{idx:>5}: {title} ({year}) ‚Üí {director}")

        _save_cache(cache)                            # flush on every step
        time.sleep(sleep_sec)                         # respect the rate limit

    return df


# ---------- run and save ---------------------------------------------
# Output path for the movie table enriched with the 'Diretor' column.
DIRECTOR_CSV = pathlib.Path(DATA_DIR) / "movies_director.csv"

# The CSV is only written after the whole enrichment loop has finished.
movies_with_dir = enrich_with_director(movies_df)
movies_with_dir.to_csv(DIRECTOR_CSV, index=False)
print(f"‚úÖ movies_director.csv salvo em {DIRECTOR_CSV}")

    0: Toy Story (1995) ‚Üí John Lasseter
    1: Jumanji (1995) ‚Üí Joe Johnston
    2: Grumpier Old Men (1995) ‚Üí Howard Deutch
    3: Waiting to Exhale (1995) ‚Üí Forest Whitaker
    4: Father of the Bride Part II (1995) ‚Üí Charles Shyer
    5: Heat (1995) ‚Üí Michael Mann
    6: Sabrina (1995) ‚Üí Sydney Pollack
    7: Tom and Huck (1995) ‚Üí Peter Hewitt
    8: Sudden Death (1995) ‚Üí Peter Hyams
    9: GoldenEye (1995) ‚Üí Martin Campbell
   10: The American President (1995) ‚Üí Rob Reiner
   11: Dracula: Dead and Loving It (1995) ‚Üí Mel Brooks
   12: Balto (1995) ‚Üí Simon Wells
   13: Nixon (1995) ‚Üí Oliver Stone
   14: Cutthroat Island (1995) ‚Üí Renny Harlin
   15: Casino (1995) ‚Üí Martin Scorsese
   16: Sense and Sensibility (1995) ‚Üí Ang Lee
   17: Four Rooms (1995) ‚Üí Allison Anders, Alexandre Rockwell, Robert Rodriguez
   18: Ace Ventura: When Nature Calls (1995) ‚Üí Steve Oedekerk
   19: Money Train (1995) ‚Üí Joseph Ruben
   20: Get Shorty (1995) ‚Üí Barry Sonnenf

KeyboardInterrupt: 