In [1]:
import pandas as pd

In [2]:
# Location of the tagged-movies CSV, given relative to the working directory.
CSV_PATH = "./data/movies_tagged_old.csv"

In [3]:
# Load the tagged-movies table. The file carries a leftover serialized index
# that pandas surfaces as an "Unnamed: 0" column; drop it so it is not mistaken
# for a feature. The default RangeIndex is kept on purpose. errors="ignore"
# keeps this cell working if the CSV is ever re-exported without that column.
csv_data = pd.read_csv(CSV_PATH).drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Peek at the first rows; at this point "tags" still holds raw JSON-like text.
csv_data.head()

Unnamed: 0,movieId,title,genres,year,tags
0,1,Toy Story,"adventure, animation, children, comedy, fantasy",1995.0,"{\n ""tags"": [""adventure"", ""animation"", ""chil..."
1,2,Jumanji,"adventure, children, fantasy",1995.0,"{""tags"":[""Jumanji"",""1995"",""adventure"",""childre..."
2,3,Grumpier Old Men,"comedy, romance",1995.0,"{\n ""tags"":[\n ""grumpier"",\n ""old""..."
3,4,Waiting to Exhale,"comedy, drama, romance",1995.0,"{\n ""tags"":[\n ""comedy"",\n ""drama""..."
4,5,Father of the Bride Part II,comedy,1995.0,"{\n ""title"": ""Father of the Bride Part II"",\..."


In [5]:
# Flatten the JSON-like "tags" text into a plain "tag, tag, tag" string.
#
# Only quotes and *layout* whitespace are removed: whitespace runs are collapsed
# to a single space and separators normalized to ", ", so a multi-word tag such
# as "feel good" is kept intact instead of being fused into "feelgood".
#
# NOTE: this cell overwrites "tags" in place, so run it once per load. On a
# second run no [...] remains, the extract yields NaN, and every row becomes ''.
# Re-run the read_csv cell first if you need to repeat it.
csv_data["tags"] = (
    csv_data["tags"]
        .str.extract(r'\[([^\]]+)\]', expand=False)   # keep the text inside the first [...]
        .fillna('')                                   # rows without a [...] list become ''
        .str.replace('"', '', regex=False)            # drop JSON quotes
        .str.replace(r'\s+', ' ', regex=True)         # newlines / indentation -> one space
        .str.strip()                                  # trim padding left by pretty-printing
        .str.replace(r'\s*,\s*', ', ', regex=True)    # uniform ", " between tags
)

csv_data.head()

Unnamed: 0,movieId,title,genres,year,tags
0,1,Toy Story,"adventure, animation, children, comedy, fantasy",1995.0,"adventure, animation, children, comedy, fantasy"
1,2,Jumanji,"adventure, children, fantasy",1995.0,"Jumanji, 1995, adventure, children, fantasy"
2,3,Grumpier Old Men,"comedy, romance",1995.0,"grumpier, old, men, comedy, romance"
3,4,Waiting to Exhale,"comedy, drama, romance",1995.0,"comedy, drama, romance, waiting, exhale"
4,5,Father of the Bride Part II,comedy,1995.0,comedy


# With a clean table, we can build a content-based recommender (driven by each movie's content, not its popularity) on top of a pretrained sentence-embedding model

In [7]:
# Column name -> relative weight of that column's text when a row's
# per-column embeddings are combined into a single vector.
COL_WEIGHTS = dict(
    tags=0.40,
    title=0.30,
    genres=0.15,
    year=0.05,
)

In [8]:
from pathlib import Path
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ── Core ──────────────────────────────────────────────────────────────────
# Name of the sentence-transformers model used for every embedding in this notebook.
MODEL_NAME  = "sentence-transformers/all-mpnet-base-v2"

# Instantiated once here and shared by all embedding calls below.
_model = SentenceTransformer(MODEL_NAME)
# Length of the vectors the model produces; used to size the per-row accumulator.
_dim   = _model.get_sentence_embedding_dimension()

def _embed(text: str) -> np.ndarray:
    """Encode ``text`` with the shared model, requesting a normalized embedding.

    Falsy input (``None`` or ``""``) is encoded as the empty string.
    """
    safe_text = text if text else ""
    return _model.encode(safe_text, normalize_embeddings=True)

def _row_vec(row) -> np.ndarray:
    """Blend a row's column embeddings into one L2-normalized vector.

    Every column listed in ``COL_WEIGHTS`` with a non-zero weight is rendered
    as text, embedded, scaled by its weight and summed; the sum is then divided
    by its Euclidean norm.

    Args:
        row: A row (e.g. a ``pandas.Series``) indexable by each column name
            in ``COL_WEIGHTS``.

    Returns:
        A vector of length ``_dim``. If nothing contributed (for example every
        weight is zero) the all-zero vector is returned rather than the NaNs a
        0/0 division would produce.
    """
    vec = np.zeros(_dim)
    for col, weight in COL_WEIGHTS.items():
        if not weight:
            continue
        value = row[col]
        if pd.isna(value):
            # str(nan) would embed the literal word "nan"; use empty text,
            # which _embed already accepts.
            text = ""
        elif isinstance(value, float) and value.is_integer():
            # Whole-number floats such as year 1995.0 read better as "1995".
            text = str(int(value))
        else:
            text = str(value)
        vec += weight * _embed(text)
    norm = np.linalg.norm(vec)
    return vec / norm if norm else vec

def build_vectors(df: pd.DataFrame, save_path="movie_vecs.npy") -> np.ndarray:
    """Embed every row of ``df`` and stack the results into one matrix.

    Args:
        df: Table containing the columns named in ``COL_WEIGHTS``.
        save_path: Where to write the matrix with ``np.save``. Defaults to
            ``"movie_vecs.npy"`` (the previous hard-coded location); pass
            ``None`` to skip writing a file.

    Returns:
        Array of shape ``(len(df), _dim)``, one row vector per DataFrame row in
        the same order. An empty ``df`` yields a ``(0, _dim)`` array.
    """
    rows = [_row_vec(row) for _, row in df.iterrows()]
    # np.vstack raises on an empty sequence, so build the empty matrix explicitly.
    vecs = np.vstack(rows) if rows else np.empty((0, _dim))
    if save_path is not None:
        np.save(save_path, vecs)
    return vecs

def recommend(df: pd.DataFrame,
              vecs: np.ndarray,
              query: str,
              k: int = 10) -> pd.DataFrame:
    """Return the ``k`` rows of ``df`` most similar to a free-text ``query``.

    Args:
        df: Movie table; row ``i`` must correspond to ``vecs[i]``.
        vecs: Matrix of row embeddings, one row per row of ``df``.
        query: Text to embed and compare against every row vector.
        k: Maximum number of rows to return.

    Returns:
        The ``movieId``, ``title``, ``genres``, ``tags`` and ``year`` columns of
        the best-matching rows, most similar first. Empty when ``k <= 0`` or
        ``vecs`` has no rows.
    """
    columns = ["movieId", "title", "genres", "tags", "year"]
    if k <= 0 or len(vecs) == 0:
        # Without this guard, [-0:] would select the whole table for k == 0.
        return df.iloc[0:0][columns]

    q_vec = _embed(query)
    sims = cosine_similarity(vecs, q_vec.reshape(1, -1)).ravel()
    # Same ordering as argsort()[-k:][::-1] for k >= 1.
    top = sims.argsort()[::-1][:k]
    # argsort yields positions, not index labels, so select with iloc; .loc
    # only happens to work while df keeps a default 0..n-1 index.
    return df.iloc[top][columns]

In [None]:
# Embed every row of the table. build_vectors calls the model once per weighted
# column per row and also writes the resulting matrix to "movie_vecs.npy".
movie_vecs = build_vectors(csv_data)

In [None]:
# Example query: the 10 rows whose vectors are most similar to this free-text query.
recommend(csv_data, movie_vecs, "toys action animation", k=10)