In [None]:
from pathlib import Path
import pandas as pd
import urllib.request

# Configuration
DATA_FILENAME = "imdb_top_1000.csv"
# __file__ is undefined inside a Jupyter kernel, so fall back to the current
# working directory when the NameError is raised.
try:
    NOTEBOOK_DIR = Path(__file__).parent
except NameError:
    NOTEBOOK_DIR = Path.cwd()
DATA_PATH = NOTEBOOK_DIR / DATA_FILENAME
# NOTE(review): "YourUsername" looks like an unfilled placeholder — confirm the
# repository owner before relying on this fallback.
FALLBACK_URL = "https://raw.githubusercontent.com/YourUsername/ECS171-Movie-Rec/main/imdb_top_1000.csv"

# Download the CSV if missing.
# The download goes to a sibling ".part" file and is renamed only on success, so
# an interrupted transfer can never leave a truncated CSV at DATA_PATH that the
# exists() check would accept on the next run.
if not DATA_PATH.exists():
    print(f"→ {DATA_FILENAME} not found locally; downloading from GitHub…")
    partial_path = DATA_PATH.with_name(DATA_PATH.name + ".part")
    try:
        urllib.request.urlretrieve(FALLBACK_URL, partial_path)
        partial_path.replace(DATA_PATH)
    finally:
        # After a successful rename the .part file no longer exists; after a
        # failure this removes whatever was partially written.
        if partial_path.exists():
            partial_path.unlink()
    print("→ Download complete.")

# Load the DataFrame
df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} rows from {DATA_PATH}")



Loaded 1000 rows from /Users/brayantorres/Downloads/ECS171-Movie-Rec/imdb_top_1000.csv


# Recommendations Using Only the Overview

In [None]:
# Imports
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Drop movies without a plot overview, then renumber the rows 0..n-1 so that
# row positions in df match row positions in the embedding matrix built below.
df = df.dropna(subset=["Overview"])
df = df.reset_index(drop=True)

# SBERT model used to embed the overviews
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(SBERT_MODEL_NAME)

# Embed every overview once, up front
print("Encoding overviews... (this takes ~10s)")
overview_texts = df["Overview"].tolist()
overview_embeddings = model.encode(overview_texts, convert_to_tensor=True)

# Define recommendation function
def recommend_movies_sbert(title, top_n=5):
    """Recommend the movies whose plot overviews are most similar to ``title``.

    The title is matched case-insensitively against ``df["Series_Title"]``; every
    other row of the module-level ``df`` is then ranked by cosine similarity
    between its entry in ``overview_embeddings`` and the query movie's entry.
    Row ``i`` of ``overview_embeddings`` is expected to correspond to row ``i``
    of ``df``.

    Args:
        title: Movie title to look up.
        top_n: Maximum number of recommendations to return. Values <= 0 give
            an empty result.

    Returns:
        The string ``"Movie not found."`` when no title matches. Otherwise a
        DataFrame with columns ``Series_Title``, ``Similarity``,
        ``IMDB_Rating`` and ``Overview``, ordered by decreasing similarity.
    """
    result_columns = ["Series_Title", "Similarity", "IMDB_Rating", "Overview"]

    matches = df[df["Series_Title"].str.lower() == title.lower()].index
    if len(matches) == 0:
        return "Movie not found."
    # If the title occurs more than once, the first matching row is the query.
    title_idx = int(matches[0])
    query_embedding = overview_embeddings[title_idx]

    # Compute cosine similarity with all movies (including the query itself)
    cos_scores = util.cos_sim(query_embedding, overview_embeddings)[0]

    # Rank all rows, then drop the query movie by index. Simply skipping rank 0
    # is not safe: ties or floating-point rounding (e.g. two movies with the
    # same overview) can put a different row first, which would return the
    # query movie itself and discard a genuine neighbour.
    ranked = cos_scores.argsort(descending=True).tolist()
    neighbour_ids = [i for i in ranked if i != title_idx][:max(top_n, 0)]

    results = []
    for idx in neighbour_ids:
        row = df.iloc[idx]
        results.append({
            "Series_Title": row["Series_Title"],
            "Similarity": float(cos_scores[idx]),
            "IMDB_Rating": row["IMDB_Rating"],
            "Overview": row["Overview"]
        })
    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=result_columns)

# Example: the five movies whose overviews are closest to "Joker"
joker_recommendations = recommend_movies_sbert("Joker", top_n=5)
joker_recommendations


Encoding overviews... (this takes ~10s)


Unnamed: 0,Series_Title,Similarity,IMDB_Rating,Overview
0,The Dark Knight,0.592202,9.0,When the menace known as the Joker wreaks havo...
1,Batman: Mask of the Phantasm,0.490274,7.8,Batman is wrongly implicated in a series of mu...
2,American Psycho,0.482145,7.6,A wealthy New York City investment banking exe...
3,The Godfather: Part II,0.44381,9.0,The early life and career of Vito Corleone in ...
4,The Dark Knight Rises,0.443529,8.4,Eight years after the Joker's reign of anarchy...


# Recommendations Using Overview + Genre

In [3]:
# Imports
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Restrict to movies that have both an overview and a genre; the index is reset
# so that row positions in df line up with the embedding rows computed below.
required_columns = ["Overview", "Genre"]
df = df.dropna(subset=required_columns).reset_index(drop=True)

# SBERT model used for the overview embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Re-encode the overviews of the filtered frame
print("Encoding overviews... (this takes ~10s)")
overview_embeddings = model.encode(
    df["Overview"].tolist(),
    convert_to_tensor=True,
)

# Define recommender using Overview + Genre
def recommend_movies_sbert_genre(title, top_n=5):
    """Recommend similar movies that share at least one genre with ``title``.

    The title is matched case-insensitively against ``df["Series_Title"]``.
    All other rows of the module-level ``df`` are visited in order of decreasing
    cosine similarity between their entry in ``overview_embeddings`` and the
    query movie's entry; a row is kept only if its ``Genre`` field has at least
    one genre in common with the query movie. Row ``i`` of
    ``overview_embeddings`` is expected to correspond to row ``i`` of ``df``.

    Args:
        title: Movie title to look up.
        top_n: Maximum number of recommendations to return. Values <= 0 give
            an empty result.

    Returns:
        The string ``"Movie not found."`` when no title matches. Otherwise a
        DataFrame with columns ``Series_Title``, ``Genre``, ``Similarity``,
        ``IMDB_Rating`` and ``Overview``, ordered by decreasing similarity.
    """
    result_columns = ["Series_Title", "Genre", "Similarity", "IMDB_Rating", "Overview"]

    def _genre_set(raw_genres):
        # Split on commas and trim each piece so irregular spacing
        # ("Action,Drama", "Action,  Drama") still compares equal.
        return {part.strip() for part in raw_genres.lower().split(",") if part.strip()}

    # Find the index of the input movie
    matches = df[df["Series_Title"].str.lower() == title.lower()].index
    if len(matches) == 0:
        return "Movie not found."
    # If the title occurs more than once, the first matching row is the query.
    title_idx = int(matches[0])

    # Genres of the input movie
    input_genres = _genre_set(df.iloc[title_idx]["Genre"])

    # SBERT embedding of the input movie
    query_embedding = overview_embeddings[title_idx]

    # Cosine similarity against every movie (including the query itself)
    cos_scores = util.cos_sim(query_embedding, overview_embeddings)[0]

    results = []
    for idx in cos_scores.argsort(descending=True).tolist():
        # Check the limit before looking at the next candidate; checking only
        # after an append lets top_n=0 still return one row.
        if len(results) >= top_n:
            break
        if idx == title_idx:
            continue
        row = df.iloc[idx]
        # Keep only movies that share at least one genre with the query
        if input_genres & _genre_set(row["Genre"]):
            results.append({
                "Series_Title": row["Series_Title"],
                "Genre": row["Genre"],
                "Similarity": float(cos_scores[idx]),
                "IMDB_Rating": row["IMDB_Rating"],
                "Overview": row["Overview"]
            })

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=result_columns)

# Example: five genre-compatible neighbours of "Dangal"
dangal_recommendations = recommend_movies_sbert_genre("Dangal", top_n=5)
dangal_recommendations

Encoding overviews... (this takes ~10s)


Unnamed: 0,Series_Title,Genre,Similarity,IMDB_Rating,Overview
0,Paan Singh Tomar,"Action, Biography, Crime",0.392603,8.2,"The story of Paan Singh Tomar, an Indian athle..."
1,Warrior,"Action, Drama, Sport",0.382826,8.2,The youngest son of an alcoholic former boxer ...
2,Once Were Warriors,"Crime, Drama",0.373498,7.9,A family descended from Maori warriors is bede...
3,Lagaan: Once Upon a Time in India,"Adventure, Drama, Musical",0.372703,8.1,The people of a small village in Victorian Ind...
4,The Wrestler,"Drama, Sport",0.359007,7.9,"A faded professional wrestler must retire, but..."
