In [3]:
import pandas as pd
import joblib
from sklearn.metrics.pairwise import cosine_similarity

## Load data and models

In [13]:
# Build the working catalogue: the 10,000 highest-rated titles that have a rating.
# The sort -> dropna -> head order is kept as-is so the resulting row order does
# not change; the pickled neighbour indices loaded later are presumably aligned
# with this exact ordering -- TODO confirm against the training notebook.
movies_df = (
    pd.read_csv("merged_imdb_data.csv")
    .sort_values(by="averageRating", ascending=False)
    .dropna(subset=["averageRating"])
    .head(10000)
    # Sorting keeps the CSV's original row labels. Downstream lookups mix
    # index labels with positional access (`.iloc`, array indexing), so make
    # the labels equal to the row positions 0..n-1.
    .reset_index(drop=True)
    # Missing text fields become an explicit "Unknown" token so later
    # string/vectoriser operations never receive NaN.
    .assign(
        genres=lambda frame: frame["genres"].fillna("Unknown"),
        primaryName=lambda frame: frame["primaryName"].fillna("Unknown"),
    )
)

In [18]:
# Load the three pre-trained artefacts, in order: the genre TF-IDF vectoriser,
# the precomputed nearest-neighbour index table, and the rating model.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code --
# only load artefact files from a trusted source.
tfidf_vectorizer, nearest_neighbors_indices, svd_model = map(
    joblib.load,
    (
        "models/tfidf_vectorizer.pkl",
        "models/nearest_neighbors_indices.pkl",
        "models/svd_model.pkl",
    ),
)

## Define functions

In [21]:
def get_similar_movies(movie_title):
    """Return the precomputed neighbours of ``movie_title`` as a DataFrame.

    The title is matched case-insensitively against ``movies_df["primaryTitle"]``;
    when several rows match, the first one (in row order) is used.

    Parameters
    ----------
    movie_title : str
        Title to look up.

    Returns
    -------
    pandas.DataFrame
        A copy of the neighbour rows of ``movies_df``. An empty DataFrame is
        returned (and a message printed) when no row matches the title.
    """
    # Boolean mask of matching rows; missing titles compare as "no match".
    title_matches = (
        (movies_df["primaryTitle"].str.lower() == movie_title.lower())
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    if not title_matches.any():
        print("Movie not found in top 10K dataset.")
        return pd.DataFrame()

    # Use the row POSITION, not the index label: the neighbour table is indexed
    # by position here and its entries are consumed positionally via `.iloc`
    # below, while `movies_df` labels need not be 0..n-1 after sorting.
    row_position = int(title_matches.argmax())

    # The first neighbour is skipped -- presumably it is the query row itself
    # (TODO confirm against how the neighbour table was built).
    similar_indices = nearest_neighbors_indices[row_position][1:]
    return movies_df.iloc[similar_indices].copy()

In [23]:
def predict_rating_for_movie(movie_name, user_id=999):
    """Return the model's estimated rating for ``movie_name``, rounded to 2 decimals.

    Parameters
    ----------
    movie_name : str
        Passed to ``svd_model.predict`` as the item identifier.
        NOTE(review): this assumes the model was trained with titles as item
        ids -- confirm against the training code.
    user_id : int, optional
        User identifier passed to the model (default 999).

    Returns
    -------
    float or None
        ``round(prediction.est, 2)``, or ``None`` if the prediction fails.
    """
    try:
        return round(svd_model.predict(user_id, movie_name).est, 2)
    except Exception:
        # Best-effort: a failed prediction yields None instead of aborting the
        # caller. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate so the kernel can still be interrupted.
        return None

In [31]:
def test_recommendation(movie_title):
    print(f"\nTest recommendations for: {movie_title}\n")
    similar_movies = get_similar_movies(movie_title)
    if similar_movies.empty:
        print("No similar movies found.")
        return
        
    similar_movies["PredictedRating"] = similar_movies["primaryTitle"].apply(lambda title: predict_rating_for_movie(title))

    user_genre_vector = tfidf_vectorizer.transform([""])
    genre_scores = cosine_similarity(user_genre_vector, tfidf_vectorizer.transform(similar_movies["genres"])).flatten()
    similar_movies["user_genre_similarity"] = genre_scores

    similar_movies["score"] = (
        0.6 * similar_movies["user_genre_similarity"]
        + 0.4 * similar_movies["averageRating"].fillna(0)
    )

    top_movies = similar_movies.sort_values(by="score", ascending=False).head(10)
    print(top_movies[["primaryTitle", "genres", "averageRating", "PredictedRating", "score"]].to_string(index=False))

In [39]:
# Peek at the first 20 titles of the catalogue. The same title appears several
# times in the output below, i.e. the frame holds more than one row per title.
print(movies_df["primaryTitle"].iloc[:20].tolist())

['Paradise (bunnies and flowers)', 'Carraco', 'Tarka', 'Carraco', 'Tarka', 'Kaputol', 'Kaputol', 'Kaputol', 'Kaputol', 'Kaputol', 'Carraco', 'Carraco', 'Carraco', 'Clownface 3', 'Kaputol', 'Tarka', 'Tarka', 'Tarka', 'Tarka', "Soori Love's Sandhya"]
