In [None]:
!pip install huggingface_hub

In [23]:
from huggingface_hub import hf_hub_download
import pandas as pd
import joblib

# Configuration
REPO_ID = "Alok8732/Movie_Recommnedation_Artifacts"

print("Fetching artifacts from Hugging Face...")

# 1. Load the Dataframe
parquet_path = hf_hub_download(repo_id=REPO_ID, filename="movies_fully_cleaned.parquet", repo_type="dataset")
df = pd.read_parquet(parquet_path)
df = df.reset_index(drop=True)
# 2. Load the TF-IDF Matrix
matrix_path = hf_hub_download(repo_id=REPO_ID, filename="tfidf_matrix.pkl", repo_type="dataset")
tfidf_matrix = joblib.load(matrix_path)

# 3. Load the Vectorizer (Optional, but good to have)
vectorizer_path = hf_hub_download(repo_id=REPO_ID, filename="tfidf_vectorizer.pkl", repo_type="dataset")
tfidf_vectorizer = joblib.load(vectorizer_path)

print(f"✅ Success! Loaded {len(df)} movies and a {tfidf_matrix.shape} similarity matrix.")

Fetching artifacts from Hugging Face...
✅ Success! Loaded 503398 movies and a (503398, 50000) similarity matrix.


In [24]:
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# ADD THESE TWO LINES:
from thefuzz import process, fuzz
from IPython.display import display

In [25]:
# Create the movie index map to link titles to matrix rows
movie_index_map = pd.Series(df.index, index=df["title"]).drop_duplicates()
print(f"Index map created for {len(movie_index_map)} unique movies.")

Index map created for 503398 unique movies.


In [26]:
def get_recommendations(title, n=12, alpha=0.85, beta=0.15):
    # 1. ENSURE MAP EXISTS
    global movie_index_map
    if 'movie_index_map' not in globals():
        movie_index_map = pd.Series(df.index, index=df["title"]).drop_duplicates()

    # 2. SCALABLE FUZZY MATCHING
    all_titles = df['title'].unique()
    best_match, score = process.extractOne(title, all_titles, scorer=fuzz.WRatio)
    
    if score < 60 or len(title.strip()) < 3:
        print(f"⚠️ Search term '{title}' is too vague.")
        return None
    
    # 3. EXTRACT MATRIX INDEX
    idx_entry = movie_index_map[best_match]
    idx = idx_entry.iloc[0] if isinstance(idx_entry, pd.Series) else idx_entry
    
    # 4. COMPUTE HYBRID SCORE
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    log_pop = np.log1p(df['vote_count'])
    pop_norm = (log_pop - log_pop.min()) / (log_pop.max() - log_pop.min())
    
    hybrid_scores = (alpha * sim_scores) + (beta * pop_norm.values)
    
    # 5. EFFICIENT SELECTION WITH ZERO-VOTE PENALTY
    # We grab more than 'n' results initially to ensure we have room to re-sort
    k_search = n * 2 
    partition_idx = np.argpartition(hybrid_scores, -k_search)[-k_search:]
    
    # 6. FORMAT AND FINAL SORT
    res = df.iloc[partition_idx].copy()
    res['similarity'] = sim_scores[partition_idx]
    res['hybrid_score'] = hybrid_scores[partition_idx]
    
    # REMOVE THE SEARCHED MOVIE ITSELF
    res = res[res.index != idx]
    
    # SECONDARY SORTING: Sort by hybrid_score first, then by vote_count
    # This ensures that if scores are similar, the one with more votes wins.
    # It also naturally pushes 0-vote movies lower.
    res = res.sort_values(by=['hybrid_score', 'vote_count'], ascending=False).head(n)
    
    print(f"✅ Found matches for: '{best_match}'")
    return res[['title', 'genres', 'vote_count', 'similarity', 'hybrid_score']].reset_index(drop=True)

In [17]:
# Test with a popular title
results = get_recommendations("Avata", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Avatar'


title,genres,vote_count,similarity,hybrid_score
Woman of the Year,"Comedy, Romance, Drama",160.0,1.0,0.922151
Eine fast perfekte Bescherung,"Comedy, Romance, Drama",2.0,0.390775,0.347758
Choose the Life I Want,"Comedy, Romance, Drama",0.0,0.407965,0.34677
Badhaai Ho,"Comedy, Romance, Drama",144.0,0.303338,0.328503
Love Actually,"Comedy, Romance, Drama",7217.0,0.236485,0.327162
The Proposal,"Comedy, Romance, Drama",7296.0,0.232354,0.323806
The Circus,"Comedy, Romance, Drama",860.0,0.266821,0.322757
The Oranges,"Comedy, Romance, Drama",502.0,0.274297,0.321479
Show Me Love,"Comedy, Romance, Drama",574.0,0.271886,0.32133
Saving Face,"Comedy, Romance, Drama",247.0,0.280663,0.31685


In [None]:
# Test with a popular title
#change the movie title 
#copy and paste the function in another cell
results = get_recommendations("Fast X", n=12)

if results is not None:
    display(results.style.hide(axis='index'))