In [None]:
!pip install huggingface_hub

In [23]:
from huggingface_hub import hf_hub_download
import pandas as pd
import joblib

# Configuration
REPO_ID = "Alok8732/Movie_Recommnedation_Artifacts"

print("Fetching artifacts from Hugging Face...")

# 1. Load the Dataframe
parquet_path = hf_hub_download(repo_id=REPO_ID, filename="movies_fully_cleaned.parquet", repo_type="dataset")
df = pd.read_parquet(parquet_path)
df = df.reset_index(drop=True)
# 2. Load the TF-IDF Matrix
matrix_path = hf_hub_download(repo_id=REPO_ID, filename="tfidf_matrix.pkl", repo_type="dataset")
tfidf_matrix = joblib.load(matrix_path)

# 3. Load the Vectorizer (Optional, but good to have)
vectorizer_path = hf_hub_download(repo_id=REPO_ID, filename="tfidf_vectorizer.pkl", repo_type="dataset")
tfidf_vectorizer = joblib.load(vectorizer_path)

print(f"✅ Success! Loaded {len(df)} movies and a {tfidf_matrix.shape} similarity matrix.")

Fetching artifacts from Hugging Face...
✅ Success! Loaded 503398 movies and a (503398, 50000) similarity matrix.


In [24]:
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# ADD THESE TWO LINES:
from thefuzz import process, fuzz
from IPython.display import display

In [25]:
# Create the movie index map to link titles to matrix rows
movie_index_map = pd.Series(df.index, index=df["title"]).drop_duplicates()
print(f"Index map created for {len(movie_index_map)} unique movies.")

Index map created for 503398 unique movies.


In [26]:
def get_recommendations(title, n=12, alpha=0.85, beta=0.15):
    # 1. ENSURE MAP EXISTS
    global movie_index_map
    if 'movie_index_map' not in globals():
        movie_index_map = pd.Series(df.index, index=df["title"]).drop_duplicates()

    # 2. SCALABLE FUZZY MATCHING
    all_titles = df['title'].unique()
    best_match, score = process.extractOne(title, all_titles, scorer=fuzz.WRatio)
    
    if score < 60 or len(title.strip()) < 3:
        print(f"⚠️ Search term '{title}' is too vague.")
        return None
    
    # 3. EXTRACT MATRIX INDEX
    idx_entry = movie_index_map[best_match]
    idx = idx_entry.iloc[0] if isinstance(idx_entry, pd.Series) else idx_entry
    
    # 4. COMPUTE HYBRID SCORE
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    log_pop = np.log1p(df['vote_count'])
    pop_norm = (log_pop - log_pop.min()) / (log_pop.max() - log_pop.min())
    
    hybrid_scores = (alpha * sim_scores) + (beta * pop_norm.values)
    
    # 5. EFFICIENT SELECTION WITH ZERO-VOTE PENALTY
    # We grab more than 'n' results initially to ensure we have room to re-sort
    k_search = n * 2 
    partition_idx = np.argpartition(hybrid_scores, -k_search)[-k_search:]
    
    # 6. FORMAT AND FINAL SORT
    res = df.iloc[partition_idx].copy()
    res['similarity'] = sim_scores[partition_idx]
    res['hybrid_score'] = hybrid_scores[partition_idx]
    
    # REMOVE THE SEARCHED MOVIE ITSELF
    res = res[res.index != idx]
    
    # SECONDARY SORTING: Sort by hybrid_score first, then by vote_count
    # This ensures that if scores are similar, the one with more votes wins.
    # It also naturally pushes 0-vote movies lower.
    res = res.sort_values(by=['hybrid_score', 'vote_count'], ascending=False).head(n)
    
    print(f"✅ Found matches for: '{best_match}'")
    return res[['title', 'genres', 'vote_count', 'similarity', 'hybrid_score']].reset_index(drop=True)

In [17]:
# Test with a popular title
results = get_recommendations("Avata", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Avatar'


title,genres,vote_count,similarity,hybrid_score
Woman of the Year,"Comedy, Romance, Drama",160.0,1.0,0.922151
Eine fast perfekte Bescherung,"Comedy, Romance, Drama",2.0,0.390775,0.347758
Choose the Life I Want,"Comedy, Romance, Drama",0.0,0.407965,0.34677
Badhaai Ho,"Comedy, Romance, Drama",144.0,0.303338,0.328503
Love Actually,"Comedy, Romance, Drama",7217.0,0.236485,0.327162
The Proposal,"Comedy, Romance, Drama",7296.0,0.232354,0.323806
The Circus,"Comedy, Romance, Drama",860.0,0.266821,0.322757
The Oranges,"Comedy, Romance, Drama",502.0,0.274297,0.321479
Show Me Love,"Comedy, Romance, Drama",574.0,0.271886,0.32133
Saving Face,"Comedy, Romance, Drama",247.0,0.280663,0.31685


In [11]:
# Test with a popular title
results = get_recommendations("Fast X", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Recommendations for: 'Fast X'


title,genres,vote_count,similarity,hybrid_score
Tel chi el telùn,Comedy,328.0,0.198643,0.251146
Un Natale stupefacente,Comedy,156.0,0.198746,0.240729
"Amore, bugie e calcetto",Comedy,214.0,0.188393,0.236392
"Kev Adams & Gad Elmaleh - Kev Gad, Tout est possible",Comedy,95.0,0.198746,0.233744
Gad Elmaleh : Sans tambour,Comedy,111.0,0.193975,0.231877
Caccia al tesoro,Comedy,82.0,0.198746,0.231678
El tatuaje,Comedy,73.0,0.198746,0.230048
Una canzone per te,Comedy,69.0,0.198746,0.229259
La mia vita a stelle e strisce,Comedy,57.0,0.198746,0.226589
Kyan Khojandi : Pulsions,Comedy,54.0,0.198746,0.225835


In [27]:
results = get_recommendations("Dhurandhar", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Dhurandhar'


title,genres,vote_count,similarity,hybrid_score
The Great Escape,"Action, Thriller",5.0,0.422941,0.384941
Adventure King,"Action, Thriller",6.0,0.418061,0.382982
Sí matarás,"Action, Thriller",2.0,0.422941,0.375099
Firasat,"Action, Thriller",1.0,0.422941,0.369342
Eighteen Arhats of Shaolin Temple,"Action, Thriller",2.0,0.413054,0.366695
John Wick,"Action, Thriller",20393.0,0.262205,0.363772
Gavanam,"Action, Thriller",0.0,0.422941,0.3595
Пассажиры,"Action, Thriller",0.0,0.422941,0.3595
Fuerza Máxima,"Action, Thriller",0.0,0.422941,0.3595
Mayonaka no Super Car,"Action, Thriller",0.0,0.422941,0.3595


In [28]:
results = get_recommendations("ek tha tiger", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Ek Tha Tiger'


title,genres,vote_count,similarity,hybrid_score
Hero Naam Yaad Rakhi,"Action, Thriller, Romance",1.0,0.652682,0.564622
Assassin Training Course,"Action, Thriller, Romance",0.0,0.643897,0.547312
Stopover Tokyo,"Action, Thriller, Romance",7.0,0.607204,0.545649
The Tourist,"Action, Thriller, Romance",5651.0,0.473448,0.525108
Collision,"Action, Thriller, Romance",105.0,0.535813,0.521657
Ghat Pratyaghat,"Action, Thriller, Romance",0.0,0.598365,0.50861
X,"Action, Thriller, Romance",312.0,0.497855,0.504768
Ko 2,"Action, Thriller, Romance",6.0,0.54831,0.493694
Saamy²,"Action, Thriller, Romance",14.0,0.51086,0.472683
No More Dirty Deals,"Action, Thriller, Romance",1.0,0.541367,0.470004


In [31]:
results = get_recommendations("pushpa", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Push'


title,genres,vote_count,similarity,hybrid_score
Predator,"Science Fiction, Action, Adventure, Thriller",8785.0,0.383962,0.455309
Star Trek V: The Final Frontier,"Science Fiction, Action, Adventure, Thriller",1260.0,0.400491,0.441794
Star Trek III: The Search for Spock,"Science Fiction, Action, Adventure, Thriller",1461.0,0.384276,0.430111
Gemini Man,"Science Fiction, Action, Adventure, Thriller",5344.0,0.357098,0.425417
Star Trek: Insurrection,"Science Fiction, Action, Adventure, Thriller",1311.0,0.373169,0.419133
Serenity,"Science Fiction, Action, Adventure, Thriller",3759.0,0.353831,0.417646
Maze Runner: The Death Cure,"Science Fiction, Action, Adventure, Thriller",8208.0,0.33977,0.416781
Star Trek VI: The Undiscovered Country,"Science Fiction, Action, Adventure, Thriller",1367.0,0.36569,0.413369
Kin,"Science Fiction, Action, Adventure, Thriller",869.0,0.362803,0.404489
Firefox,"Science Fiction, Action, Adventure, Thriller",593.0,0.366542,0.402249


In [32]:
results = get_recommendations("dangal", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Dangal'


title,genres,vote_count,similarity,hybrid_score
Lucky Star,"Drama, Family, Comedy",5.0,0.579087,0.517666
The Longshots,"Drama, Family, Comedy",70.0,0.518033,0.500854
Fahim,"Drama, Family, Comedy",172.0,0.463644,0.467269
Lamer Mother in Law is Friendly to Her Son in Law,"Drama, Family, Comedy",0.0,0.537219,0.456636
Home Sweet Loan,"Drama, Family, Comedy",16.0,0.478839,0.447242
Shakuntala Devi,"Drama, Family, Comedy",35.0,0.462288,0.443827
The Better Half,"Drama, Family, Comedy",4.0,0.477446,0.428682
Skippy,"Drama, Family, Comedy",23.0,0.436175,0.415875
Sweet Agony 2,"Drama, Family, Comedy",0.0,0.484607,0.411916
Khali Purse of Billionaires,"Drama, Family, Comedy",3.0,0.457904,0.408903


In [33]:
results = get_recommendations("sholay", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

✅ Found matches for: 'Sholay'


title,genres,vote_count,similarity,hybrid_score
Sholay: The Final Cut,"Action, Adventure, Crime, Comedy, Thriller",2.0,0.59483,0.521205
Maa Kasam,Action,1.0,0.317077,0.279358
Munnibai,Action,1.0,0.312692,0.27563
Desh Premee,"Action, Drama",6.0,0.290772,0.274786
Garota da Moto,Action,12.0,0.277728,0.272489
A Gamer's Day,Action,5.0,0.277728,0.26151
Ganga Ki Saugand,"Action, Romance",3.0,0.2834,0.260574
El Diamante de Caro Quintero,Action,4.0,0.277728,0.258922
With the Devil in the Blood,Action,4.0,0.277728,0.258922
Sentencia de narcoticos,Action,4.0,0.277728,0.258922


In [34]:
results = get_recommendations("pK", n=12)

if results is not None:
    display(results.style.hide(axis='index'))

⚠️ Search term 'pK' is too vague.


In [35]:
# Check if popular movies exist
test_titles = [
    "Dune: Part Two",
    "Mad Max: Fury Road", 
    "Mission: Impossible - Fallout",
    "Blade Runner 2049"
]

for title in test_titles:
    if title in df['title'].values:
        print(f"✓ {title} exists")
    else:
        print(f"✗ {title} MISSING")

✓ Dune: Part Two exists
✓ Mad Max: Fury Road exists
✓ Mission: Impossible - Fallout exists
✓ Blade Runner 2049 exists


NameError: name 'bollywood_movies' is not defined