Load your Indian movie dataset files (ratings and movie info)

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse.linalg import svds
from scipy.sparse import hstack

# Load the ratings and the movie metadata from the Indian movie dataset CSV files.
ratings = pd.read_csv("indian_ratings.csv")  # expected columns: user_id, movie_id, rating
movies = pd.read_csv("indian_movies.csv")    # expected columns: movie_id, title, genres

# Adjust the column names to match your dataset's schema.
# The rest of the notebook assumes 'genres' is a single '|'-separated string,
# e.g. "Action|Drama|Romance".


Preprocess movies: build the genre indicator matrix and the TF-IDF title features

In [4]:
# Missing genre strings become '' so such movies simply get an all-zero genre row.
movies['genres'] = movies['genres'].fillna('')

# One 0/1 indicator column per distinct genre (columns are named after the genre itself).
genres_split = movies['genres'].str.get_dummies(sep='|')
genre_matrix = genres_split.to_numpy()

# TF-IDF over the titles (swap in a synopsis column if the dataset has one).
title_text = movies['title'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
title_tfidf = tfidf.fit_transform(title_text)

# Content features = genre indicators followed by the title TF-IDF terms, one row per movie.
content_features = hstack((genre_matrix, title_tfidf))


Prepare the ratings matrix and compute SVD-based collaborative filtering predictions

In [6]:
n_users = ratings.user_id.nunique()
n_items = ratings.movie_id.nunique()

# User x movie rating table; rows/columns are the sorted unique ids found in `ratings`.
# Unrated cells are filled with 0.
ratings_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
R = ratings_matrix.values

# NOTE(review): the per-user mean is taken over every cell, including the 0s that stand
# for "not rated", so it is lower than the mean of the ratings the user actually gave.
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# svds needs 1 <= k < min(R.shape); cap at 50 latent factors.
k = min(50, min(R_demeaned.shape) - 1)
if k < 1:
    raise ValueError(
        f"Ratings matrix of shape {R_demeaned.shape} is too small for a truncated SVD "
        "(need at least 2 users and 2 movies)."
    )
U, sigma, Vt = svds(R_demeaned, k=k)
sigma = np.diag(sigma)

# Low-rank reconstruction, with each user's mean added back.
pred_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# Label predictions with the real user/movie ids from the pivot. Using 1..n here would
# silently mislabel rows/columns whenever ids are not exactly the contiguous range 1..n.
cf_preds_df = pd.DataFrame(pred_ratings, index=ratings_matrix.index, columns=ratings_matrix.columns)


Compute item-item similarity matrix (content-based)

In [7]:
# Pairwise cosine similarity between the rows of content_features.
# Row/column i of item_sim corresponds to row i of content_features (one row per movie).
item_sim = cosine_similarity(content_features)


Map movie IDs to matrix indices

In [8]:
# movie_id -> row position in `movies`, and the inverse lookup.
movie_id_to_idx = dict(zip(movies['movie_id'], range(len(movies))))
idx_to_movie_id = dict(zip(movie_id_to_idx.values(), movie_id_to_idx.keys()))


Define the hybrid recommender function

In [9]:
def hybrid_recommend(user_id, alpha=0.7, N=10):
    """Recommend up to N movies the user has not rated, blending CF and content scores.

    The score of each movie in `movies` is
    ``alpha * cf + (1 - alpha) * content`` where
    * ``cf`` is the user's row of `cf_preds_df`, min-max scaled to [0, 1]
      (0 for movies that have no column in `cf_preds_df`), and
    * ``content`` is the mean `item_sim` similarity to the movies the user rated
      >= 4, min-max scaled to [0, 1] (0 when the user has no such movie in `movies`).

    Parameters
    ----------
    user_id : label of a row in `cf_preds_df`.
    alpha : float, weight of the collaborative-filtering score.
    N : int, maximum number of recommendations to return.

    Returns
    -------
    list of (movie_id, title, score) tuples, highest score first.

    Raises
    ------
    KeyError
        If `user_id` is not a row label of `cf_preds_df`.
    """
    # Collaborative-filtering predictions for this user, scaled to [0, 1].
    cf_row = cf_preds_df.loc[user_id]
    cf_scaled = MinMaxScaler().fit_transform(cf_row.to_numpy().reshape(-1, 1)).flatten()
    cf_norm = dict(zip(cf_row.index, cf_scaled))

    # All of this user's ratings (filtered once, reused for "liked" and "already rated").
    user_ratings = ratings[ratings.user_id == user_id]
    liked_items = user_ratings.loc[user_ratings.rating >= 4, 'movie_id'].values
    liked_idxs = [movie_id_to_idx[iid] for iid in liked_items if iid in movie_id_to_idx]

    # Branch on the resolved indices, not on the raw ids: if none of the liked ids exist
    # in `movies`, averaging an empty selection would yield NaN scores.
    if liked_idxs:
        content_scores = item_sim[liked_idxs].mean(axis=0)  # average similarity with liked items
        content_scaled = MinMaxScaler().fit_transform(content_scores.reshape(-1, 1)).flatten()
        content_norm = dict(zip(movies.movie_id, content_scaled))
    else:
        content_norm = {}  # .get(..., 0) below supplies the zero content score

    # Blend the two scores, skipping movies the user has already rated.
    rated = set(user_ratings.movie_id.values)
    hybrid_scores = {
        iid: alpha * cf_norm.get(iid, 0) + (1 - alpha) * content_norm.get(iid, 0)
        for iid in movies.movie_id
        if iid not in rated
    }

    # Sort and keep the top N.
    topN = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:N]

    # Build the id -> title lookup once (first title wins for a repeated movie_id).
    title_by_id = movies.drop_duplicates('movie_id').set_index('movie_id')['title']
    return [(iid, title_by_id.loc[iid], score) for iid, score in topN]


Use the hybrid recommender

In [10]:
# Top-10 hybrid recommendations for user 1, printed as "id | score | title".
top_recommendations = hybrid_recommend(user_id=1, alpha=0.7, N=10)
for movie_id, title, score in top_recommendations:
    print(f"{movie_id} | {score:.4f} | {title}")


12 | 0.2421 | Zindagi Na Milegi Dobara
4 | 0.2421 | PK
9 | 0.1742 | Taare Zameen Par
17 | 0.1671 | Padmaavat
3 | 0.1593 | Dangal
6 | 0.1593 | Bajrangi Bhaijaan
16 | 0.1593 | Tanhaji
10 | 0.1590 | Gully Boy
20 | 0.1590 | Secret Superstar
5 | 0.1590 | Queen
