# Content-Based Recommender System using Naive Bayes

This notebook implements two content-based recommendation systems on the MovieLens dataset, followed by an evaluation procedure:
1. User-specific recommender using Naive Bayes (user profile models)
2. Global recommender: a single Naive Bayes model trained on TF-IDF features of movie tags and shared by all users
3. Evaluation methodology for realistic recommendation performance

### Load and Preprocess Data

In [15]:
import pandas as pd
import os

DATA_PATH = "../ml-latest-small"

# Load the three MovieLens tables used throughout the notebook.
ratings = pd.read_csv(os.path.join(DATA_PATH, "ratings.csv"))
movies = pd.read_csv(os.path.join(DATA_PATH, "movies.csv"))
tags = pd.read_csv(os.path.join(DATA_PATH, "tags.csv"))

## 🧹 Preprocess Movie Metadata
# Collapse every tag applied to a movie into one space-separated string.
# Missing tags are dropped and the rest cast to str so that " ".join cannot
# fail on NaN or non-string entries.
tags_agg = (
    tags.dropna(subset=["tag"])
    .assign(tag=lambda df: df["tag"].astype(str))
    .groupby("movieId")["tag"]
    .apply(lambda x: " ".join(x))
    .reset_index()
)
movies = movies.merge(tags_agg, on="movieId", how="left")
# Movies without any tag get an empty string instead of NaN.
movies["tag"] = movies["tag"].fillna("")
# regex=False: "|" is the literal genre separator, not a regex alternation.
# fillna("") keeps `content` a string even if a genres value is missing.
movies["content"] = (
    movies["genres"].fillna("").str.replace("|", " ", regex=False) + " " + movies["tag"]
)

## 1. User-Specific Naive Bayes Recommender

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def user_specific_model(user_id, top_k=10):
    """Recommend unseen movies for one user with a per-user Naive Bayes model.

    A TF-IDF + MultinomialNB classifier is fitted on the ``content`` text of
    the movies this user rated. Ratings >= 4 are positives, ratings <= 2 are
    negatives, and ratings in between are ignored. Every movie the user has
    not rated is then scored with the predicted probability of the positive
    class.

    Parameters
    ----------
    user_id : int
        ``userId`` value to look up in the module-level ``ratings`` frame.
    top_k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title`` and ``score``, sorted by descending
        score. Empty (but with the same columns) when the user has no
        labelled ratings or no unseen movies.
    """
    result_columns = ['movieId', 'title', 'score']

    user_ratings = ratings[ratings['userId'] == user_id]
    user_data = pd.merge(user_ratings, movies, on='movieId')

    # Keep only clear likes/dislikes; neutral ratings carry no label.
    has_label = (user_data['rating'] >= 4) | (user_data['rating'] <= 2)
    user_data = user_data[has_label].copy()
    user_data['label'] = (user_data['rating'] >= 4).astype(int)

    if user_data.empty:
        return pd.DataFrame(columns=result_columns)

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(user_data['content'])
    y_train = user_data['label']

    model = MultinomialNB()
    model.fit(X_train, y_train)

    seen = user_ratings['movieId'].unique()
    candidate_pool = movies[~movies['movieId'].isin(seen)].copy()
    if candidate_pool.empty:
        # Nothing left to recommend; predict_proba would reject zero rows.
        return pd.DataFrame(columns=result_columns)
    X_test = tfidf.transform(candidate_pool['content'])

    # predict_proba has one column per entry of model.classes_, so a user
    # whose labels are all likes (or all dislikes) yields a single column.
    # Look the positive class up explicitly instead of assuming column 1.
    classes = list(model.classes_)
    if 1 in classes:
        candidate_pool['score'] = model.predict_proba(X_test)[:, classes.index(1)]
    else:
        # The user never liked anything, so there is no positive class to score.
        candidate_pool['score'] = 0.0

    return candidate_pool.sort_values("score", ascending=False)[result_columns].head(top_k)

### 🔍 Test it:

In [17]:
# Top recommendations for user 1 from the per-user model (top_k defaults to 10).
user_specific_model(user_id=1)

Unnamed: 0,movieId,title,score
7550,85261,Mars Needs Moms (2011),0.999852
9282,157865,Ratchet & Clank (2016),0.999852
6455,52287,Meet the Robinsons (2007),0.999852
6047,40339,Chicken Little (2005),0.999852
7170,71999,Aelita: The Queen of Mars (Aelita) (1924),0.999838
1390,1907,Mulan (1998),0.999833
9169,148775,Wizards of Waverly Place: The Movie (2009),0.99983
478,546,Super Mario Bros. (1993),0.999824
9358,161594,Kingsglaive: Final Fantasy XV (2016),0.999824
4348,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,0.999818


## 2. Global Content-Based Recommender (Single Model for All Users)

In [18]:
def train_global_model():
    """Fit a single tag-based Naive Bayes classifier on all users' ratings.

    Ratings are joined to the per-movie tag strings in ``tags_agg``; ratings
    >= 4 become label 1, ratings <= 2 become label 0, and anything in between
    is discarded. The tag text is vectorised with TF-IDF and used to fit a
    MultinomialNB model.

    Returns
    -------
    tuple
        ``(model, tfidf)``: the fitted classifier and the fitted vectorizer.
    """
    tagged = pd.merge(ratings, tags_agg, on='movieId').dropna(subset=['tag'])

    # Only unambiguous likes/dislikes are used as training examples.
    rating_col = tagged['rating']
    decisive = (rating_col >= 4) | (rating_col <= 2)
    labelled = tagged.loc[decisive].assign(
        label=lambda frame: (frame['rating'] >= 4).astype(int)
    )

    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tag_features = vectorizer.fit_transform(labelled['tag'])

    classifier = MultinomialNB()
    classifier.fit(tag_features, labelled['label'])
    return classifier, vectorizer

# Train once when this cell runs; recommend_global reads these two module-level names.
model_global, tfidf_global = train_global_model()

def recommend_global(user_id, top_k=10):
    """Rank tagged movies the user has not rated using the shared global model.

    Candidates are the rows of ``tags_agg`` whose ``movieId`` does not appear
    in the user's ratings. Each candidate's tag text is vectorised with
    ``tfidf_global`` and scored with column 1 of
    ``model_global.predict_proba``.

    Parameters
    ----------
    user_id : int
        ``userId`` whose rated movies are excluded from the candidates.
    top_k : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title`` and ``score``, highest score first.
    """
    rated_ids = ratings.loc[ratings['userId'] == user_id, 'movieId'].unique()
    not_rated = ~tags_agg['movieId'].isin(rated_ids)
    candidates = tags_agg[not_rated].copy()

    tag_features = tfidf_global.transform(candidates['tag'])
    positive_proba = model_global.predict_proba(tag_features)[:, 1]
    candidates['score'] = positive_proba

    # Attach titles, then rank by score.
    with_titles = candidates.merge(movies[['movieId', 'title']], on='movieId')
    ranked = with_titles.sort_values('score', ascending=False)
    return ranked[['movieId', 'title', 'score']].head(top_k)

### 🔍 Test it:

In [19]:
# Top recommendations for user 1 from the shared global model (top_k defaults to 10).
recommend_global(user_id=1)

Unnamed: 0,movieId,title,score
71,318,"Shawshank Redemption, The (1994)",0.996657
1256,48516,"Departed, The (2006)",0.991428
242,1193,One Flew Over the Cuckoo's Nest (1975),0.985799
1035,7361,Eternal Sunshine of the Spotless Mind (2004),0.984006
282,1276,Cool Hand Luke (1967),0.983725
62,280,Murder in the First (1995),0.983725
66,293,Léon: The Professional (a.k.a. The Professiona...,0.982317
157,904,Rear Window (1954),0.981399
1326,79132,Inception (2010),0.980381
1306,68157,Inglourious Basterds (2009),0.97701


## 3. Evaluation Methodology (Candidate Pool Strategy)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, roc_auc_score

def evaluate_user_model(user_id):
    """Evaluate the per-user Naive Bayes model on a hold-out split of one user.

    The user's ratings are split 60/40 (fixed ``random_state=42``). The model
    is trained on the training part using only clear likes (rating >= 4,
    label 1) and dislikes (rating <= 2, label 0). At test time every held-out
    rating is kept and anything below 4 counts as not relevant (label 0), so
    neutral ratings are excluded from training but count as negatives when
    scoring.

    Parameters
    ----------
    user_id : int
        ``userId`` value to look up in the module-level ``ratings`` frame.

    Returns
    -------
    dict or None
        ``{'user_id', 'precision', 'auc'}``. ``None`` when the user has fewer
        than 5 ratings or fewer than 3 labelled training rows. ``auc`` is NaN
        when the held-out labels contain a single class, because ROC AUC is
        undefined there.
    """
    user_ratings = ratings[ratings['userId'] == user_id]
    if len(user_ratings) < 5:
        return None

    train, test = train_test_split(user_ratings, test_size=0.4, random_state=42)
    train_data = pd.merge(train, movies, on='movieId')

    # Train only on unambiguous likes/dislikes.
    has_label = (train_data['rating'] >= 4) | (train_data['rating'] <= 2)
    train_data = train_data[has_label].copy()
    train_data['label'] = (train_data['rating'] >= 4).astype(int)

    if len(train_data) < 3:
        return None

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(train_data['content'])
    y_train = train_data['label']

    model = MultinomialNB()
    model.fit(X_train, y_train)

    test_data = pd.merge(test, movies, on='movieId')
    X_test = tfidf.transform(test_data['content'])
    # Every held-out rating is scored; ratings below 4 are treated as negatives.
    y_test = (test_data['rating'] >= 4).astype(int)

    preds = model.predict(X_test)
    # zero_division=0 keeps the value sklearn would warn-and-return when the
    # model predicts no positives, without emitting the warning.
    precision = precision_score(y_test, preds, zero_division=0)

    # predict_proba has one column per entry of model.classes_; if training
    # saw only one class there is no column 1, so look the positive class up.
    classes = list(model.classes_)
    if 1 in classes:
        positive_scores = model.predict_proba(X_test)[:, classes.index(1)]
    else:
        positive_scores = [0.0] * X_test.shape[0]

    # roc_auc_score raises when y_test holds a single class; report NaN instead
    # so that aggregating over many users does not crash.
    if y_test.nunique() < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(y_test, positive_scores)

    return {
        'user_id': user_id,
        'precision': precision,
        'auc': auc
    }


### 🔍 Test it:

In [21]:
# Hold-out precision and AUC of the per-user model for user 1.
evaluate_user_model(user_id=1)

{'user_id': 1,
 'precision': 0.8602150537634409,
 'auc': np.float64(0.541826923076923)}