# Content-Based Recommender System using Naive Bayes

This notebook builds content-based recommenders on the MovieLens dataset. It covers two recommendation systems and an evaluation methodology:
1. User-specific recommender using Naive Bayes (user profile models)
2. Global recommender using the Kronecker product of user/item features
3. Evaluation methodology for realistic recommendation performance

## 1. Load and Preprocess Data

In [None]:
import pandas as pd
import os

# Directory holding the MovieLens CSV files, relative to the notebook's
# working directory.
DATA_PATH = "../ml-latest-small"

# Read the three tables in the same order as before: ratings, movies, tags.
ratings_df, movies_df, tags_df = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ("ratings.csv", "movies.csv", "tags.csv")
)

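> **Note:** a previous run of this cell failed with `FileNotFoundError: [Errno 2] No such file or directory: 'ratings.csv'`. If you hit this error, check that `DATA_PATH` points at the extracted `ml-latest-small` folder.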

### Merge and clean metadata

In [None]:
# Collapse each movie's tags into one space-separated string. Missing tags are
# dropped and the rest coerced to str, because ' '.join raises TypeError as
# soon as it meets a float NaN.
tags_agg = (
    tags_df.groupby('movieId')['tag']
    .apply(lambda tags: ' '.join(tags.dropna().astype(str)))
    .reset_index()
)

# Left join keeps movies that nobody tagged; they get an empty tag string.
movies_meta = pd.merge(movies_df, tags_agg, on='movieId', how='left')
movies_meta['tag'] = movies_meta['tag'].fillna('')

# regex=False: '|' is the literal genre separator here, not a regex
# alternation, and the default for `regex` differs between pandas versions.
movies_meta['content'] = (
    movies_meta['genres'].fillna('').str.replace('|', ' ', regex=False)
    + ' '
    + movies_meta['tag']
)

## 2. User-Specific Recommender using Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def train_user_model(user_id, top_k=10):
    """Fit a per-user Naive Bayes profile and rank the user's unseen movies.

    Ratings >= 4 are treated as "liked" (label 1) and ratings <= 2 as
    "disliked" (label 0); ratings in between are left out of training.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        top_k: Maximum number of recommendations to return.

    Returns:
        A DataFrame with columns ``movieId``, ``title`` and ``score`` (the
        predicted probability of label 1), sorted by descending score, or
        ``None`` when the user has no liked/disliked ratings to train on.
    """
    user_data = ratings_df[ratings_df['userId'] == user_id]
    rated_items = pd.merge(user_data, movies_meta, on='movieId')

    # Keep only clearly positive or clearly negative ratings.
    is_liked = rated_items['rating'] >= 4
    is_disliked = rated_items['rating'] <= 2
    rated_items = rated_items[is_liked | is_disliked].copy()
    if rated_items.empty:
        return None
    rated_items['label'] = (rated_items['rating'] >= 4).astype(int)

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X = tfidf.fit_transform(rated_items['content'])
    y = rated_items['label']

    model = MultinomialNB()
    model.fit(X, y)

    # Candidates are every movie the user has not rated at all (including
    # neutral ratings that were excluded from training).
    seen_ids = user_data['movieId'].unique()
    candidate_pool = movies_meta[~movies_meta['movieId'].isin(seen_ids)].copy()

    # predict_proba has one column per class seen during fit. A user with only
    # likes or only dislikes produces a single column, so the positive class
    # is looked up in classes_ instead of being assumed to sit at index 1.
    classes = list(model.classes_)
    candidate_pool['score'] = 0.0
    if not candidate_pool.empty and 1 in classes:
        X_test = tfidf.transform(candidate_pool['content'])
        candidate_pool['score'] = model.predict_proba(X_test)[:, classes.index(1)]

    ranked = candidate_pool[['movieId', 'title', 'score']].sort_values('score', ascending=False)
    return ranked.head(top_k)

### 🔍 Test it with a user:

In [None]:
# Top-10 (the default top_k) recommendations for user 1. The result is None
# when train_user_model finds no ratings >= 4 or <= 2 for that user.
recommendations_user = train_user_model(user_id=1)
recommendations_user

## 3. Global Recommender using Kronecker Product

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# One "document" per user: all tags that user applied, joined by spaces.
# NaN tags are dropped before joining, since ' '.join raises TypeError on a
# float NaN (a fillna after the join would be too late to prevent that).
user_tags = (
    tags_df.groupby('userId')['tag']
    .apply(lambda tags: ' '.join(tags.dropna().astype(str)))
    .reset_index()
)
user_tags['tag'] = user_tags['tag'].fillna('')

# Item features: TF-IDF over each movie's genres + tags, capped at 100 terms.
item_vectorizer = TfidfVectorizer(max_features=100)
item_vectors = item_vectorizer.fit_transform(movies_meta['content'])
item_df = pd.DataFrame(item_vectors.toarray(), index=movies_meta['movieId'])

# User features: TF-IDF over each user's tags, capped at 100 terms. Only users
# that appear in tags_df get a row in user_df.
user_vectorizer = TfidfVectorizer(max_features=100)
user_vectors = user_vectorizer.fit_transform(user_tags['tag'])
user_df = pd.DataFrame(user_vectors.toarray(), index=user_tags['userId'])

def kronecker(user_vec, item_vec):
    """Return the Kronecker product of ``user_vec`` and ``item_vec``.

    For two 1-D vectors the result is a 1-D vector holding every pairwise
    product ``user_vec[i] * item_vec[j]``, ordered with ``i`` varying slowest.
    """
    pairwise_products = np.kron(user_vec, item_vec)
    return pairwise_products

train_data = []
labels = []

# Only ratings whose user AND movie both have a feature vector can be used.
has_features = (
    ratings_df['userId'].isin(user_df.index)
    & ratings_df['movieId'].isin(item_df.index)
)
# Ratings strictly between 2 and 4 are neutral and carry no label. Filtering
# them out up front avoids building a Kronecker vector that is then discarded.
is_labeled = (ratings_df['rating'] >= 4) | (ratings_df['rating'] <= 2)

# itertuples keeps the original row order and each column's dtype.
for row in ratings_df[has_features & is_labeled].itertuples(index=False):
    user_vec = user_df.loc[row.userId].values
    item_vec = item_df.loc[row.movieId].values
    train_data.append(kronecker(user_vec, item_vec))
    labels.append(1 if row.rating >= 4 else 0)

if not train_data:
    # np.vstack would raise a less helpful ValueError on an empty list.
    raise ValueError(
        "No liked/disliked ratings with both user and item features; "
        "cannot train the global model."
    )

# NOTE(review): rows are dense vectors of len(user_vec) * len(item_vec)
# entries each, so memory grows quickly with the number of usable ratings.
X_global = np.vstack(train_data)
y_global = np.array(labels)

model_global = MultinomialNB()
model_global.fit(X_global, y_global)

### 🔍 Global Recommendation for User:

In [None]:
def recommend_global(user_id, top_k=10):
    """Rank a user's unrated movies with the global Kronecker-feature model.

    Args:
        user_id: Label looked up in ``user_df.index``.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ``movieId``, ``score`` and ``title``, sorted
        by descending score (empty when the user has rated every movie in
        ``item_df``). An empty list is returned when ``user_id`` has no row in
        ``user_df``; that return value is kept for backward compatibility.
    """
    if user_id not in user_df.index:
        return []

    user_vec = user_df.loc[user_id].values
    seen = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    candidates = item_df[~item_df.index.isin(seen)]

    if candidates.empty:
        # Nothing left to score; np.vstack would raise on an empty list.
        return pd.DataFrame({'movieId': [], 'score': [], 'title': []})

    features = np.vstack([kronecker(user_vec, item_vec) for item_vec in candidates.values])

    # predict_proba has one column per class seen during fit, so the positive
    # class is looked up in classes_ rather than assumed to be column 1.
    classes = list(model_global.classes_)
    if 1 in classes:
        probs = model_global.predict_proba(features)[:, classes.index(1)]
    else:
        probs = np.zeros(len(candidates))

    results = pd.DataFrame({
        'movieId': candidates.index,
        'score': probs,
    }).merge(movies_df[['movieId', 'title']], on='movieId')

    return results.sort_values('score', ascending=False).head(top_k)

### 🔍 Test it:

In [None]:
# Top-10 (the default top_k) global-model recommendations for user 1.
# recommend_global returns an empty list when the user has no row in user_df.
recommendations_global = recommend_global(user_id=1)
recommendations_global

## 4. Evaluation Methodology (Per-User Hold-Out Split)

```python
from sklearn.metrics import precision_score, roc_auc_score

def evaluate_user(user_id):
    """Evaluate a per-user Naive Bayes model on a held-out 40% of the ratings.

    The user's ratings are split 60/40 with a fixed random seed. Training uses
    ratings >= 4 as positives and ratings <= 2 as negatives, dropping the
    ratings in between. Every held-out rating is kept for testing and counts
    as positive only when it is >= 4.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.

    Returns:
        A dict with keys ``user_id``, ``precision`` and ``roc_auc``, or
        ``None`` when the user has fewer than 5 ratings, the training split
        contains no liked/disliked ratings, or no held-out movie has metadata.
        ``roc_auc`` is NaN when the held-out labels contain a single class,
        because ROC AUC is undefined in that case.
    """
    user_data = ratings_df[ratings_df['userId'] == user_id]
    if len(user_data) < 5:
        return None

    train, test = train_test_split(user_data, test_size=0.4, random_state=42)
    train_model_data = pd.merge(train, movies_meta, on='movieId')

    # Keep only clearly positive or clearly negative training ratings; copy so
    # that adding the label column does not write into a filtered view.
    is_polar = (train_model_data['rating'] >= 4) | (train_model_data['rating'] <= 2)
    train_model_data = train_model_data[is_polar].copy()
    if train_model_data.empty:
        # Fitting TF-IDF on zero documents would raise.
        return None
    train_model_data['label'] = (train_model_data['rating'] >= 4).astype(int)

    tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = tfidf.fit_transform(train_model_data['content'])
    y_train = train_model_data['label']

    model = MultinomialNB()
    model.fit(X_train, y_train)

    test_data = pd.merge(test, movies_meta, on='movieId')
    if test_data.empty:
        return None
    X_test = tfidf.transform(test_data['content'])
    y_test = (test_data['rating'] >= 4).astype(int)

    preds = model.predict(X_test)
    # zero_division=0 reports precision 0 (instead of warning) when the model
    # predicts no positives at all.
    precision = precision_score(y_test, preds, zero_division=0)

    # predict_proba has one column per class seen during fit, so the positive
    # class is looked up in classes_ rather than assumed to be column 1.
    classes = list(model.classes_)
    if 1 in classes:
        scores = model.predict_proba(X_test)[:, classes.index(1)]
    else:
        scores = [0.0] * len(y_test)

    # roc_auc_score raises when only one class is present in y_test.
    auc = roc_auc_score(y_test, scores) if y_test.nunique() == 2 else float('nan')

    return {"user_id": user_id, "precision": precision, "roc_auc": auc}

# Run the evaluation for user 1; the result is None if the user has fewer than 5 ratings.
evaluate_user(1)
```

## ✅ Done!
You now have both user-specific and global content-based recommendation systems, along with a realistic evaluation setup.