# Interest-based Content Recommendation System
This notebook contains the patched implementation of Hybrid Recommender with:
1. Separate vectorizers for users and posts.
2. Global MinMaxScaler fitted once on training scores, reused during inference.


In [1]:
pip install scikit-surprise



In [2]:
pip install numpy==1.26.4



In [3]:

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, ndcg_score
from surprise import Dataset, Reader, SVD
from collections import defaultdict
import warnings

warnings.filterwarnings('ignore')


In [4]:

class HybridRecommender:
    def __init__(self, content_weight=0.3):
        self.content_model = lgb.LGBMClassifier(objective='binary', random_state=42, verbose=-1)
        self.collab_model = SVD(n_factors=150, random_state=42)
        self.content_weight = content_weight
        self.collab_weight = 1 - content_weight

        # Encoders & scalers
        self.user_encoder = LabelEncoder()
        self.post_encoder = LabelEncoder()
        self.user_vectorizer = TfidfVectorizer()
        self.post_vectorizer = TfidfVectorizer()
        self.age_scaler = StandardScaler()
        self.popularity_scaler = StandardScaler()
        self.content_score_scaler = MinMaxScaler()
        self.collab_score_scaler = MinMaxScaler()

        self.user_features = None
        self.post_features = None

        self.users_df = None
        self.posts_df = None
        self.all_engagements = None
        self.val_users = None
        self.val_true_engagements = defaultdict(list)

    def _prepare_features(self):
        print("Preparing features for the content-based model...")
        users, posts = self.users_df, self.posts_df

        # User features
        users['gender_encoded'] = LabelEncoder().fit_transform(users['gender'])
        users['age_scaled'] = self.age_scaler.fit_transform(users[['age']])
        user_interests = self.user_vectorizer.fit_transform(users['top_3_interests'])
        self.user_features = np.hstack([
            users[['gender_encoded', 'age_scaled', 'past_engagement_score']].values,
            user_interests.toarray()
        ])

        # Post features
        posts['content_type_encoded'] = LabelEncoder().fit_transform(posts['content_type'])
        post_tags = self.post_vectorizer.fit_transform(posts['tags'])
        posts['popularity_scaled'] = self.popularity_scaler.fit_transform(posts[['popularity_score']])

        self.post_features = np.hstack([
            posts[['content_type_encoded', 'popularity_scaled']].values,
            post_tags.toarray()
        ])

    def fit(self, users, posts, engagements):
        self.users_df = users.copy()
        self.posts_df = posts.copy()
        self.all_engagements = engagements.copy()

        print("Calculating post popularity feature...")
        popularity_df = engagements.groupby('post_id').size().reset_index(name='popularity_score')
        self.posts_df = pd.merge(self.posts_df, popularity_df, on='post_id', how='left')
        self.posts_df['popularity_score'].fillna(0, inplace=True)

        self.users_df['user_idx'] = self.user_encoder.fit_transform(self.users_df['user_id'])
        self.posts_df['post_idx'] = self.post_encoder.fit_transform(self.posts_df['post_id'])

        self._prepare_features()

        print("Splitting data into training and validation sets...")
        unique_users = users['user_id'].unique()
        train_users, self.val_users = train_test_split(unique_users, test_size=0.2, random_state=42)

        train_engagements = engagements[engagements['user_id'].isin(train_users)]
        val_engagements = engagements[engagements['user_id'].isin(self.val_users)]

        for _, row in val_engagements.iterrows():
            if row['engagement'] == 1:
                self.val_true_engagements[row['user_id']].append(row['post_id'])

        train_data = pd.merge(train_engagements, self.users_df, on='user_id')
        train_data = pd.merge(train_data, self.posts_df, on='post_id')

        X = np.hstack([self.user_features[train_data['user_idx'].values], self.post_features[train_data['post_idx'].values]])
        y = train_data['engagement']

        print("Training content-based model (LightGBM)...")
        self.content_model.fit(X, y)

        print("Training collaborative filtering model (SVD)...")
        reader = Reader(rating_scale=(0, 1))
        train_data_surprise = Dataset.load_from_df(train_engagements[['user_id', 'post_id', 'engagement']], reader)
        trainset = train_data_surprise.build_full_trainset()
        self.collab_model.fit(trainset)

        # --- Fit global scalers on training predictions ---
        print("Fitting global score scalers...")
        content_scores = self.content_model.predict_proba(X)[:, 1]
        collab_scores = [self.collab_model.predict(uid, iid).est for uid, iid in zip(train_data['user_id'], train_data['post_id'])]
        self.content_score_scaler.fit(np.array(content_scores).reshape(-1, 1))
        self.collab_score_scaler.fit(np.array(collab_scores).reshape(-1, 1))

        print("Training complete!")

    def _get_blended_scores(self, user_id, post_ids):
        post_df_subset = self.posts_df[self.posts_df['post_id'].isin(post_ids)].copy()
        if post_df_subset.empty: return pd.DataFrame(columns=['post_id', 'final_score'])

        user_idx = self.users_df.loc[self.users_df['user_id'] == user_id, 'user_idx'].iloc[0]
        user_vec = self.user_features[user_idx]
        post_indices = post_df_subset['post_idx'].values

        user_vec_repeated = np.tile(user_vec, (len(post_indices), 1))
        feature_matrix = np.hstack([user_vec_repeated, self.post_features[post_indices]])

        content_scores = self.content_model.predict_proba(feature_matrix)[:, 1]
        collab_scores = [self.collab_model.predict(user_id, post_id).est for post_id in post_df_subset['post_id']]

        norm_content = self.content_score_scaler.transform(np.array(content_scores).reshape(-1, 1)).flatten()
        norm_collab = self.collab_score_scaler.transform(np.array(collab_scores).reshape(-1, 1)).flatten()

        post_df_subset['final_score'] = (self.content_weight * norm_content) + (self.collab_weight * norm_collab)

        return post_df_subset[['post_id', 'final_score']]

    def recommend(self, user_id, k=3):
        seen_posts = set(self.all_engagements[self.all_engagements['user_id'] == user_id]['post_id'])
        candidate_post_ids = self.posts_df[~self.posts_df['post_id'].isin(seen_posts)]['post_id'].tolist()
        if not candidate_post_ids: return []
        scores_df = self._get_blended_scores(user_id, candidate_post_ids)
        top_k_posts = scores_df.sort_values('final_score', ascending=False).head(k)
        return top_k_posts['post_id'].tolist()

    def evaluate(self, n_neg_samples=100):
        print("Evaluating model performance on validation users...")
        all_precisions, all_recalls, all_aucs, all_ndcgs = [], [], [], []
        all_post_ids = self.posts_df['post_id'].unique()

        for user_id in self.val_users:
            true_positives = self.val_true_engagements.get(user_id, [])
            if not true_positives: continue

            recommendations = self.recommend(user_id, k=3)
            hits = len(set(recommendations) & set(true_positives))
            all_precisions.append(hits / len(recommendations) if recommendations else 0)
            all_recalls.append(hits / len(true_positives))

            seen_posts = set(self.all_engagements[self.all_engagements['user_id'] == user_id]['post_id'])
            possible_negatives = list(set(all_post_ids) - seen_posts - set(true_positives))
            neg_samples = np.random.choice(possible_negatives, size=min(len(possible_negatives), n_neg_samples), replace=False)
            eval_items = true_positives + list(neg_samples)
            scores_df = self._get_blended_scores(user_id, eval_items).set_index('post_id')

            y_true = np.array([1 if item in true_positives else 0 for item in scores_df.index])
            y_score = scores_df['final_score'].values

            if len(np.unique(y_true)) > 1:
                all_aucs.append(roc_auc_score(y_true, y_score))
            all_ndcgs.append(ndcg_score([y_true], [y_score], k=10))

        avg_precision = np.mean(all_precisions) if all_precisions else 0
        avg_recall = np.mean(all_recalls) if all_recalls else 0
        avg_auc = np.mean(all_aucs) if all_aucs else 0
        avg_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0

        print(f"  -> Average Precision@3: {avg_precision:.4f}")
        print(f"  -> Average Recall@3:    {avg_recall:.4f}")
        print(f"  -> Average AUC:         {avg_auc:.4f}")
        print(f"  -> Average nDCG@10:     {avg_ndcg:.4f}")
        return avg_precision, avg_recall, avg_auc, avg_ndcg


In [6]:
if __name__ == "__main__":
    try:
        users = pd.read_csv("Users.csv")
        posts = pd.read_csv("Posts.csv")
        engagements = pd.read_csv("Engagements.csv")

        users['top_3_interests'].fillna('', inplace=True)
        posts['tags'].fillna('', inplace=True)

        # --- Grid Search Setup ---
        best_auc = 0
        best_params = {}

        # Define the grid of parameters to test
        param_grid = {
            'content_weight': [0.1, 0.2, 0.3],
            'n_factors': [50, 150, 200]
        }

        # Loop through all combinations
        for weight in param_grid['content_weight']:
            for factors in param_grid['n_factors']:
                print(f"\n--- Testing with content_weight={weight}, n_factors={factors} ---")

                # Initialize and train the model with the current parameters
                recommender = HybridRecommender(content_weight=weight)
                recommender.collab_model = SVD(n_factors=factors, random_state=42) # Set n_factors here

                recommender.fit(users, posts, engagements)
                _, _, avg_auc, _ = recommender.evaluate()

                # Check if this is the best model so far
                if avg_auc > best_auc:
                    best_auc = avg_auc
                    best_params = {'content_weight': weight, 'n_factors': factors}

        print("\n==============================================")
        print(f"Grid Search Complete!")
        print(f"Best AUC Score: {best_auc:.4f}")
        print(f"Best Parameters: {best_params}")
        print("==============================================")

    except FileNotFoundError:
        print("Error: Make sure Users.csv, Posts.csv, and Engagements.csv are in the same directory.")


--- Testing with content_weight=0.1, n_factors=50 ---
Calculating post popularity feature...
Preparing features for the content-based model...
Splitting data into training and validation sets...
Training content-based model (LightGBM)...
Training collaborative filtering model (SVD)...
Fitting global score scalers...
Training complete!
Evaluating model performance on validation users...
  -> Average Precision@3: 0.0000
  -> Average Recall@3:    0.0000
  -> Average AUC:         0.5258
  -> Average nDCG@10:     0.0960

--- Testing with content_weight=0.1, n_factors=150 ---
Calculating post popularity feature...
Preparing features for the content-based model...
Splitting data into training and validation sets...
Training content-based model (LightGBM)...
Training collaborative filtering model (SVD)...
Fitting global score scalers...
Training complete!
Evaluating model performance on validation users...
  -> Average Precision@3: 0.0000
  -> Average Recall@3:    0.0000
  -> Average AUC:    

In [8]:
# --- Example run after training ---

if recommender.val_users.size > 0:
    sample_user_id = recommender.val_users[0]
    print(f"\n--- Example Recommendations for User: {sample_user_id} ---")

    user_info = users[users['user_id'] == sample_user_id]
    print(
        f"User Profile: Age={user_info['age'].iloc[0]}, "
        f"Gender={user_info['gender'].iloc[0]}, "
        f"Interests='{user_info['top_3_interests'].iloc[0]}'"
    )

    top_3_recs = recommender.recommend(sample_user_id, k=3)

    print("\nTop 3 Recommended Post IDs:")
    if top_3_recs:
        for i, post_id in enumerate(top_3_recs):
            print(f"{i+1}. {post_id}")
    else:
        print("Could not generate recommendations for this user.")



--- Example Recommendations for User: U14 ---
User Profile: Age=19, Gender=M, Interests='sports, fitness, travel'

Top 3 Recommended Post IDs:
1. P88
2. P46
3. P41
