In [453]:
# For Building recommender systems
!pip install scikit-surprise



In [454]:
import joblib
import random # For generating random numbers or making random selections
import numpy as np # For numerical operations, such as arrays and mathematical functions
import pickle # For serializing and deserializing Python objects
import pandas as pd # For data manipulation and analysis, especially with DataFrames
from surprise import SVD, Dataset, Reader # For building recommendation systems
from sklearn.preprocessing import MinMaxScaler # To scale features to a specified range, typically [0, 1]
from datetime import datetime, timedelta #For handling date and time operations

In [455]:
news_df = pd.read_csv('/content/news.csv')
rec_items_df = pd.read_csv('/content/rec_items.csv')
rec_feedback_df = pd.read_csv('/content/rec_feedback.csv')
users_df = pd.read_csv('/content/users.csv')
rec_users_df = pd.read_csv('/content/rec_users.csv')

In [456]:
# Load the CBF model (TF-IDF and Cosine Similarity Matrix)
tfidf = joblib.load('/content/tfidf_vectorizer.pkl')
cosine_sim = joblib.load('/content/cosine_similarity_matrix.pkl')

# Load the CF model (e.g., a pre-trained collaborative filtering model)
cf_model = joblib.load('/content/knn_recommender_model.pkl')

In [457]:
# Function to Get Trending News
def get_recent_trending_news(rec_feedback_df, top_n=5, days=7):
    if "timestamp" not in rec_feedback_df.columns:
        print("No timestamp column found. Using most interacted articles.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    rec_feedback_df["timestamp"] = pd.to_datetime(rec_feedback_df["timestamp"], errors='coerce')

    if rec_feedback_df["timestamp"].isnull().all():
        print("All timestamps are invalid. Using most interacted articles.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    recent_date = datetime.now() - timedelta(days=days)
    recent_engagements = rec_feedback_df[rec_feedback_df["timestamp"] >= recent_date]

    if recent_engagements.empty:
        print("No recent engagements found. Using most interacted articles.")
        return rec_feedback_df["item_id"].value_counts().head(top_n).index.tolist()

    trending = recent_engagements["item_id"].value_counts().head(top_n).index.tolist()
    return trending if trending else []

In [458]:
# Function to Calculate User Alpha (Hybrid Weighting)
def calculate_user_alpha(user_id, rec_feedback_df):
    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]
    total_interactions = len(user_interactions)
    return min(1, max(0, total_interactions / 100)) if total_interactions else 0.5

In [459]:
# Maximal Marginal Relevance (MMR) for Diversity
def max_marginal_relevance(recommendations, hybrid_scores, cosine_sim, top_n=5):
    selected_items = []
    for item in recommendations:
        similarity_score = sum(cosine_sim.get(item, {}).get(other_item, 0) for other_item in selected_items)
        adjusted_score = hybrid_scores[item] - similarity_score * 0.7
        hybrid_scores[item] = adjusted_score
    return sorted(hybrid_scores.keys(), key=lambda x: hybrid_scores[x], reverse=True)[:top_n]

In [460]:
# Normalize Scores to [0, 1] Range
def normalize_scores(scores):
    if not scores:
        return {}
    values = np.array(list(scores.values())).reshape(-1, 1)
    scaler = MinMaxScaler()
    normalized_values = scaler.fit_transform(values).flatten()
    return {key: norm_score for key, norm_score in zip(scores.keys(), normalized_values)}

In [461]:
# Content-Based Recommendation (CBF) using pre-trained cosine similarity
def recommend_content_based(news_id, news_df, top_n=5):
    # Create a mapping of news_id to index for quick lookup
    id_to_index = {news_df['id'][i]: i for i in range(len(news_df))}

    # Ensure the provided news_id exists in the dataset
    if news_id not in id_to_index:
        print(f"News ID {news_id} not found in dataset.")
        return []

    # Get the index for the last interacted news
    index = id_to_index[news_id]

    # Fetch the similarity scores for the last interacted item
    similar_items = list(enumerate(cosine_sim[index]))

    # Sort items based on similarity score in descending order
    sorted_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

    # Get the top N similar items (excluding the item itself)
    recommended_items = [news_df['id'][item[0]] for item in sorted_items[1:top_n+1]]

    return news_df[news_df['id'].isin(recommended_items)][['id', 'title']]

In [462]:
def recommend_collaborative(user_id, model, rec_feedback_df, top_n=5):
    # Get all items that the user has not interacted with yet
    all_items = rec_feedback_df['item_id'].unique()
    interacted_items = rec_feedback_df[rec_feedback_df['user_id'] == user_id]['item_id'].unique()
    items_to_predict = list(set(all_items) - set(interacted_items))

    # Get CF scores for the items
    predictions = {}
    for item in items_to_predict:
        predictions[item] = model.predict(user_id, item).est

    # Log the predictions for debugging
    print(f"CF predictions for user {user_id}: {predictions}")

    # Sort predictions and return top N
    recommended_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return [item[0] for item in recommended_items]

In [463]:
def recommend_all(user_id, rec_feedback_df, news_df, model=cf_model, top_n=5):
    if 'id' not in news_df.columns:
        raise KeyError("'id' column not found in news_df")
    if 'item_id' not in rec_feedback_df.columns:
        raise KeyError("'item_id' column not found in rec_feedback_df")

    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    if user_interactions.empty:
        print(f"No data for user {user_id}. Showing recent trending news...")
        trending_ids = get_recent_trending_news(rec_feedback_df, top_n=top_n) or random.sample(list(rec_feedback_df["item_id"].unique()), top_n)
        return {
            "CF": news_df[news_df["id"].isin(trending_ids)][["id", "title"]],
            "CBF": news_df[news_df["id"].isin(trending_ids)][["id", "title"]],
            "Hybrid": news_df[news_df["id"].isin(trending_ids)][["id", "title"]],
        }

    alpha = calculate_user_alpha(user_id, rec_feedback_df)
    last_interacted_news_id = user_interactions.iloc[-1]["item_id"]

    # Content-based filtering recommendations using pre-trained cosine similarity
    cbf_recommendations = recommend_content_based(last_interacted_news_id, news_df, top_n=top_n)

    # Collaborative filtering recommendations
    cf_recommendations = recommend_collaborative(user_id, model, rec_feedback_df, top_n=top_n)

    # Debugging: Check the structure of the recommendations
    print("CBF Recommendations:", cbf_recommendations)
    print("CF Recommendations:", cf_recommendations)

    # Check if cbf_recommendations is a list of IDs or dictionaries
    if isinstance(cbf_recommendations, list):
        # If it's a list of IDs
        cbf_ids = cbf_recommendations
    elif isinstance(cbf_recommendations, list) and isinstance(cbf_recommendations[0], dict) and 'id' in cbf_recommendations[0]:
        # If it's a list of dictionaries
        cbf_ids = [rec['id'] for rec in cbf_recommendations]
    else:
        # If neither, print an error and handle accordingly
        print("Unexpected format for CBF recommendations")
        cbf_ids = []

    # Check if cf_recommendations is a list of IDs or dictionaries
    if isinstance(cf_recommendations, list):
        # If it's a list of IDs
        cf_ids = cf_recommendations
    elif isinstance(cf_recommendations, list) and isinstance(cf_recommendations[0], dict) and 'id' in cf_recommendations[0]:
        # If it's a list of dictionaries
        cf_ids = [rec['id'] for rec in cf_recommendations]
    else:
        # If neither, print an error and handle accordingly
        print("Unexpected format for CF recommendations")
        cf_ids = []

    # Now we combine both CF and CBF recommendations into the hybrid list
    hybrid_recommendations = list(set(cbf_ids).union(cf_ids))

    # Create a mapping from news_id to index in the cosine similarity matrix
    id_to_index = {news_df['id'].iloc[i]: i for i in range(len(news_df))}

    # Ensure last_interacted_news_id is mapped to an index safely
    if last_interacted_news_id not in id_to_index:
        print(f"News ID {last_interacted_news_id} not found in index mapping.")
        last_interacted_index = None
    else:
        last_interacted_index = id_to_index[last_interacted_news_id]

    # Compute CBF Scores safely
    if last_interacted_index is not None:
        cbf_scores = {
            item: cosine_sim[last_interacted_index][id_to_index[item]]
            for item in cbf_ids if item in id_to_index
        }
    else:
        cbf_scores = {}

    # Collaborative filtering scores
    cf_scores = {item: model.predict(user_id, item).est for item in cf_ids}

    # Normalize both CBF and CF scores to a range of [0, 1]
    cbf_scores = normalize_scores(cbf_scores)
    cf_scores = normalize_scores(cf_scores)

    # Hybrid scoring: Combine CF and CBF scores
    hybrid_scores = {item: alpha * cbf_scores.get(item, 0) + (1 - alpha) * cf_scores.get(item, 0) for item in hybrid_recommendations}

    # Apply Maximal Marginal Relevance (MMR) to diversify the recommendations
    top_hybrid_recommendations = max_marginal_relevance(list(hybrid_scores.keys()), hybrid_scores, cosine_sim, top_n=top_n)

    # Return recommendations
    return {
        "CF": news_df[news_df["id"].isin(cf_ids)][["id", "title"]],
        "CBF": news_df[news_df["id"].isin(cbf_ids)][["id", "title"]],
        "Hybrid": news_df[news_df["id"].isin(top_hybrid_recommendations)][["id", "title"]],
    }

In [464]:
user_id = 2329
recommendations = recommend_all(user_id, rec_feedback_df, news_df)

# Output for CF Recommendations
print(f"\nTop CF recommendations for User {user_id}:")
print("="*40)
print(recommendations["CF"].to_string(index=False))

# Output for CBF Recommendations
print(f"\nTop CBF recommendations for User {user_id}:")
print("="*40)
print(recommendations["CBF"].to_string(index=False))

# Output for Hybrid Recommendations
print(f"\nTop Hybrid recommendations for User {user_id}:")
print("="*40)
print(recommendations["Hybrid"].to_string(index=False))

CF predictions for user 2329: {23565: 1, 16432: 1, 20016: 1, 23603: 1, 23609: 1, 22090: 1, 23118: 1, 22612: 1, 23639: 1, 22618: 1, 18530: 1, 23140: 1, 23652: 1, 23653: 1, 17513: 1, 22307: 1, 23149: 1, 23150: 1, 23662: 1, 23664: 1, 22641: 1, 17521: 1, 23666: 1, 23668: 1, 23679: 1, 16519: 1, 23689: 1, 23692: 1, 23693: 1, 23695: 1, 17557: 1, 23702: 1, 23708: 1, 23711: 1, 23712: 1, 23713: 1, 23715: 1, 23718: 1, 18599: 1, 23721: 1, 23722: 1, 23723: 1, 23732: 1, 23733: 1, 23735: 1, 17596: 1, 23740: 1, 23742: 1, 23232: 1, 23744: 1, 23745: 1, 23747: 1, 17604: 1, 23749: 1, 23750: 1, 23239: 1, 23240: 1, 21705: 1, 23754: 1, 23753: 1, 23756: 1, 22733: 1, 23759: 1, 23760: 1, 23761: 1, 23763: 1, 23764: 1, 23254: 1, 22743: 1, 23766: 1, 23767: 1, 23769: 1, 23770: 1, 23771: 1, 23774: 1, 23778: 1, 23780: 1, 23781: 1, 23782: 1, 23783: 1, 23786: 1, 23275: 1, 16620: 1, 23788: 1, 23789: 1, 23791: 1, 23794: 1, 23795: 1, 23796: 1, 23797: 1, 23799: 1, 23800: 1, 23801: 1, 23804: 1, 23293: 1, 19709: 1, 23805: 1,

In [465]:
print("\nUnique user IDs in the feedback data:")
print(rec_feedback_df['user_id'].unique())


Unique user IDs in the feedback data:
[1182 1058  698 1203 1045 2303 1760  700  695 1221  795 1376  268  482
  417  787 1980 1427 1783 1147  797 1362  704 1245  891 1312 2280  678
 1490  257  258 1546  777 2329 1144  726 1244 2292 1958 2095  469 1483
 1993 2126 1770 2301  650  691  848  645 1984  286 1021  886  980 1093
 1814 1375  662 1417  646  714 1798 2163 1356  270  333 1085  876 1477
 1755  273 2258 1042  262 2115 1184 1108 1602  930 1200 1540  272 1717
 1526  707 1495 2203 1482  737 1979 1192  287 1397  425  349  676 2252
  261 2139 1711 1126 2281 1749  979 1349  679]
