In [186]:
# For Building recommender systems
!pip install scikit-surprise



In [187]:
import joblib
import random # For generating random numbers or making random selections
import numpy as np # For numerical operations, such as arrays and mathematical functions
import pickle # For serializing and deserializing Python objects
import pandas as pd # For data manipulation and analysis, especially with DataFrames
from surprise import SVD, Dataset, Reader # For building recommendation systems
from sklearn.preprocessing import MinMaxScaler # To scale features to a specified range, typically [0, 1]
from datetime import datetime, timedelta #For handling date and time operations

In [188]:
# Load the input tables from CSV files under /content.
# Columns the cells below rely on:
#   news_df          -> 'id', 'title'
#   rec_feedback_df  -> 'user_id', 'item_id' and, optionally, 'timestamp'
# NOTE(review): rec_items_df, users_df and rec_users_df are not referenced by the
# cells visible in this notebook excerpt - confirm they are still needed.
news_df = pd.read_csv('/content/news.csv')
rec_items_df = pd.read_csv('/content/rec_items.csv')
rec_feedback_df = pd.read_csv('/content/rec_feedback.csv')
users_df = pd.read_csv('/content/users.csv')
rec_users_df = pd.read_csv('/content/rec_users.csv')

In [189]:
# Load the CBF model (TF-IDF and Cosine Similarity Matrix)
# SECURITY: joblib.load unpickles the file and can execute arbitrary code while
# doing so - only load artifacts that come from a trusted source.
# NOTE(review): tfidf is not used by the cells visible in this excerpt.
tfidf = joblib.load('/content/tfidf_vectorizer.pkl')
# cosine_sim is indexed by row position further down; presumably row/column i
# corresponds to row i of news_df - verify it was built from the same news.csv.
cosine_sim = joblib.load('/content/cosine_similarity_matrix.pkl')

# Load the CF model (e.g., a pre-trained collaborative filtering model)
# The recommenders below call cf_model.predict(user_id, item_id).est on it.
cf_model = joblib.load('/content/svd_recommender_model.pkl')

In [190]:
# Function to Get Trending News
def get_recent_trending_news(rec_feedback_df, top_n=5, days=7):
    """Return the ids of the most-interacted items within the last ``days`` days.

    Falls back to the most-interacted items over the whole table (and prints a
    short notice) when there is no ``timestamp`` column, when no timestamp can
    be parsed, or when nothing falls inside the recent window.

    Args:
        rec_feedback_df: Feedback table with an ``item_id`` column and,
            optionally, a ``timestamp`` column. It is not modified.
        top_n: Maximum number of item ids to return.
        days: Size of the "recent" window, counted back from ``datetime.now()``.

    Returns:
        list: Up to ``top_n`` item ids, most frequent first.
    """
    def most_interacted(frame):
        # Rank items by number of feedback rows and keep the top_n ids.
        return frame["item_id"].value_counts().head(top_n).index.tolist()

    if "timestamp" not in rec_feedback_df.columns:
        print("No timestamp column found. Using most interacted articles.")
        return most_interacted(rec_feedback_df)

    # Parse into a local Series. Assigning the parsed values back onto the
    # caller's frame would silently change its column dtype as a side effect.
    timestamps = pd.to_datetime(rec_feedback_df["timestamp"], errors='coerce')

    if timestamps.isnull().all():
        print("All timestamps are invalid. Using most interacted articles.")
        return most_interacted(rec_feedback_df)

    # NOTE(review): the cutoff is a naive local datetime; timezone-aware
    # timestamps would not be comparable with it - confirm the data is naive.
    recent_date = datetime.now() - timedelta(days=days)
    recent_engagements = rec_feedback_df[timestamps >= recent_date]

    if recent_engagements.empty:
        print("No recent engagements found. Using most interacted articles.")
        return most_interacted(rec_feedback_df)

    return most_interacted(recent_engagements)

In [191]:
# Function to Calculate User Alpha (Hybrid Weighting)
def calculate_user_alpha(user_id, rec_feedback_df):
    """Derive the hybrid blending weight for a user from their feedback volume.

    A user with no feedback rows gets 0.5. Otherwise the weight is the number
    of rows divided by 100, clamped to the range [0, 1].
    """
    belongs_to_user = rec_feedback_df["user_id"] == user_id
    interaction_count = len(rec_feedback_df[belongs_to_user])

    if not interaction_count:
        return 0.5

    share_of_hundred = interaction_count / 100
    lower_bounded = max(0, share_of_hundred)
    return min(1, lower_bounded)

In [192]:
# Maximal Marginal Relevance (MMR) for Diversity
def max_marginal_relevance(recommendations, hybrid_scores, cosine_sim, top_n=5,
                           diversity_weight=0.7, id_to_index=None):
    """Greedily pick ``top_n`` items, trading relevance against redundancy.

    At each step the remaining candidate with the highest
    ``hybrid_scores[item] - diversity_weight * sum(similarity to picked items)``
    is selected. The previous implementation never recorded any picked item, so
    the penalty was always zero and the result was a plain sort by score.

    Args:
        recommendations: Candidate item ids. Duplicates are ignored; ties are
            broken in favour of the earlier candidate.
        hybrid_scores: Mapping of item id -> relevance score. Not modified.
        cosine_sim: Either a nested mapping ``{item: {other: similarity}}`` (or
            any object with a compatible ``.get``), or a positional matrix used
            together with ``id_to_index``. A matrix without ``id_to_index``
            contributes no penalty, because item ids cannot be mapped to rows.
        top_n: Maximum number of items to return.
        diversity_weight: Multiplier applied to the summed similarity penalty.
        id_to_index: Optional mapping of item id -> row/column position in a
            positional ``cosine_sim`` matrix.

    Returns:
        list: Selected item ids in pick order (best first).

    Raises:
        KeyError: If a candidate has no entry in ``hybrid_scores``.
    """
    def similarity(item, other_item):
        if hasattr(cosine_sim, "get"):
            return cosine_sim.get(item, {}).get(other_item, 0)
        if id_to_index is not None and item in id_to_index and other_item in id_to_index:
            return cosine_sim[id_to_index[item]][id_to_index[other_item]]
        return 0

    # De-duplicate while keeping the caller's order so tie-breaking is stable.
    remaining = list(dict.fromkeys(recommendations))
    relevance = {item: hybrid_scores[item] for item in remaining}

    selected_items = []
    while remaining and len(selected_items) < top_n:
        def adjusted_score(item):
            penalty = sum(similarity(item, chosen) for chosen in selected_items)
            return relevance[item] - diversity_weight * penalty

        # max() returns the first maximal element, matching a stable sort.
        best = max(remaining, key=adjusted_score)
        selected_items.append(best)
        remaining.remove(best)

    return selected_items

In [193]:
# Normalize Scores to [0, 1] Range
def normalize_scores(scores):
    """Rescale the values of a score mapping with ``MinMaxScaler``.

    An empty (or otherwise falsy) input yields an empty dict. Otherwise the
    keys are kept, in order, and each value is replaced by its scaled value.
    """
    if scores:
        labels = list(scores.keys())
        # MinMaxScaler expects a 2-D array, so present the values as one column.
        column = np.array([scores[label] for label in labels]).reshape(-1, 1)
        rescaled = MinMaxScaler().fit_transform(column).flatten()
        return dict(zip(labels, rescaled))
    return {}

In [194]:
# Content-Based Recommendation (CBF)
def recommend_content_based(news_id, top_n=5):
    """Return the articles most similar to ``news_id`` according to ``cosine_sim``.

    Reads the notebook-level ``news_df`` and ``cosine_sim``. Row ``i`` of
    ``cosine_sim`` is taken to describe the article at position ``i`` of
    ``news_df`` (NOTE(review): confirm both were built from the same data).

    Args:
        news_id: Id of the reference article (a value of ``news_df['id']``).
        top_n: Maximum number of similar articles to return.

    Returns:
        pandas.DataFrame: Columns ``id`` and ``title``, most similar first. An
        empty frame with the same columns is returned when ``news_id`` is
        unknown, so callers always get the same type back.
    """
    # Map ids to row *positions*; a label lookup (news_df['id'][i]) would break
    # on any frame whose index is not the default 0..n-1 range.
    id_to_index = {item_id: position for position, item_id in enumerate(news_df['id'].tolist())}

    if news_id not in id_to_index:
        print(f"News ID {news_id} not found in dataset.")
        return news_df.iloc[0:0][['id', 'title']]

    index = id_to_index[news_id]
    similarities = np.asarray(cosine_sim[index], dtype=float).ravel()

    # Stable descending order, then drop the query article explicitly rather
    # than assuming it is ranked first (duplicates can tie with it).
    ranked_positions = np.argsort(-similarities, kind="stable")
    ranked_positions = ranked_positions[ranked_positions != index][:max(top_n, 0)]

    return news_df.iloc[ranked_positions][['id', 'title']]

In [195]:
# Collaborative Filtering Recommendation (CF)
def recommend_collaborative(user_id, model, rec_feedback_df, top_n=5):
    """Rank every item seen in the feedback table by the model's estimate for the user.

    Returns an empty list when ``user_id`` has no row in ``rec_feedback_df``;
    otherwise the ``top_n`` item ids with the highest ``model.predict(...).est``.
    """
    known_users = rec_feedback_df['user_id'].values
    if user_id not in known_users:
        return []

    estimates = {}
    for candidate in rec_feedback_df['item_id'].unique():
        estimates[candidate] = model.predict(user_id, candidate).est

    ranked = sorted(estimates, key=lambda candidate: estimates[candidate], reverse=True)
    return ranked[:top_n]

In [196]:
# Hybrid Recommendation (CF + CBF)
def _titles_for_ids(news_df, item_ids):
    """Return the ``id``/``title`` rows of ``news_df`` whose id is in ``item_ids``."""
    return news_df[news_df["id"].isin(item_ids)][["id", "title"]]


def recommend_all(user_id, rec_feedback_df, news_df, model=cf_model, top_n=5):
    """Produce CF, CBF and hybrid recommendations for one user.

    Users without any feedback get the recent trending items (or a random
    sample of known items if that is empty) for all three lists. For everyone
    else, CBF candidates are seeded from the last row of the user's feedback,
    CF candidates come from ``model``, both score sets are normalised and
    blended with the user's alpha, and the blend is passed through
    ``max_marginal_relevance``.

    Args:
        user_id: Id of the user to recommend for.
        rec_feedback_df: Feedback table with ``user_id`` and ``item_id`` columns.
        news_df: Article table with ``id`` and ``title`` columns.
        model: Collaborative-filtering model exposing ``predict(uid, iid).est``.
        top_n: Number of recommendations per list.

    Returns:
        dict: Keys ``"CF"``, ``"CBF"`` and ``"Hybrid"``, each a DataFrame with
        ``id`` and ``title`` columns. The CF and Hybrid frames list rows in
        ``news_df`` order, not in ranking order.

    Raises:
        KeyError: If ``news_df`` lacks ``id`` or ``rec_feedback_df`` lacks ``item_id``.
    """
    if 'id' not in news_df.columns:
        raise KeyError("'id' column not found in news_df")
    if 'item_id' not in rec_feedback_df.columns:
        raise KeyError("'item_id' column not found in rec_feedback_df")

    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    if user_interactions.empty:
        print(f"No data for user {user_id}. Showing recent trending news...")
        trending_ids = get_recent_trending_news(rec_feedback_df, top_n=top_n)
        if not trending_ids:
            # random.sample raises ValueError when asked for more items than
            # exist, so cap the sample size at the size of the pool.
            pool = list(rec_feedback_df["item_id"].unique())
            trending_ids = random.sample(pool, max(0, min(top_n, len(pool))))
        return {
            "CF": _titles_for_ids(news_df, trending_ids),
            "CBF": _titles_for_ids(news_df, trending_ids),
            "Hybrid": _titles_for_ids(news_df, trending_ids),
        }

    alpha = calculate_user_alpha(user_id, rec_feedback_df)
    # NOTE(review): "last interacted" is the last row for this user, which is
    # only the most recent interaction if the table is in chronological order.
    last_interacted_news_id = user_interactions.iloc[-1]["item_id"]

    # Content-based filtering recommendations
    cbf_recommendations = recommend_content_based(last_interacted_news_id, top_n=top_n)
    if not isinstance(cbf_recommendations, pd.DataFrame):
        # The CBF step may hand back a bare list for an unknown article;
        # normalise to an empty frame so the column lookups below cannot fail.
        cbf_recommendations = news_df.iloc[0:0][["id", "title"]]
    cbf_ids = list(cbf_recommendations["id"])

    # Collaborative filtering recommendations
    cf_recommendations = recommend_collaborative(user_id, model, rec_feedback_df, top_n=top_n)

    # Map each news id to its row position, which is how cosine_sim is indexed.
    # Positions are used instead of index labels so a filtered or re-indexed
    # news_df does not raise KeyError.
    id_to_index = {news_id: position for position, news_id in enumerate(news_df["id"].tolist())}

    last_interacted_index = id_to_index.get(last_interacted_news_id)
    if last_interacted_index is None:
        print(f"News ID {last_interacted_news_id} not found in index mapping.")
        cbf_scores = {}
    else:
        cbf_scores = {
            item: cosine_sim[last_interacted_index][id_to_index[item]]
            for item in cbf_ids if item in id_to_index
        }

    # Collaborative filtering scores
    cf_scores = {item: model.predict(user_id, item).est for item in cf_recommendations}

    # Normalize both CBF and CF scores to a range of [0, 1]
    cbf_scores = normalize_scores(cbf_scores)
    cf_scores = normalize_scores(cf_scores)

    # Hybrid scoring over the union of both candidate lists. An ordered
    # de-duplication (rather than a set) keeps tie-breaking deterministic.
    candidates = list(dict.fromkeys([*cbf_ids, *cf_recommendations]))
    hybrid_scores = {
        item: alpha * cbf_scores.get(item, 0) + (1 - alpha) * cf_scores.get(item, 0)
        for item in candidates
    }

    # max_marginal_relevance looks similarities up by item id through nested
    # .get calls, whereas cosine_sim is indexed by row position. Give it an
    # id-keyed view restricted to the candidates that have a row in the matrix.
    n_rows = cosine_sim.shape[0] if hasattr(cosine_sim, "shape") else len(cosine_sim)
    positions = {
        item: id_to_index[item]
        for item in candidates
        if item in id_to_index and id_to_index[item] < n_rows
    }
    pairwise_similarity = {
        item: {
            other: cosine_sim[item_pos][other_pos]
            for other, other_pos in positions.items() if other != item
        }
        for item, item_pos in positions.items()
    }

    # Apply Maximal Marginal Relevance (MMR) to diversify the recommendations
    top_hybrid_recommendations = max_marginal_relevance(
        list(hybrid_scores.keys()), hybrid_scores, pairwise_similarity, top_n=top_n
    )

    return {
        "CF": _titles_for_ids(news_df, cf_recommendations),
        "CBF": cbf_recommendations,
        "Hybrid": _titles_for_ids(news_df, top_hybrid_recommendations),
    }

In [199]:
# Get Recommendations for a User
user_id = 2203
recommendations = recommend_all(user_id, rec_feedback_df, news_df)

# Print each recommender's table under its own heading, without the index column.
report_sections = (
    (f"\nTop CF recommendations for User {user_id}:", "CF"),
    (f"\nTop CBF recommendations for User {user_id}:", "CBF"),
    (f"\nTop Hybrid recommendations for User {user_id}:", "Hybrid"),
)
for heading, source in report_sections:
    print(heading)
    print("="*40)
    print(recommendations[source].to_string(index=False))


Top CF recommendations for User 2203:
   id                                                                                 title
23953                                           අදානි සුලං බලාගාර ව්‍යාපෘතියෙන් ඉවත් වෙයි.?
23942                               එක්සත් අරාබි එමීර් රාජ්‍යය හා ශ්‍රී ලංකාව අතර ගිවිසුමක්
23941 ජනපති ලෝක නායකයන් ඇමතූ දේශණයේ ‘ඉන්දීය මහද්වීපය හා ආලෝක වර්ෂ ගණනක වේගය’ නිවැරදි කරයි.
23937     යුරෝපයට පලා යද්දී මෙරට Ex. මන්ත‍්‍රීවරයෙකු තමින්නාඩු පොලිස් අත්අඩංගුවට. රිමාන්ඩ්.
23930                                                  විදුලි කප්පාදුව අදත් - වේලාවන් මෙන්න

Top CBF recommendations for User 2203:
   id                                                                                title
23914                                                නොරොච්චෝලේ අද පනගන්වන්න දැඩි උත්සහයක්
23906                                             ඉදිරි විදුලි කප්පාදුව ගැන අවසන් තීරණය අද
23859                                                   අදත් පැය එකහමාරක විදුලි කප්පාදුවක්
23796

In [198]:
# List the distinct user ids present in the feedback table, e.g. to pick a
# user_id that recommend_all can serve from real interactions.
print(rec_feedback_df['user_id'].unique())

[1182 1058  698 1203 1045 2303 1760  700  695 1221  795 1376  268  482
  417  787 1980 1427 1783 1147  797 1362  704 1245  891 1312 2280  678
 1490  257  258 1546  777 2329 1144  726 1244 2292 1958 2095  469 1483
 1993 2126 1770 2301  650  691  848  645 1984  286 1021  886  980 1093
 1814 1375  662 1417  646  714 1798 2163 1356  270  333 1085  876 1477
 1755  273 2258 1042  262 2115 1184 1108 1602  930 1200 1540  272 1717
 1526  707 1495 2203 1482  737 1979 1192  287 1397  425  349  676 2252
  261 2139 1711 1126 2281 1749  979 1349  679]
