In [None]:
# Install scikit-surprise, which provides the `surprise` package imported in
# the next cell (Dataset, Reader, SVD, model_selection, accuracy).
# NOTE(review): no version is pinned here; consider pinning one so that
# re-runs of this notebook are reproducible.
!pip install scikit-surprise



In [None]:
import math
import random
from datetime import datetime, timedelta

import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV

In [None]:
# Read every CSV export into its own DataFrame. The paths are listed in the
# same order as the names on the left-hand side of the assignment.
news_df, rec_items_df, rec_feedback_df, users_df, rec_users_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/news.csv',
        '/content/rec_items.csv',
        '/content/rec_feedback.csv',
        '/content/users.csv',
        '/content/rec_users.csv',
    )
]

In [None]:
# Deserialize the saved artifacts with joblib, in this order:
#   tfidf      - the object stored in tfidf_vectorizer.pkl (CBF side)
#   cosine_sim - the object stored in cosine_similarity_matrix.pkl (CBF side)
#   cf_model   - the collaborative filtering model stored in
#                svd_recommender_model.pkl
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load artifact files that come from a trusted source.
tfidf, cosine_sim, cf_model = [
    joblib.load(artifact_path)
    for artifact_path in (
        '/content/tfidf_vectorizer.pkl',
        '/content/cosine_similarity_matrix.pkl',
        '/content/svd_recommender_model.pkl',
    )
]

In [None]:
# Function to Get Trending News
def get_recent_trending_news(rec_feedback_df, top_n=5, days=7):
    """Return the ids of the most-interacted items of the last ``days`` days.

    Falls back to the overall most-interacted items (and prints why) when the
    frame has no ``timestamp`` column, when no timestamp can be parsed, or
    when nothing happened inside the time window.

    Args:
        rec_feedback_df: Feedback DataFrame with an ``item_id`` column and,
            optionally, a ``timestamp`` column. It is not modified.
        top_n: Maximum number of item ids to return.
        days: Size of the "recent" window, counted back from now.

    Returns:
        A list of at most ``top_n`` item ids, most interacted first.
    """
    def _most_interacted(frame):
        # Item ids ordered by how many rows mention them, most frequent first.
        return frame["item_id"].value_counts().head(top_n).index.tolist()

    if "timestamp" not in rec_feedback_df.columns:
        print("No timestamp column found. Using most interacted articles.")
        return _most_interacted(rec_feedback_df)

    # Parse into a local Series instead of writing back into the caller's
    # DataFrame; unparseable values become NaT.
    timestamps = pd.to_datetime(rec_feedback_df["timestamp"], errors='coerce')

    if timestamps.isnull().all():
        print("All timestamps are invalid. Using most interacted articles.")
        return _most_interacted(rec_feedback_df)

    # Build "now" in the same timezone as the parsed column (None for naive
    # timestamps) so the comparison below never mixes naive and aware values.
    column_tz = getattr(timestamps.dtype, "tz", None)
    recent_date = pd.Timestamp.now(tz=column_tz) - pd.Timedelta(days=days)
    recent_engagements = rec_feedback_df[timestamps >= recent_date]

    if recent_engagements.empty:
        print("No recent engagements found. Using most interacted articles.")
        return _most_interacted(rec_feedback_df)

    return _most_interacted(recent_engagements)

In [None]:
# Function to Calculate User Alpha (Hybrid Weighting)
def calculate_user_alpha(user_id, rec_feedback_df):
    """Compute the hybrid blending weight for ``user_id``.

    The weight is the user's number of feedback rows divided by 100, clamped
    to the range [0, 1]. A user with no feedback rows gets 0.5.
    """
    rows_for_user = rec_feedback_df[rec_feedback_df["user_id"] == user_id]
    interaction_count = len(rows_for_user)

    # Users without any history get a neutral weight.
    if not interaction_count:
        return 0.5

    activity_share = interaction_count / 100
    return min(1, max(0, activity_share))

In [None]:
# Maximal Marginal Relevance (MMR) for Diversity
def max_marginal_relevance(recommendations, hybrid_scores, cosine_sim, top_n=5,
                           diversity_weight=0.7, id_to_index=None):
    """Greedily pick ``top_n`` items, trading relevance against redundancy.

    At every step the candidate with the highest adjusted score is selected,
    where::

        adjusted = hybrid_scores[item]
                   - diversity_weight * sum(similarity(item, s) for s in selected)

    The previous implementation never added anything to its "selected" list,
    so the penalty was always zero and the result was a plain sort by score.
    When no similarity can be looked up, this version still returns that same
    plain score ordering.

    Args:
        recommendations: Candidate item ids. Ids without an entry in
            ``hybrid_scores`` are skipped; duplicates are considered once.
        hybrid_scores: Mapping of item id to relevance score. Not modified.
        cosine_sim: Either a mapping of mappings keyed by item id
            (``cosine_sim[a][b]``), or a 2-D matrix indexed by position, in
            which case ``id_to_index`` is required to translate ids.
        top_n: Maximum number of items to return.
        diversity_weight: Multiplier applied to the summed similarity.
        id_to_index: Optional mapping of item id to row/column position in a
            matrix-style ``cosine_sim``.

    Returns:
        A list of at most ``top_n`` item ids in selection order.
    """
    def _similarity(first, second):
        if hasattr(cosine_sim, "get"):
            return cosine_sim.get(first, {}).get(second, 0)
        if id_to_index is not None and first in id_to_index and second in id_to_index:
            return cosine_sim[id_to_index[first]][id_to_index[second]]
        # No way to look the pair up: treat the items as unrelated.
        return 0

    # De-duplicate while keeping the incoming order, so ties are resolved in
    # favour of the earlier candidate.
    remaining = [item for item in dict.fromkeys(recommendations) if item in hybrid_scores]
    selected_items = []

    while remaining and len(selected_items) < top_n:
        best_item = max(
            remaining,
            key=lambda item: hybrid_scores[item]
            - diversity_weight * sum(_similarity(item, chosen) for chosen in selected_items),
        )
        selected_items.append(best_item)
        remaining.remove(best_item)

    return selected_items

In [None]:
# Normalize Scores to [0, 1] Range
def normalize_scores(scores):
    """Min-max scale the values of ``scores`` into the range [0, 1].

    Implemented with plain arithmetic so the function does not depend on
    ``np`` (which the original body used without it being imported) or on a
    scikit-learn scaler for a one-dimensional list.

    Args:
        scores: Mapping of item id to a numeric score.

    Returns:
        A new dict with the same keys, in the same order, and float values
        ``(value - min) / (max - min)``. When all values are equal every
        result is 0.0, matching what ``MinMaxScaler`` produces for a
        zero-range column. NaN values are ignored when computing min/max and
        stay NaN in the result. An empty input yields ``{}``.
    """
    if not scores:
        return {}

    values = [float(value) for value in scores.values()]
    finite_values = [value for value in values if not math.isnan(value)]
    if not finite_values:
        return dict(zip(scores.keys(), values))

    lowest = min(finite_values)
    value_range = max(finite_values) - lowest
    if value_range == 0:
        # Avoid dividing by zero; every score then normalizes to 0.0.
        value_range = 1.0

    return {
        key: (value - lowest) / value_range
        for key, value in zip(scores.keys(), values)
    }

In [None]:
# Content-Based Recommendation (CBF)
def recommend_content_based(news_id, top_n=5, news=None, similarity=None):
    """Return the ``top_n`` articles most similar to ``news_id``.

    Args:
        news_id: Id of the reference article (a value of the ``id`` column).
        top_n: Maximum number of similar articles to return.
        news: DataFrame with ``id`` and ``title`` columns. Defaults to the
            notebook-level ``news_df``.
        similarity: Square similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row of ``news``. Defaults to the
            notebook-level ``cosine_sim``.

    Returns:
        A DataFrame with columns ``id`` and ``title``, ordered from most to
        least similar. The reference article itself is never included. When
        ``news_id`` is unknown, a message is printed and an empty DataFrame
        with the same two columns is returned, so callers can always index
        the result with ``['id']``.
    """
    news = news_df if news is None else news
    similarity = cosine_sim if similarity is None else similarity

    # Map ids to row *positions*; this does not rely on the frame having a
    # default 0..n-1 index.
    article_ids = news['id'].tolist()
    id_to_index = {article_id: position for position, article_id in enumerate(article_ids)}

    if news_id not in id_to_index:
        print(f"News ID {news_id} not found in dataset.")
        return news.iloc[0:0][['id', 'title']]

    index = id_to_index[news_id]

    # Exclude the reference article explicitly instead of assuming it sorts
    # first, and ignore matrix columns that have no matching news row.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(similarity[index])
            if position != index and position < len(article_ids)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_positions = [position for position, _ in ranked[:top_n]]

    # Positional selection keeps the similarity ranking in the output.
    return news.iloc[top_positions][['id', 'title']]

In [None]:
def recommend_collaborative(user_id, model, rec_feedback_df, top_n=5):
    """Rank every item seen in the feedback data by the model's estimate.

    Args:
        user_id: User to score items for.
        model: Object whose ``predict(user_id, item_id)`` returns something
            with an ``est`` attribute.
        rec_feedback_df: DataFrame with ``user_id`` and ``item_id`` columns.
        top_n: Maximum number of item ids to return.

    Returns:
        Up to ``top_n`` item ids, highest estimate first, or ``[]`` when
        ``user_id`` does not occur in ``rec_feedback_df``.
    """
    known_users = rec_feedback_df['user_id'].values
    if user_id not in known_users:
        return []

    # Score each distinct item once.
    estimates = {}
    for candidate in rec_feedback_df['item_id'].unique():
        estimates[candidate] = model.predict(user_id, candidate).est

    ranked = sorted(estimates.items(), key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked[:top_n]]

In [None]:
def _news_rows_in_order(news_df, item_ids):
    """Return the ``id``/``title`` rows for ``item_ids``, in the order given."""
    item_ids = list(item_ids)
    rows = news_df[news_df["id"].isin(item_ids)][["id", "title"]]
    if rows.empty:
        return rows
    rank = {item: position for position, item in enumerate(item_ids)}
    return rows.sort_values("id", key=lambda ids: ids.map(rank), kind="stable")


def recommend_all(user_id, rec_feedback_df, news_df, model=cf_model, top_n=5):
    """Build CF, CBF and hybrid recommendations for one user.

    Args:
        user_id: User to recommend for.
        rec_feedback_df: Feedback DataFrame with ``user_id`` and ``item_id``
            columns (and optionally ``timestamp``).
        news_df: News DataFrame with ``id`` and ``title`` columns.
        model: Collaborative filtering model exposing
            ``predict(user_id, item_id).est``.
        top_n: Number of recommendations per strategy.

    Returns:
        A dict with keys ``"CF"``, ``"CBF"`` and ``"Hybrid"``, each holding a
        DataFrame with columns ``id`` and ``title``. The CF and Hybrid frames
        are ordered by rank. For a user without any feedback all three
        entries hold the trending articles.

    Raises:
        KeyError: If ``news_df`` lacks ``id`` or ``rec_feedback_df`` lacks
            ``item_id``.
    """
    # Check if required columns are present
    if 'id' not in news_df.columns:
        raise KeyError("'id' column not found in news_df")
    if 'item_id' not in rec_feedback_df.columns:
        raise KeyError("'item_id' column not found in rec_feedback_df")

    user_interactions = rec_feedback_df[rec_feedback_df["user_id"] == user_id]

    if user_interactions.empty:
        # Cold start: nothing is known about the user, fall back to trending.
        print(f"No data for user {user_id}. Showing recent trending news...")
        trending_ids = get_recent_trending_news(rec_feedback_df, top_n=top_n)
        if not trending_ids:
            # Last resort: a random pick, capped at the pool size so that
            # random.sample cannot raise ValueError.
            item_pool = list(rec_feedback_df["item_id"].unique())
            trending_ids = random.sample(item_pool, max(0, min(top_n, len(item_pool))))
        trending_news = _news_rows_in_order(news_df, trending_ids)
        return {
            "CF": trending_news.copy(),
            "CBF": trending_news.copy(),
            "Hybrid": trending_news.copy(),
        }

    # Blending weight derived from the user's number of interactions;
    # alpha multiplies the CBF score and (1 - alpha) the CF score.
    alpha = calculate_user_alpha(user_id, rec_feedback_df)

    # NOTE(review): "last" means the last row for this user in
    # rec_feedback_df; this is only the most recent interaction if the frame
    # is in chronological order - confirm.
    last_interacted_news_id = user_interactions.iloc[-1]["item_id"]

    # Content-based filtering recommendations
    cbf_recommendations = recommend_content_based(last_interacted_news_id, top_n=top_n)
    if not isinstance(cbf_recommendations, pd.DataFrame):
        # An unknown reference article may be reported as a plain list;
        # normalize that to an empty frame so ['id'] below always works.
        cbf_recommendations = pd.DataFrame(columns=["id", "title"])
    cbf_ids = cbf_recommendations["id"].tolist()

    # Collaborative filtering recommendations
    cf_recommendations = list(
        recommend_collaborative(user_id, model, rec_feedback_df, top_n=top_n)
    )

    # Map news id to row position in the cosine similarity matrix.
    id_to_index = {
        news_id: position for position, news_id in enumerate(news_df["id"].tolist())
    }

    last_interacted_index = id_to_index.get(last_interacted_news_id)
    if last_interacted_index is None:
        print(f"News ID {last_interacted_news_id} not found in index mapping.")

    # CBF score = similarity between the last interacted article and the item.
    cbf_scores = {}
    if last_interacted_index is not None:
        cbf_scores = {
            item: cosine_sim[last_interacted_index][id_to_index[item]]
            for item in cbf_ids if item in id_to_index
        }

    # CF score = the model's estimate for the item.
    cf_scores = {item: model.predict(user_id, item).est for item in cf_recommendations}

    # Bring both score sets onto the same [0, 1] scale before mixing them.
    cbf_scores = normalize_scores(cbf_scores)
    cf_scores = normalize_scores(cf_scores)

    # Ordered, de-duplicated union (CBF first) keeps the result deterministic.
    candidates = list(dict.fromkeys(cbf_ids + cf_recommendations))
    hybrid_scores = {
        item: alpha * cbf_scores.get(item, 0) + (1 - alpha) * cf_scores.get(item, 0)
        for item in candidates
    }

    # max_marginal_relevance looks similarities up by item id with .get(),
    # so hand it an id-keyed mapping restricted to the candidates rather
    # than the raw position-indexed matrix.
    pairwise_similarity = {
        first: {
            second: cosine_sim[id_to_index[first]][id_to_index[second]]
            for second in candidates
            if second != first and second in id_to_index
        }
        for first in candidates if first in id_to_index
    }
    top_hybrid_recommendations = max_marginal_relevance(
        candidates, hybrid_scores, pairwise_similarity, top_n=top_n
    )

    return {
        "CF": _news_rows_in_order(news_df, cf_recommendations),
        "CBF": cbf_recommendations,
        "Hybrid": _news_rows_in_order(news_df, top_hybrid_recommendations),
    }

In [None]:
# Run all three recommenders for one example user and print each result
# table. The printed text is the same as with three separate print sections;
# the loop only removes the copy-pasted blocks.
user_id = 2329
recommendations = recommend_all(user_id, rec_feedback_df, news_df)

for strategy in ("CF", "CBF", "Hybrid"):
    print(f"\nTop {strategy} recommendations for User {user_id}:")
    print("="*40)
    print(recommendations[strategy].to_string(index=False))


Top CF recommendations for User 2329:
   id                                                                                 title
23953                                           අදානි සුලං බලාගාර ව්‍යාපෘතියෙන් ඉවත් වෙයි.?
23942                               එක්සත් අරාබි එමීර් රාජ්‍යය හා ශ්‍රී ලංකාව අතර ගිවිසුමක්
23941 ජනපති ලෝක නායකයන් ඇමතූ දේශණයේ ‘ඉන්දීය මහද්වීපය හා ආලෝක වර්ෂ ගණනක වේගය’ නිවැරදි කරයි.
23937     යුරෝපයට පලා යද්දී මෙරට Ex. මන්ත‍්‍රීවරයෙකු තමින්නාඩු පොලිස් අත්අඩංගුවට. රිමාන්ඩ්.
23930                                                  විදුලි කප්පාදුව අදත් - වේලාවන් මෙන්න

Top CBF recommendations for User 2329:
   id                                                      title
23551                 පාස්කු බෝම්බ චෝදනාවට ගෝටාගෙන් ප‍්‍රකාශයක්.
23299    ජනපති යාපනයේ – උතුරේ රැකියා විරහිත උපාධිධාරීන් පාරට බහී
23285         ජනපති යාපනය කච්චේරිය ඇතුලේ සිටියදී එලියේ විරෝධතා.
23063                     බඹර ප්‍රහාරයකින් පාසල් සිසුවෙක් මියයයි
23003 නිදහස් උත්සවයේ පෙරහුරු අද සිට - පාසල් 

In [None]:
# List every distinct user id that appears in the feedback data
# (useful for picking a valid user_id for the demo cell).
print(
    "\nUnique user IDs in the feedback data:",
    rec_feedback_df['user_id'].unique(),
    sep="\n",
)


Unique user IDs in the feedback data:
[1182 1058  698 1203 1045 2303 1760  700  695 1221  795 1376  268  482
  417  787 1980 1427 1783 1147  797 1362  704 1245  891 1312 2280  678
 1490  257  258 1546  777 2329 1144  726 1244 2292 1958 2095  469 1483
 1993 2126 1770 2301  650  691  848  645 1984  286 1021  886  980 1093
 1814 1375  662 1417  646  714 1798 2163 1356  270  333 1085  876 1477
 1755  273 2258 1042  262 2115 1184 1108 1602  930 1200 1540  272 1717
 1526  707 1495 2203 1482  737 1979 1192  287 1397  425  349  676 2252
  261 2139 1711 1126 2281 1749  979 1349  679]


In [None]:
from sklearn.metrics import precision_score, recall_score

def evaluate_model(recommended_items, relevant_items):
    """Compute set-based precision and recall of a recommendation list.

    Duplicates are ignored because both inputs are converted to sets.

    Args:
        recommended_items: Iterable of recommended item ids.
        relevant_items: Iterable of item ids considered relevant.

    Returns:
        A ``(precision, recall)`` tuple of floats, where
        ``precision = |recommended & relevant| / |recommended|`` and
        ``recall = |recommended & relevant| / |relevant|``. A metric whose
        denominator would be zero (empty input) is reported as 0.0 instead
        of raising ZeroDivisionError.
    """
    recommended_set = set(recommended_items)
    relevant_set = set(relevant_items)
    hits = len(recommended_set & relevant_set)

    # Precision: how many recommended items were actually relevant
    precision = hits / len(recommended_set) if recommended_set else 0.0

    # Recall: how many relevant items were recommended
    recall = hits / len(relevant_set) if relevant_set else 0.0

    return precision, recall

# Hand-written sample data: one recommendation list per strategy, plus the
# items treated as relevant for this illustration (e.g. items the user
# actually interacted with or rated positively).
cf_recommendations = [23565, 16432, 20016, 23603, 23609]
cbf_recommendations = [23269, 23484, 23848, 23521, 23826]
hybrid_recommendations = [23937, 23551, 23299, 23285, 23003]
relevant_items = [23565, 23484, 23269, 23299, 23003]

# Score each strategy against the same relevant set.
(
    (cf_precision, cf_recall),
    (cbf_precision, cbf_recall),
    (hybrid_precision, hybrid_recall),
) = [
    evaluate_model(recommended, relevant_items)
    for recommended in (cf_recommendations, cbf_recommendations, hybrid_recommendations)
]

print(
    f"CF Precision: {cf_precision}, CF Recall: {cf_recall}",
    f"CBF Precision: {cbf_precision}, CBF Recall: {cbf_recall}",
    f"Hybrid Precision: {hybrid_precision}, Hybrid Recall: {hybrid_recall}",
    sep="\n",
)

CF Precision: 0.2, CF Recall: 0.2
CBF Precision: 0.4, CBF Recall: 0.4
Hybrid Precision: 0.4, Hybrid Recall: 0.4
