## CBF

In [357]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
from sklearn.impute import SimpleImputer

In [358]:
news_df = pd.read_csv('/content/news.csv')
rec_items_df = pd.read_csv('/content/rec_items.csv')
rec_feedback_df = pd.read_csv('/content/rec_feedback.csv')
users_df = pd.read_csv('/content/users.csv')
users_df = pd.read_csv('/content/users.csv')

In [359]:
news_df.head()

Unnamed: 0,id,title,description,published_date,breaking_news,blob_image,source_url,created_at,updated_at,published_at,created_by_id,updated_by_id,shares,comment_count,type
0,23996,අද ඩොලරයේ අගය,ශ්‍රී ලංකා මහ බැංකුව විසින් අද (13) දින නිකුත්...,2025-02-13 12:10:32.919,False,,https://www.hirunews.lk/396721/%E0%B6%85%E0%B6...,2025-02-13 09:43:07.843,2025-02-13 09:43:12.403,2025-02-13 09:43:12.131,7.0,7.0,0,0,News
1,23995,‘ක්ලීන් ශ‍්‍රී ලංකා ලිඛිතව තියෙනවා.’ මාලිමා මන...,‘ක්‍ලීන් ශ්‍රී ලංකා’ වැඩපිළිවෙල යනු කුමක්දැයි ...,2025-02-13 09:30:00,False,,https://lankacnews.com/%e0%b6%9a%e0%b7%8a%e0%b...,2025-02-13 09:42:37.114,2025-02-13 09:42:47.003,,,,0,0,News
2,23994,පාපන්දු ගෝල කණුවක් කඩා වැටී පාසැල් සිසුවෙකු ජී...,පාසැල් ක්‍රීඩාගංනයක තිබූ පාපන්දු ගෝල කණුවක් කඩ...,2025-02-13 11:10:47.133,False,,https://www.hirunews.lk/396719/%E0%B6%B4%E0%B7...,2025-02-13 09:33:20.185,2025-02-13 09:33:23.796,2025-02-13 09:33:23.747,7.0,7.0,0,0,News
3,23993,සුජීව සේනසිංහගේ මූලික අයිතිවාසිකම් පෙත්සම විභා...,තමන්ට එරෙහිව අපරාධ පරීක්ෂණ දෙපාර්තමේන්තුව විසි...,2025-02-13 10:10:29.598,False,,https://www.hirunews.lk/396716/%E0%B7%83%E0%B7...,2025-02-13 09:31:55.442,2025-02-13 09:31:59.501,2025-02-13 09:31:59.41,7.0,7.0,0,0,News
4,23992,නීති විරෝධී ධීවර දැල් දෙසීය පනහක් නීතියේ රැහැනට,"ශ්‍රී ලංකා නාවික හමුදාව, කිලිනොච්චිය මුද්දලම්ප...",2025-02-13 10:58:56.639,False,,https://www.dinamina.lk/2025/02/13/lawnorder/1...,2025-02-13 09:29:23.368,2025-02-13 09:29:26.299,2025-02-13 09:29:26.258,7.0,7.0,0,0,News


In [360]:
news_df.dtypes

Unnamed: 0,0
id,int64
title,object
description,object
published_date,object
breaking_news,bool
blob_image,float64
source_url,object
created_at,object
updated_at,object
published_at,object


In [361]:
print(news_df.isnull().sum())

id                   0
title                0
description          0
published_date       0
breaking_news        0
blob_image        1000
source_url           0
created_at           0
updated_at           0
published_at        35
created_by_id      382
updated_by_id       35
shares               0
comment_count        0
type                 0
dtype: int64


In [362]:
news_df.drop(columns=["blob_image", "shares", "comment_count", "type"], inplace=True)

In [363]:
news_df['published_at'] = news_df['published_at'].fillna(news_df['published_at'].mode()[0])

news_df['updated_by_id'] = news_df['updated_by_id'].fillna(news_df['updated_by_id'].mode()[0])

news_df['created_by_id'] = news_df['created_by_id'].fillna(news_df['created_by_id'].mode()[0])

In [364]:
print(news_df.isnull().sum())

id                0
title             0
description       0
published_date    0
breaking_news     0
source_url        0
created_at        0
updated_at        0
published_at      0
created_by_id     0
updated_by_id     0
dtype: int64


In [365]:
news_df['content'] = news_df['title'] + " " + news_df['description']

In [366]:
sinhala_stop_words = [
    "අය", "අතර", "ඉස්සර", "ඉන්", "එක", "එය", "ඔබ", "ඔයා", "ඔහු", "ඔවුන්",
    "ඕනෑ", "ආයුබෝවන්", "ආදිය", "ආගම", "ඇයි", "ඇතුලත", "ඇත", "ඉන්පසු",
    "ඉස්සර", "එක්", "එක", "එවක්", "ඔ", "ඔක්කෝ", "ඔය", "ඔබ", "ඔබට",
    "ඔයාලා", "ඔයාව", "ඔයා", "උදාහරණ", "උපුටා", "ඉදිරියට", "උදාව",
    "එදා", "එවා", "ඒ", "ඒවා", "ඕන", "ඔන්", "ඔක්කෝම", "ඔවුන්", "ඉදිරියට",
    "අප", "අපේ", "අයියා", "ඇතුළත්", "ඇත", "උදවිය", "එත්", "ඉතා",
    "ඉස්සර", "ඊයේ", "ඔබට", "ඔබගේ", "ඒ", "ඒවා", "එහි", "එන්න", "ඉස්සර",
    "එවක්", "ඕක", "ආයුබෝවන්", "ඇත", "උසස්", "ඉහළ", "ඔබේ", "අය", "ඔව්",
    "එක්", "අතර", "අපේ", "උත්තම", "අද", "ඉන්", "ඉතා", "ඉතාම", "ඉහත",
    "ඉස්සර", "ඉස්සරට", "අයියා", "අවශ්‍ය", "ඔයාලට"
]

tfidf_vectorizer = TfidfVectorizer(stop_words=sinhala_stop_words)

tfidf_matrix = tfidf_vectorizer.fit_transform(news_df['content'])

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim_df = pd.DataFrame(cosine_sim, index=news_df['title'], columns=news_df['title'])

print(cosine_sim_df)



title                                               අද ඩොලරයේ අගය  \
title                                                               
අද ඩොලරයේ අගය                                            1.000000   
‘ක්ලීන් ශ‍්‍රී ලංකා ලිඛිතව තියෙනවා.’ මාලිමා මන්...       0.016283   
පාපන්දු ගෝල කණුවක් කඩා වැටී පාසැල් සිසුවෙකු ජීව...       0.033219   
සුජීව සේනසිංහගේ මූලික අයිතිවාසිකම් පෙත්සම විභාග...       0.037680   
නීති විරෝධී ධීවර දැල් දෙසීය පනහක් නීතියේ රැහැනට          0.017437   
...                                                           ...   
එමිල් රංජන්ට දුන් සම්මානය ගැන ඇමති නලින්දගෙන් ප...       0.040910   
(Video) ජීවිතය හරියට plan කරපු මිනිස්සුන්ගේ පොත...       0.015411   
(Video) 5000 නෝට්ටු අවලංගු කළොත්? ආණ්ඩුව හිරවිය...       0.000000   
කැළණියේ නයා ගැන මට කීවේ මාධ්‍යවේදියෙක් - දිලිත්        0.003954   
අංජනමක් බලා නිධන් හොයන්න නිවස මැද අඩි 18 ක් හාරලා        0.023381   

title                                               ‘ක්ලීන් ශ‍්‍රී ලංකා ලිඛිතව තියෙනවා.’ මාලිමා මන්ත‍්

In [367]:
with open('cbf_model_cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)


In [368]:
with open('cbf_model_tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

In [369]:
def recommend_content_based(news_id, cosine_sim=cosine_sim, top_n=5):
    idx = news_df[news_df['id'] == news_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:top_n+1]

    recommended_indices = [x[0] for x in sim_scores]

    return news_df.iloc[recommended_indices]['id'].tolist()

recommend_content_based(23996)


[23097, 23794, 23158, 23819, 23710]

In [370]:
recommended_articles = recommend_content_based(23465, top_n=5)
print(recommended_articles)


[23429, 23027, 23423, 23688, 23554]


## CF

In [371]:
from sklearn.neighbors import NearestNeighbors

In [372]:
rec_feedback_df = pd.read_csv('rec_feedback.csv')

users_df = pd.read_csv('users.csv')

In [373]:
rec_feedback_df.head()

Unnamed: 0,id,feedback_type,time_stamp,comment,user_id,item_id,created_at,updated_at,created_by_id,updated_by_id
0,49554,click,2025-02-13 10:03:47.440412,,1182,23942,,,,
1,49553,read,2025-02-13 10:03:42.411734,,1182,23942,,,,
2,49552,click,2025-02-13 10:01:54.947236,,1058,23942,,,,
3,49550,click,2025-02-13 10:01:31.955569,,1058,23930,,,,
4,49549,click,2025-02-13 10:01:10.809916,,1058,23941,,,,


In [374]:
rec_feedback_df.dtypes

Unnamed: 0,0
id,int64
feedback_type,object
time_stamp,object
comment,float64
user_id,int64
item_id,int64
created_at,float64
updated_at,float64
created_by_id,float64
updated_by_id,float64


In [375]:
print(rec_feedback_df.isnull().sum())

id                  0
feedback_type       0
time_stamp          0
comment          1000
user_id             0
item_id             0
created_at       1000
updated_at       1000
created_by_id    1000
updated_by_id    1000
dtype: int64


In [376]:
rec_feedback_df = rec_feedback_df.drop(columns=['comment', 'created_at', 'updated_at', 'created_by_id', 'updated_by_id'])

rec_feedback_df.dropna(subset=['feedback_type', 'user_id', 'item_id'], inplace=True)

print(rec_feedback_df.isnull().sum())

id               0
feedback_type    0
time_stamp       0
user_id          0
item_id          0
dtype: int64


In [377]:
user_item_matrix = rec_feedback_df.pivot_table(index='user_id', columns='item_id', values='feedback_type', aggfunc='count', fill_value=0)

In [378]:
print(user_item_matrix.head())

item_id  15257  15921  16418  16432  16519  16609  16620  16657  16686  16697  \
user_id                                                                         
257          0      0      0      0      0      0      0      0      0      0   
258          0      0      0      0      0      0      0      0      0      0   
261          0      0      0      0      0      0      0      0      0      0   
262          0      0      0      0      0      0      0      0      0      0   
268          0      0      0      0      0      0      0      0      0      0   

item_id  ...  23940  23941  23942  23946  23949  23952  23953  23956  23959  \
user_id  ...                                                                  
257      ...      0      0      0      0      0      0      0      0      0   
258      ...      0      0      0      0      0      0      0      0      0   
261      ...      0      0      0      0      0      0      0      0      0   
262      ...      0      0      0    

In [379]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=6, n_jobs=-1)
knn.fit(user_item_matrix.values)

In [380]:
def recommend_collaborative(user_id, user_item_matrix=user_item_matrix, knn=knn, top_n=5):

    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not found. Returning top items instead.")
        return user_item_matrix.sum(axis=0).sort_values(ascending=False).head(top_n).index.tolist()

    user_index = user_item_matrix.index.get_loc(user_id)

    distances, indices = knn.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=top_n+1)

    similar_users = user_item_matrix.index[indices.flatten()[1:]].tolist()

    recommended_items = rec_feedback_df[rec_feedback_df['user_id'].isin(similar_users)]['item_id'].value_counts().index.tolist()

    return recommended_items[:top_n]

In [381]:
with open('collaborative_model.pkl', 'wb') as f:
    pickle.dump(knn, f)

In [382]:
user_feedback = rec_feedback_df[rec_feedback_df['user_id'] == 2303]
print(user_feedback)


        id feedback_type                  time_stamp  user_id  item_id
15   49538         click  2025-02-13 09:09:24.359686     2303    23941
16   49537          read   2025-02-13 09:09:23.22432     2303    23941
41   49505         click  2025-02-13 08:02:16.231793     2303    23937
564  48867         click  2025-02-11 03:04:49.348695     2303    23820
565  48866          read  2025-02-11 03:04:49.330479     2303    23820
880  48473          read  2025-02-10 08:02:30.883303     2303    23749
881  48472         click  2025-02-10 08:02:30.516871     2303    23749


In [383]:
top_recommendations = recommend_collaborative(2303, user_item_matrix, knn, top_n=5)
print("Top 5 Recommendations:", top_recommendations)

Top 5 Recommendations: [23749, 23820, 23900, 23769, 23942]


## Hybrid Model

In [403]:
with open('cbf_model_cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

with open('cbf_model_tfidf.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

with open('collaborative_model.pkl', 'rb') as f:
    knn = pickle.load(f)

In [404]:
# Content-Based Filtering
def recommend_content_based(news_id, cosine_sim=cosine_sim, top_n=5):
    if news_id not in news_df['id'].values:
        print(f"News ID {news_id} not found.")
        return []

    idx = news_df[news_df['id'] == news_id].index[0]
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    recommended_indices = [x[0] for x in sim_scores[1:top_n + 1]]

    return news_df.iloc[recommended_indices]['id'].tolist()

In [405]:
# Collaborative Filtering
def recommend_collaborative(user_id, user_item_matrix=user_item_matrix, knn=knn, top_n=5):
    if user_id not in user_item_matrix.index:
        print(f"User {user_id} not found. Returning top items.")
        return user_item_matrix.sum(axis=0).sort_values(ascending=False).head(top_n).index.tolist()

    user_index = user_item_matrix.index.get_loc(user_id)
    distances, indices = knn.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=top_n + 1)
    similar_users = user_item_matrix.index[indices.flatten()[1:]].tolist()
    recommended_items = rec_feedback_df[rec_feedback_df['user_id'].isin(similar_users)]['item_id'].value_counts().index.tolist()

    return recommended_items[:top_n]

In [406]:
# Hybrid Filtering
def recommend_hybrid(user_id, news_id, alpha=0.5, top_n=5):
    cbf_recommendations = recommend_content_based(news_id, top_n=top_n)
    cf_recommendations = recommend_collaborative(user_id, top_n=top_n)

    hybrid_scores = {}
    for item in set(cbf_recommendations + cf_recommendations):
        cbf_score = 1 if item in cbf_recommendations else 0
        cf_score = 1 if item in cf_recommendations else 0
        hybrid_scores[item] = alpha * cbf_score + (1 - alpha) * cf_score

    return sorted(hybrid_scores.keys(), key=lambda x: hybrid_scores[x], reverse=True)[:top_n]

In [407]:
user_id = 2303
news_id = 23996
top_n = 5

recommendations = recommend_hybrid(user_id, news_id, alpha=0.5, top_n=top_n)
print("Top Hybrid Recommendations:", recommendations)

Top Hybrid Recommendations: [23769, 23749, 23942, 23819, 23820]


In [408]:
with open('hybrid_recommendation_model.pkl', 'wb') as f:
    pickle.dump(recommend_hybrid, f)