In [None]:

import nltk
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import os
!pip install indic-nlp-library
from indicnlp.tokenize import sentence_tokenize, indic_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:

# Location of the training split, relative to the notebook's working directory
train_path = "../input/telugu-nlp/telugu_news/train_telugu_news.csv"
# Read the whole CSV into a DataFrame
telugu_news_df = pd.read_csv(train_path)
# Preview the first rows
telugu_news_df.head()

# Text pre-processing

In [None]:
# Display the rows whose 'heading' is null.
# This is an inspection only: nothing is removed from telugu_news_df here.
missing_heading_mask = telugu_news_df["heading"].isna()
telugu_news_df[missing_heading_mask]

In [None]:
# Map every distinct topic label to an integer id, numbered in order of first appearance
topic_dic = {
    topic_label: topic_id
    for topic_id, topic_label in enumerate(telugu_news_df["topic"].unique())
}

topic_dic

In [None]:
# Reverse lookup: integer topic id -> original topic label
inv_topic_dict = {topic_id: topic_label for topic_label, topic_id in topic_dic.items()}

In [None]:
def func_topic(s):
    """Return the integer id stored for topic label ``s`` in ``topic_dic``.

    Raises ``KeyError`` when ``s`` is not a key of ``topic_dic``.
    """
    return topic_dic[s]

# Replace the topic labels by their integer ids.
# NOTE(review): running this cell a second time looks up the ids themselves in
# topic_dic, which presumably raises KeyError -- re-run from the data-loading cell instead.
telugu_news_df["topic"] = telugu_news_df["topic"].apply(func_topic)
date_df = telugu_news_df["date"]

# Delete zero-width non-joiners, newlines, tabs and non-breaking spaces from the body.
# regex=False makes each pattern a literal match regardless of the pandas version's
# default for `regex`, and avoids the single-character-pattern FutureWarning.
body_processed = telugu_news_df["body"]
for unwanted_char in ('\u200c', '\n', '\t', '\xa0'):
    body_processed = body_processed.str.replace(unwanted_char, '', regex=False)
telugu_news_df["body_processed"] = body_processed

In [None]:
def get_count(df):
    """Count the occurrences of every distinct value in ``df``.

    ``df`` is indexed with a boolean mask and must offer ``unique()``
    (it is called with a DataFrame column below).

    Returns a two-element list ``[values, counts]`` where ``values`` are the
    distinct values in the order ``df.unique()`` reports them and ``counts[i]``
    is the number of entries equal to ``values[i]``.
    """
    distinct_values = list(df.unique())
    occurrences = [df[df == value].shape[0] for value in distinct_values]
    return [list(distinct_values), occurrences]

# cont[0] holds the distinct topic ids, cont[1] the matching article counts
cont = get_count(telugu_news_df["topic"])
clables = cont[0]

# Translate the integer topic ids back to their original names for the pie labels
labels = [inv_topic_dict[c] for c in clables]

# The 'seaborn-colorblind' style sheet was renamed to 'seaborn-v0_8-colorblind' in
# newer matplotlib releases and the old name no longer resolves there, so pick
# whichever name this installation provides and fall back to the default style.
colorblind_style = next(
    (name for name in ("seaborn-v0_8-colorblind", "seaborn-colorblind") if name in plt.style.available),
    "default",
)

plt.figure(figsize=(10,8))
plt.title("Distribution of the telugu news toics", fontsize = 14.5)
plt.style.use(colorblind_style)
plt.pie(np.array(cont[1]), labels=labels, autopct='%1.2f%%', shadow=True)
plt.show()

In [None]:
# Characters treated as punctuation: the ASCII set from the standard library
PUNCT = string.punctuation

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``PUNCT`` deleted."""
    # A translation table that maps each punctuation character to None deletes it
    deletion_table = str.maketrans(dict.fromkeys(PUNCT))
    return text.translate(deletion_table)
# Strip ASCII punctuation from every processed training body
telugu_news_df["body_processed"] = telugu_news_df["body_processed"].apply(remove_punctuation)

In [None]:
# Location of the test split, relative to the notebook's working directory
test_path = "../input/telugu-nlp/telugu_news/test_telugu_news.csv"

# Read the whole CSV into a DataFrame
test_news_df = pd.read_csv(test_path)

# Preview the first rows
test_news_df.head()

In [None]:
# Encode the test labels with the label -> id mapping built from the training data
# (func_topic raises KeyError for a label that is missing from topic_dic)
y_test = test_news_df["topic"].apply(func_topic)

# Same clean-up as the training bodies: delete zero-width non-joiners, newlines,
# tabs and non-breaking spaces. regex=False makes each pattern a literal match
# regardless of the pandas version's default for `regex`.
test_body_processed = test_news_df["body"]
for unwanted_char in ('\u200c', '\n', '\t', '\xa0'):
    test_body_processed = test_body_processed.str.replace(unwanted_char, '', regex=False)

# Finally strip ASCII punctuation
test_news_df["body_processed"] = test_body_processed.apply(remove_punctuation)

# Using Count Vectorizer to get the data into sk-learn's format
Using Count Vectorizer to get the feature vectors and eliminate the stopwords (based on term frequency and inverse document frequency, and selecting the top k words in the vocabulary for model development purposes)

In [None]:
# Integer ids of the five topic classes
categories = list(range(5))

# Materialise the cleaned test bodies as a plain Python list
test_text = list(test_news_df["body_processed"])

x_test = test_text

# Number of test documents and of test labels
print(len(x_test) , len(y_test))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Integer ids of the five topic classes
categories = list(range(5))

# One training "document" per topic id: every processed body of that topic,
# each followed by a single space, concatenated in row order
text_topic = [
    "".join(
        body + " "
        for body in telugu_news_df.loc[telugu_news_df["topic"] == topic_id, "body_processed"]
    )
    for topic_id in range(5)
]

# Character length of each per-topic document
for topic_document in text_topic:
  print(len(topic_document))

In [None]:
from indicnlp.tokenize import indic_tokenize  


def get_all_vocab(tot_text):
  """Return a dict mapping each token of ``tot_text`` to its number of occurrences.

  Tokens are whatever ``indic_tokenize.trivial_tokenize`` produces for the text.
  """
  frequencies = {}
  for token in indic_tokenize.trivial_tokenize(tot_text):
    frequencies[token] = frequencies.get(token, 0) + 1
  return frequencies
  
# Concatenate the five per-topic documents into one corpus string
tot_text = "".join(text_topic[i] for i in range(5))

In [None]:
# Token -> frequency over the whole training corpus
tot_vocab = get_all_vocab(tot_text)
# Re-insert the entries from most to least frequent (dicts keep insertion order);
# equally frequent tokens keep their previous relative order
tot_vocab = dict(sorted(tot_vocab.items(), key=lambda pair: -pair[1]))

# Vocabulary size
print(len(tot_vocab))


In [None]:
# Training set: one concatenated document per topic ...
x_train = text_topic
# ... labelled with that topic's integer id (text_topic[i] holds the bodies of topic i)
y_train = categories

In [None]:
import regex 
from indicnlp.tokenize import indic_tokenize

# Custom analyzer for the count vectorizer (Telugu is an Indic language)
def custom_analyzer(text):
    """Yield, in order, every run of one or more word characters found in ``text``."""
    # \w{1,} matches tokens of length >= 1, so single-character tokens are kept too
    yield from regex.findall(r'\w{1,}', text)

In [None]:
# Build the bag-of-words features for the per-topic training documents.
#
# NOTE(review): `analyzer` is a callable here (custom_analyzer), which yields single
# word tokens only. The scikit-learn documentation states that `ngram_range` only
# applies when the analyzer is not callable, so the features are presumably
# uni-grams only and `ngram_range=(1,2)` has no effect -- confirm, and produce the
# bi-grams inside the analyzer if they are wanted.

# max_features=100000 caps the vocabulary size; max_df / min_df are given as floats
# (document-frequency proportions). NOTE(review): x_train holds only five documents,
# so these proportions translate to very coarse cut-offs -- confirm they are intended.

count_vec = CountVectorizer(max_df = 0.75,min_df=0.1, lowercase = False , analyzer = custom_analyzer, max_features=100000, ngram_range=(1,2))

# Learn the vocabulary and produce the document-term matrix in one pass
x_train_features = count_vec.fit_transform(x_train)

# (number of documents, number of retained terms)
x_train_features.shape

In [None]:
# Stop words identified from the corpus by using the term-frequencies and the inverse document frequencies
len(count_vec.stop_words_)

In [None]:
# Getting the testing data's features
x_test_features = count_vec.transform(x_test)

# Using the Multinomial Naive Bayes classifier from sk-learn to classify the given Telugu texts in the test dataset

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the per-topic count vectors.
# (A stray `MultinomialNB()` expression -- a pasted cell output -- used to follow
# the fit; it only built and discarded an unused estimator and has been dropped.)
clf = MultinomialNB()
clf.fit(x_train_features, y_train)

# Mean accuracy on the held-out test articles
print("Test score :- ", clf.score(x_test_features, y_test))

In [None]:
from sklearn.metrics import classification_report

y_pred_test = clf.predict(x_test_features)

# Pin the class order explicitly: names stay aligned with their ids even when a
# class is missing from both y_test and the predictions (otherwise the report
# infers the label set from the data and the name list can have the wrong length).
class_ids = sorted(inv_topic_dict)
target_names = [inv_topic_dict[class_id] for class_id in class_ids]

print(classification_report(y_test, y_pred_test, labels=class_ids, target_names=target_names))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,average_precision_score,f1_score
# Rows are true classes, columns predicted classes; pinning `labels` keeps the
# matrix square and in id order, matching the tick labels below
confusion_mat = confusion_matrix(y_test, y_pred_test, labels=sorted(inv_topic_dict))

fig, ax = plt.subplots(figsize=(10,4))

# fmt="d" prints the integer counts as-is; the default annotation format switches
# to scientific notation for larger counts
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Greens", xticklabels = target_names,
           yticklabels=target_names, ax=ax)

# Title and axis labels are set after the heatmap is drawn, because heatmap()
# assigns its own axis labels (empty for a plain array) and would overwrite them
ax.set_title("Confusion matrix for test data")
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
plt.show()

In [None]:
# Function for predicting a sample text with the trained model

def predict_text_sample(test_text, inv_topic_dict, clf, count_vec):
  """Predict the topic name of one raw text.

  The text first receives the same clean-up that was applied to the training
  and test bodies (zero-width non-joiners, newlines, tabs, non-breaking spaces
  and ASCII punctuation are deleted), so that its tokens match the vocabulary
  the vectorizer was fitted on.

  Parameters:
    test_text: the raw text to classify (a ``str``).
    inv_topic_dict: mapping from predicted class id to topic name.
    clf: fitted classifier exposing ``predict``.
    count_vec: fitted vectorizer exposing ``transform``.

  Returns:
    ``inv_topic_dict`` entry of the class predicted for the text.
  """
  cleaned_text = test_text
  for unwanted_char in ('\u200c', '\n', '\t', '\xa0'):
    cleaned_text = cleaned_text.replace(unwanted_char, '')
  cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))

  # The vectorizer expects an iterable of documents, hence the one-element list
  x_test_sample_features = count_vec.transform([cleaned_text])
  y_pred_test_sample = clf.predict(x_test_sample_features)

  return inv_topic_dict[y_pred_test_sample[0]]

In [None]:
# This text is about some movie news from a telugu blogsite
test_text ="ఇండియన్ స్క్రీన్ మీద పోటీ పడటం అయిపోయింది అందుకే ఇప్పుడు మన సినిమాలు ఫారిన్ రిలీజ్ లో పోటీ పడుతున్నాయి. ఇండియన్ సినిమాలు ముఖ్యంగా సౌత్ సినిమాలు రిలీజ్ అవుతున్నాయి అంటే వరల్డ్ వైడ్ మార్కెట్ ఓపెన్ అవుతుంది. తెలుగు తమిళ హిందీ భాషల సినిమాలు సబ్ టైటిల్స్ తో విధేశాల్లో కూడా రిలీజ్ అవుతున్నాయి"

print("Prediced class is " , predict_text_sample(test_text, inv_topic_dict, clf, count_vec))

In [None]:
# This sample text is about some political news from a telugu news website (Eenadu)

test_text = "హైదరాబాద్: తెలంగాణలో సంచలనం సృష్టించిన ‘ఎమ్మెల్యేలకు ఎర’ కేసులో హైకోర్టు కీలక తీర్పు వెల్లడించింది. ఈ కేసులో ముగ్గురు నిందితుల రిమాండ్‌కు ఉన్నత న్యాయస్థానం అనుమతించింది. నిందితులు వెంటనే సైబరాబాద్ కమిషనర్‌ స్టీఫెన్‌ రవీంద్ర ఎదుట లొంగిపోవాలని ఆదేశించింది. ఒకవేళ లొంగిపోకపోతే వారిని అరెస్టు చేసి ఏసీబీ కోర్టులో హాజరుపర్చాలని.. ఆ తర్వాత రిమాండ్‌కు తరలించాలని పోలీసులను ధర్మాసనం ఆదేశించింది."
print("Prediced class is " , predict_text_sample(test_text, inv_topic_dict, clf, count_vec))

In [None]:
# This sample text is about some sports news from a telugu news website (Eenadu)

test_text = "జట్టు స్కోరు 3 పరుగుల వద్ద రెండో ఓవర్లోనే బెయిర్ స్టో అవుట్ కాగా.. ఆ తర్వాత అఫ్ఘానిస్తా్న్ బౌలర్ల ఉచ్చులో ఇంగ్లండ్ బ్యాటర్లు విలవిలలాడిపోయారు. "
print("Prediced class is " , predict_text_sample(test_text, inv_topic_dict, clf, count_vec))

# Using Neural Networks to classify the given telugu texts

In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.utils import to_categorical  # Import to_categorical

# # Preprocessing
# tokenizer = Tokenizer(num_words=5000)  # Adjust 'num_words' as needed
# tokenizer.fit_on_texts(telugu_news_df['body'])
# sequences = tokenizer.texts_to_sequences(telugu_news_df['body'])
# data = pad_sequences(sequences, maxlen=200)  # Adjust 'maxlen' as needed
# # Check the number of unique categories in 'topic'
# num_categories = len(telugu_news_df['topic'].unique())

# # Adjust the final Dense layer of your model to match the number of categories
# model = Sequential()
# model.add(Embedding(input_dim=5000, output_dim=64, input_length=200))  # Adjust these dimensions as needed
# model.add(LSTM(64, return_sequences=True))
# model.add(LSTM(32))
# model.add(Dense(num_categories, activation='softmax'))  # Ensure this matches the number of categories

# # Compile the model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# # Convert labels to categorical
# labels = to_categorical(telugu_news_df['topic'], num_classes=num_categories)

# # Train the model
# model.fit(data, labels, epochs=10, batch_size=32)  # Adjust epochs and batch_size as needed


In [None]:
# def predict_category(text, tokenizer, model, inv_topic_dict):
#     # Preprocess the text
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=200)

#     # Predict
#     prediction = model.predict(padded_sequence)
#     predicted_category_index = np.argmax(prediction)
#     predicted_category = inv_topic_dict[predicted_category_index]

#     return predicted_category

In [None]:
# # Example
# sample_text = "ఆగండ్రా బాబూ రేయ్ ఆగండ్రే.. అంటూ తన ఫ్యాన్స్‌ కేరింతల్ని రెట్టింపు చేస్తూ తన మాస్ మేనరిజంతో స్పీచ్ ఇచ్చారు రవితేజ. ఈ సినిమా ప్రతి బ్లాక్.. అంత అందంగా వచ్చిందంటే.. సినిమాటోగ్రాఫర్ మది.. యాక్షన్ ఎపిసోడ్ చేసిన రామ్ లక్ష్మణ్, పీటర్ హెయిన్స్. ఆ ట్రైన్ ఎపిసోడ్‌ చేసింది పీటర్ హెయిన్స్. ఈ సినిమా రియల్ క్యారెక్టర్ కాబట్టి.. రియల్ ఎమోషన్స్‌తో సినిమా చేశారు. రియల్ యాక్షన్ ఎమోషన్స్ అంత బాగా రావడానికి కారణం ఏంటంటే.. రామ్ లక్ష్మణ్‌లు కూడా.. టైగర్ నాగేశ్వరరావు ఏరియాకి చెందిన చీరాల వారే. ఈ టైగర్ నాగేశ్వరరావు గురించి బాగా తెలిసిన వాళ్లు రామ్ లక్ష్మణ్‌లు."

# predicted_category = predict_category(sample_text, tokenizer, model, inv_topic_dict)
# print("Predicted Category:", predicted_category)

# Creating Sample User Data for 100 Users

In [None]:
telugu_news_df.head()

In [None]:
from sklearn.preprocessing import normalize

# Assume 100 users
num_users = 100
num_articles = telugu_news_df.shape[0]

# Seeded generator so the simulated users -- and every recommendation derived
# from them further down -- are reproducible across kernel restarts
rng = np.random.default_rng(42)

# Assume each user has a preferred topic (one of the five topic ids 0..4)
user_preferences = rng.integers(0, 5, num_users)

# Assume each user has a unique popularity score in [0, 1) for each article
user_article_popularity = rng.random((num_users, num_articles))

# Simulate user-article interaction matrix.
# Interaction level is drawn from 0..4 for articles of the user's preferred topic
# and from 0..2 otherwise, then scaled by the popularity score. Computed for all
# (user, article) pairs at once instead of one DataFrame lookup per pair.
article_topics = telugu_news_df["topic"].to_numpy()
is_preferred_topic = article_topics[np.newaxis, :] == user_preferences[:, np.newaxis]
interaction_levels = np.where(
    is_preferred_topic,
    rng.integers(0, 5, size=(num_users, num_articles)),
    rng.integers(0, 3, size=(num_users, num_articles)),
)
user_article_matrix = interaction_levels * user_article_popularity

# Normalize interactions to scale [0, 4] (each user's row is divided by its maximum)
user_article_matrix = normalize(user_article_matrix, norm='max', axis=1) * 4

# Compute the cosine similarity matrix between users (rows of the interaction matrix)
user_similarity_matrix = cosine_similarity(user_article_matrix)

# Content based Recommendation

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def generate_ngrams(tokens, n=2):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    The n-grams are listed in order of their starting position; the result is
    empty when ``tokens`` has fewer than ``n`` elements.
    """
    ngrams = []
    for start in range(len(tokens) - n + 1):
        window = tokens[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams

def custom_analyzer_with_ngrams(text, n=2):
    """Tokenize ``text`` and return its tokens followed by its n-grams.

    Tokens come from ``indic_tokenize.trivial_tokenize``; the n-grams (bigrams
    for the default ``n=2``) are built from those tokens by ``generate_ngrams``.
    """
    tokens = indic_tokenize.trivial_tokenize(text)
    return tokens + generate_ngrams(tokens, n)

In [None]:
def get_recommendations_by_index(article_index, df_content, top_n=10):
    """Return the ``top_n`` articles most similar in content to one article.

    A TF-IDF model is fitted on ``df_content.body_processed`` on every call, and
    each article is scored by the dot product of its TF-IDF vector with the
    vector of the query article.

    Parameters:
        article_index: row position of the query article in ``df_content``.
        df_content: DataFrame with a ``body_processed`` text column.
        top_n: maximum number of recommendations to return.

    Returns:
        ``(similar_articles, scores)``: the ``body_processed`` values of the
        recommended rows (a Series, best match first) and the list of their
        similarity scores in the same order. The query article itself is never
        part of the result.
    """
    # The callable analyzer already emits unigrams plus bigrams.
    # NOTE(review): scikit-learn documents `ngram_range` as ignored for callable
    # analyzers, so the argument below is presumably redundant -- confirm.
    vect = TfidfVectorizer(analyzer=custom_analyzer_with_ngrams, ngram_range=(1,2), max_df=0.85, min_df=0.05)
    tfidf_matrix = vect.fit_transform(df_content.body_processed.values)

    # Similarity of the query article against every article (itself included)
    article_vector = tfidf_matrix[article_index]
    cosine_sim = linear_kernel(article_vector, tfidf_matrix).flatten()

    # Rank best-first; the stable sort keeps row order among equal scores
    ranked_positions = np.argsort(-cosine_sim, kind="stable")

    # Exclude the query article by position. Dropping "the first result" is not
    # reliable: an article with an identical score (e.g. a duplicate body) can
    # sort ahead of the query, which would then be returned as a recommendation.
    query_position = article_index % cosine_sim.shape[0]
    ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

    scores = list(cosine_sim[ranked_positions])
    similar_articles = df_content['body_processed'].iloc[ranked_positions]
    return similar_articles, scores

# Example usage: content-based neighbours of the first article
article_index = 0
recommended_articles, scores = get_recommendations_by_index(article_index, telugu_news_df, 10)

# Show the query article, then each recommendation with its similarity score
print("Original Article:", telugu_news_df.iloc[article_index]['body_processed'][:100], "\n")
for article, score in zip(recommended_articles, scores):
    print(f"Score: {score:.4f} - Article: {article[:100]}...")  # Displaying first 100 chars


# Collaborative Filtering Recommendation

## User-user Filtering

In [None]:
def recommend_articles_user_user_with_scores(user_index, df_content, top_n=5):
    """Recommend unseen articles to a user via user-user collaborative filtering.

    Reads the module-level ``user_similarity_matrix`` and ``user_article_matrix``.
    Every article is scored by the similarity-weighted sum of the other users'
    interactions with it, so users who resemble the target user contribute
    more. Scores are divided by the best score to land on a 0-1 scale, like the
    scores of the other recommenders in this notebook.

    Parameters:
        user_index: row position of the target user in both matrices.
        df_content: article DataFrame; assumes its rows are positionally aligned
            with the columns of ``user_article_matrix`` -- TODO confirm.
        top_n: maximum number of recommendations to return.

    Returns:
        ``(recommended_articles, scores)``: the recommended rows of
        ``df_content`` (best first) and an array with the score of each
        recommended article, in the same order. Previously the second element
        held the similarities of the top users, which did not correspond to the
        returned articles although callers pair the two element by element.
    """
    similar_users_scores = user_similarity_matrix[user_index]

    # Every user except the target, selected by position rather than by
    # assuming the target sorts first among the similarities
    num_users = similar_users_scores.shape[0]
    neighbour_mask = np.arange(num_users) != (user_index % num_users)
    neighbour_weights = similar_users_scores[neighbour_mask]

    # Similarity-weighted sum of the neighbours' interactions: one score per article
    article_scores = neighbour_weights @ user_article_matrix[neighbour_mask]

    # Scale to 0-1 (skipped when there is nothing positive to scale by)
    best_score = article_scores.max() if article_scores.size else 0
    if best_score > 0:
        article_scores = article_scores / best_score

    # Best-first ranking, keeping only articles the user has not interacted with
    ranked_articles = np.argsort(article_scores)[::-1]
    not_interacted = user_article_matrix[user_index, ranked_articles] == 0
    recommended_article_indices = ranked_articles[not_interacted][:top_n]

    recommended_articles = df_content.iloc[recommended_article_indices]
    return recommended_articles, article_scores[recommended_article_indices]
    

# Example usage
user_index = 0  # index of the user
recommended_articles, user_scores = recommend_articles_user_user_with_scores(user_index, telugu_news_df, 10)

# Displaying each recommended article next to the score returned at the same position
for article, score in zip(recommended_articles['body'], user_scores):
    print(f"Score: {score:.4f} - Article: {article[:100]}...")  # Displaying first 100 chars

## Item-Item Filtering

In [None]:
# Compute the cosine similarity matrix between articles for Item-Item Collaborative Filtering.
# Each article is represented by its column of user interactions; the result is a
# dense (articles x articles) matrix.
article_similarity_matrix = cosine_similarity(user_article_matrix.T)

# Function to recommend similar articles for a given article using Item-Item Collaborative Filtering
def recommend_similar_articles_with_scores(article_index, df_content, top_n=5):
    """Return the ``top_n`` articles whose interaction pattern is closest to one article.

    Reads the module-level ``article_similarity_matrix``.

    Parameters:
        article_index: row position of the query article.
        df_content: article DataFrame; assumes its rows are positionally aligned
            with ``article_similarity_matrix`` -- TODO confirm.
        top_n: maximum number of recommendations to return.

    Returns:
        ``(recommended_articles, top_similar_scores)``: the recommended rows of
        ``df_content`` (most similar first) and the array of their similarity
        scores in the same order. The query article itself is never returned.
    """
    similar_articles_scores = article_similarity_matrix[article_index]

    # Best-first ranking of all articles
    ranked_articles = np.argsort(similar_articles_scores)[::-1]

    # Exclude the query article by position. Skipping "the first result" breaks
    # when another article ties with it, or when the article has no interactions
    # at all (its self-similarity is then 0 and it does not sort first).
    query_position = article_index % similar_articles_scores.shape[0]
    similar_articles_indices = ranked_articles[ranked_articles != query_position][:top_n]
    top_similar_scores = similar_articles_scores[similar_articles_indices]

    # Retrieve the recommended articles and their similarity scores
    recommended_articles = df_content.iloc[similar_articles_indices]
    return recommended_articles, top_similar_scores

# Example usage
article_index = 0  # index of the article
recommended_articles, scores = recommend_similar_articles_with_scores(article_index, telugu_news_df, 10)

# Displaying the original article
print("Original Article:", telugu_news_df.iloc[article_index]['body'][:100], "\n")

# Displaying the recommended articles and their similarity scores.
# (The former `['body'][:100]` sliced the Series to its first 100 rows, not the
# text to 100 characters; the per-article truncation happens in the f-string.)
for article, score in zip(recommended_articles['body'], scores):
    print(f"Score: {score:.4f} - Article: {article[:100]}...")  # Displaying first 100 chars

# Matrix Factorization using SVD

In [None]:
from scipy.linalg import svd

# Performing SVD.
# full_matrices=False returns the economy-size factors. The default full SVD would
# also build a square (articles x articles) Vt, of which only the first
# n_latent_features rows are ever used; those rows are the same in both variants.
U, sigma, Vt = svd(user_article_matrix, full_matrices=False)

# Number of latent features
n_latent_features = 75

# Reduce the matrices U, Sigma, and Vt to the leading latent features
U_reduced = U[:, :n_latent_features]
sigma_reduced = np.diag(sigma[:n_latent_features])
Vt_reduced = Vt[:n_latent_features, :]

# Reconstruct the (low-rank) user-article interaction matrix
reconstructed_matrix = U_reduced @ sigma_reduced @ Vt_reduced

In [None]:
def recommend_articles_svd_with_scores(user_index, df_content, top_n=5):
    """Recommend unseen articles to a user from the SVD-reconstructed ratings.

    Reads the module-level ``reconstructed_matrix`` and ``user_article_matrix``.

    Parameters:
        user_index: row position of the target user in both matrices.
        df_content: article DataFrame, indexed positionally with ``iloc``.
        top_n: maximum number of recommendations to return.

    Returns:
        ``(recommended_articles, scores)``: the recommended rows of
        ``df_content`` (highest predicted rating first) and the array of their
        predicted ratings divided by the user's maximum predicted rating.
    """
    predicted_ratings = reconstructed_matrix[user_index]

    # Divide by the user's best predicted rating, unless that rating is exactly 0
    peak_rating = np.max(predicted_ratings)
    if peak_rating == 0:
        normalized_scores = predicted_ratings
    else:
        normalized_scores = predicted_ratings / peak_rating

    # Best-first ranking, restricted to articles with no recorded interaction
    ranking = np.argsort(normalized_scores)[::-1]
    not_interacted = user_article_matrix[user_index, ranking] == 0
    picks = ranking[not_interacted][:top_n]

    return df_content.iloc[picks], normalized_scores[picks]


# Example usage
user_index = 0  # index of the user
recommended_articles, scores = recommend_articles_svd_with_scores(user_index, telugu_news_df, 10)

# Displaying the recommended articles and their predicted interest scores
for article, score in zip(recommended_articles['body'], scores):
    print(f"Score: {score:.4f} - Article: {article[:100]}...")  # Displaying first 100 chars

# Hybrid Content-Based Filtering and Matrix Factorization 

In [None]:
def weighted_hybrid_content_mf(article_index, user_index, df_content, top_n=10, weight_content_based=0.6, weight_matrix_factorization=0.4):
    """Blend content-based and matrix-factorization recommendations.

    Prints the user, the user's preferred topic (from the module-level
    ``user_preferences``) and the query article, then combines the outputs of
    ``get_recommendations_by_index`` and ``recommend_articles_svd_with_scores``.

    Parameters:
        article_index: index label of the query article in ``df_content`` (also
            passed on as its row position).
        user_index: row position of the target user.
        df_content: DataFrame with ``heading``, ``body`` and ``body_processed``.
        top_n: number of items requested from each recommender and maximum
            length of the result.
        weight_content_based: factor applied to the content-based scores.
        weight_matrix_factorization: factor applied to the SVD scores.

    Returns:
        A list of ``(body_processed, combined_score)`` tuples, highest score
        first. Articles are keyed by their processed body text, so an article
        suggested by both recommenders gets the sum of its weighted scores.
    """
    print(f"User Index: {user_index}")
    user_topic_preference = user_preferences[user_index]
    print(f"User Topic Preference: {user_topic_preference}\n")
    print(f"Original Article Index: {article_index}")
    print("Original Article Details:")
    print("Title:", df_content.loc[article_index, 'heading'])
    print("Body:", df_content.loc[article_index, 'body'][:200] + "...\n")

    # Get content-based recommendations
    cb_recommendations, cb_scores = get_recommendations_by_index(article_index, df_content, top_n)

    # Get matrix factorization recommendations
    mf_recommendations, mf_scores = recommend_articles_svd_with_scores(user_index, df_content, top_n)

    # Accumulate the weighted scores of both recommenders per article body
    combined_scores = {}
    weighted_sources = (
        (cb_recommendations, cb_scores, weight_content_based),
        (mf_recommendations, mf_scores, weight_matrix_factorization),
    )
    for recommendations, source_scores, weight in weighted_sources:
        for idx, score in zip(recommendations.index, source_scores):
            body = df_content.loc[idx, 'body_processed']
            combined_scores[body] = combined_scores.get(body, 0) + score * weight

    # Highest combined score first, truncated to top_n
    return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]


# Backward-compatible name. A later cell defines another function called
# `weighted_hybrid_recommendations` (item-item + matrix factorization), which
# rebinds that name; `weighted_hybrid_content_mf` keeps this variant reachable.
weighted_hybrid_recommendations = weighted_hybrid_content_mf

# Example usage: blend content-based and matrix-factorization recommendations
# for the first user, seeded by the first article
user_index = 0 
article_index = 0
top_recommendations = weighted_hybrid_recommendations(article_index, user_index, telugu_news_df, top_n=10)

# Displaying the hybrid recommendations (combined score and first 100 characters of the processed body)
for article, score in top_recommendations:
    print(f"Score: {score:.4f} - Article: {article[:100]}...")


# Hybrid Content-Based Filtering and User-user filtering

In [None]:
def hybrid_recommendations_1(article_index, user_index, df_content, user_similarity_matrix, user_article_matrix, top_n=10, weight_content_based=0.3, weight_user_user=0.7):
    """Blend content-based and user-user collaborative recommendations.

    Prints the user, the user's preferred topic (from the module-level
    ``user_preferences``) and the query article, then combines the outputs of
    ``get_recommendations_by_index`` and
    ``recommend_articles_user_user_with_scores``.

    ``user_similarity_matrix`` and ``user_article_matrix`` are accepted but not
    used inside this function; they are not forwarded to the recommenders it calls.

    Returns a list of ``(body_processed, combined_score)`` tuples, highest score
    first, at most ``top_n`` long. Articles are keyed by their processed body
    text, so weighted scores for the same text are summed.
    """
    # Echo the context of the request
    print(f"User Index: {user_index}")
    print(f"User Topic Preference: {user_preferences[user_index]}\n")
    print(f"Original Article Index: {article_index}")
    print("Original Article Details:")
    print("Title:", df_content.loc[article_index, 'heading'])
    print("Body:", df_content.loc[article_index, 'body'][:200] + "...\n")

    # Content-based first, then user-user collaborative filtering
    cb_recommendations, cb_scores = get_recommendations_by_index(article_index, df_content, top_n)
    uu_recommendations, uu_scores = recommend_articles_user_user_with_scores(user_index, df_content, top_n)

    combined_scores = {}

    def add_weighted(recommendations, scores, weight):
        # Add weight * score to the running total of each recommended article body
        for idx, score in zip(recommendations.index, scores):
            body = df_content.loc[idx, 'body_processed']
            combined_scores[body] = combined_scores.get(body, 0) + score * weight

    add_weighted(cb_recommendations, cb_scores, weight_content_based)
    add_weighted(uu_recommendations, uu_scores, weight_user_user)

    # Highest combined score first (ties keep insertion order), truncated to top_n
    ranked = sorted(combined_scores.items(), key=lambda pair: -pair[1])
    return ranked[:top_n]

# Example usage: blend content-based and user-user recommendations
# for the first user, seeded by the first article
user_index = 0 
article_index = 0
top_recommendations = hybrid_recommendations_1(article_index, user_index, telugu_news_df, user_similarity_matrix, user_article_matrix, top_n=10)

# Displaying the hybrid recommendations (combined score and first 100 characters of the processed body)
for article, score in top_recommendations:
    print(f"Score: {score:.4f} - Article: {article[:100]}...")

# Hybrid Item-Item Filtering and User-User Filtering

In [None]:
def weighted_hybrid_recommendations_item_user(article_index, user_index, df_content, user_similarity_matrix, article_similarity_matrix, top_n=10, weight_item_item=0.7, weight_user_user=0.3):
    """Blend item-item and user-user collaborative recommendations.

    Prints the user, the user's preferred topic (from the module-level
    ``user_preferences``) and the query article, then combines the outputs of
    ``recommend_similar_articles_with_scores`` and
    ``recommend_articles_user_user_with_scores``.

    ``user_similarity_matrix`` and ``article_similarity_matrix`` are accepted but
    not used inside this function; they are not forwarded to the recommenders it calls.

    Returns a list of ``(body_processed, combined_score)`` tuples, highest score
    first, at most ``top_n`` long. Articles are keyed by their processed body
    text, so weighted scores for the same text are summed.
    """
    # Echo the context of the request
    print(f"User Index: {user_index}")
    print(f"User Topic Preference: {user_preferences[user_index]}\n")
    print(f"Original Article Index: {article_index}")
    print("Original Article Details:")
    print("Title:", df_content.loc[article_index, 'heading'])
    print("Body:", df_content.loc[article_index, 'body'][:200] + "...\n")

    # Item-item first, then user-user collaborative filtering
    ii_recommendations, ii_scores = recommend_similar_articles_with_scores(article_index, df_content, top_n)
    uu_recommendations, uu_scores = recommend_articles_user_user_with_scores(user_index, df_content, top_n)

    # Walk both result sets in that order, adding weight * score per article body
    combined_scores = {}
    for recommendations, scores, weight in (
        (ii_recommendations, ii_scores, weight_item_item),
        (uu_recommendations, uu_scores, weight_user_user),
    ):
        for idx, score in zip(recommendations.index, scores):
            body = df_content.loc[idx, 'body_processed']
            combined_scores[body] = combined_scores.get(body, 0) + score * weight

    # Highest combined score first (ties keep insertion order), truncated to top_n
    ranked = sorted(combined_scores.items(), key=lambda pair: -pair[1])
    return list(ranked[:top_n])

# Example usage: blend item-item and user-user recommendations
# for the first user, seeded by the first article
user_index = 0
article_index = 0
top_recommendations = weighted_hybrid_recommendations_item_user(article_index, user_index, telugu_news_df, user_similarity_matrix, article_similarity_matrix, top_n=10)

# Displaying the hybrid recommendations (combined score and first 100 characters of the processed body)
for article, score in top_recommendations:
    print(f"Score: {score:.4f} - Article: {article[:100]}...")


# Hybrid Item-Item filtering and Matrix Factorization 

In [None]:
def weighted_hybrid_item_mf(article_index, user_index, df_content, top_n=10, weight_item_item=0.3, weight_matrix_factorization=0.7):
    """Blend item-item collaborative and matrix-factorization recommendations.

    Prints the user, the user's preferred topic (from the module-level
    ``user_preferences``) and the query article, then combines the outputs of
    ``recommend_similar_articles_with_scores`` and
    ``recommend_articles_svd_with_scores``.

    Parameters:
        article_index: index label of the query article in ``df_content`` (also
            passed on as its row position).
        user_index: row position of the target user.
        df_content: DataFrame with ``heading``, ``body`` and ``body_processed``.
        top_n: number of items requested from each recommender and maximum
            length of the result.
        weight_item_item: factor applied to the item-item scores.
        weight_matrix_factorization: factor applied to the SVD scores.

    Returns:
        A list of ``(body_processed, combined_score)`` tuples, highest score
        first. Articles are keyed by their processed body text, so an article
        suggested by both recommenders gets the sum of its weighted scores.
    """
    print(f"User Index: {user_index}")
    user_topic_preference = user_preferences[user_index]
    print(f"User Topic Preference: {user_topic_preference}\n")
    print(f"Original Article Index: {article_index}")
    print("Original Article Details:")
    print("Title:", df_content.loc[article_index, 'heading'])
    print("Body:", df_content.loc[article_index, 'body'][:200] + "...\n")

    # Item-Item Collaborative Filtering recommendations and scores
    ii_recommendations, ii_scores = recommend_similar_articles_with_scores(article_index, df_content, top_n)

    # Matrix Factorization recommendations and scores
    mf_recommendations, mf_scores = recommend_articles_svd_with_scores(user_index, df_content, top_n)

    # Accumulate the weighted scores of both recommenders per article body
    combined_scores = {}
    weighted_sources = (
        (ii_recommendations, ii_scores, weight_item_item),
        (mf_recommendations, mf_scores, weight_matrix_factorization),
    )
    for recommendations, source_scores, weight in weighted_sources:
        for idx, score in zip(recommendations.index, source_scores):
            body = df_content.loc[idx, 'body_processed']
            combined_scores[body] = combined_scores.get(body, 0) + score * weight

    # Highest combined score first, truncated to top_n
    return sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]


# Backward-compatible name. An earlier cell already defines a function called
# `weighted_hybrid_recommendations` (content-based + matrix factorization); this
# assignment rebinds the name to the item-item variant, exactly as the duplicate
# `def` used to, while `weighted_hybrid_item_mf` names this variant unambiguously.
weighted_hybrid_recommendations = weighted_hybrid_item_mf

# Example usage: blend item-item and matrix-factorization recommendations for the
# first user, seeded by the first article. The name called here is the one bound
# in this cell, which replaces the same-named function defined in an earlier cell.
user_index = 0 
article_index = 0
top_recommendations = weighted_hybrid_recommendations(article_index, user_index, telugu_news_df, top_n=10)

# Displaying the hybrid recommendations (combined score and first 100 characters of the processed body)
for article, score in top_recommendations:
    print(f"Score: {score:.4f} - Article: {article[:100]}...")
