# Working code

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
from scipy import sparse
from implicit.als import AlternatingLeastSquares


In [None]:

# Load news data
news_data = pd.read_csv(r"C:\Users\HP\Desktop\news.tsv", sep='\t', header=None)

# Combine title and description into a single text field
news_data['combined_text'] = news_data[3] + " " + news_data[4]

# Drop rows with missing values in the combined text field
news_data_cleaned = news_data.dropna(subset=['combined_text'])

# Vectorize the combined text field using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = vectorizer.fit_transform(news_data_cleaned['combined_text'])

# Save the vectorizer and TF-IDF matrix for future use
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')

In [48]:
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save the cosine similarity matrix
joblib.dump(cosine_sim_matrix, 'cosine_sim_matrix.pkl')

['cosine_sim_matrix.pkl']

In [49]:
# Load user behavior data
behaviors = pd.read_csv(r"C:\Users\HP\Desktop\behaviors.tsv", sep='\t', header=None, names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])

# Extract user interactions (impressions with clicks)
data = []
for index, row in behaviors.iterrows():
    user_id = row['UserID']
    impressions = row['Impressions'].split()  # List of news items and click flags
    for impression in impressions:
        news_id, click_flag = impression.split('-')  # Split on the "-"
        data.append((user_id, news_id, int(click_flag)))  # Store as (user_id, news_id, click_flag)

# Create a DataFrame from the extracted data
interaction_df = pd.DataFrame(data, columns=['user_id', 'news_id', 'click_flag'])

# Encode user_id and news_id as categorical values
interaction_df['user_id'] = interaction_df['user_id'].astype('category')
interaction_df['news_id'] = interaction_df['news_id'].astype('category')

In [54]:
def get_content_recommendations(user_id, interaction_df, news_data_cleaned, cosine_sim_matrix, N=10):
    """
    Recommend N articles for a user based on content similarity.
    Args:
        user_id: The target user's ID.
        interaction_df: DataFrame containing user interactions with articles.
        news_data_cleaned: DataFrame containing cleaned news articles.
        cosine_sim_matrix: Precomputed cosine similarity matrix.
        N: Number of recommendations to return.
    
    Returns:
        List of recommended article IDs.
    """
    # Get all news articles the user has interacted with
    user_history = interaction_df[interaction_df['user_id'] == user_id]
    
    # Filter out news articles that are not present in the cleaned news data
    valid_news_ids = news_data_cleaned[0].values  # News article IDs from news_data_cleaned
    user_history_filtered = user_history[user_history['news_id'].isin(valid_news_ids)]
    
    if user_history_filtered.empty:
        return f"No valid articles found for user {user_id}."
    
    # Create a mapping of news_id to its index in news_data_cleaned
    news_id_to_index = {news_id: idx for idx, news_id in enumerate(news_data_cleaned[0])}
    
    # Get the indices of these articles in the cleaned news_data
    user_article_indices = [news_id_to_index[news_id] for news_id in user_history_filtered['news_id']]

    # Compute the average similarity score for all articles based on the user's clicked articles
    sim_scores = cosine_sim_matrix[user_article_indices].mean(axis=0)
    
    # Get the indices of the top N most similar articles
    top_article_indices = sim_scores.argsort()[-N:][::-1]
    
    # Get the corresponding article IDs for the top recommendations
    recommended_articles = news_data_cleaned.iloc[top_article_indices][0].values
    
    return recommended_articles

# Example user ID
user_id = 'U13740'

# Get recommendations for the user
recommended_articles = get_content_recommendations(user_id, interaction_df, news_data_cleaned, cosine_sim_matrix, N=10)

print(f"Recommended articles for user {user_id}: {recommended_articles}")

Recommended articles for user U13740: ['N43787' 'N27101' 'N36836' 'N25091' 'N43587' 'N34544' 'N52294' 'N560'
 'N20147' 'N26767']


In [10]:
behaviors = pd.read_csv(
    r"C:\Users\HP\Desktop\Flask-News-app\data\behaviors.tsv",
    sep='\t',
    header=None,
    names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
)


In [13]:
data = []
for index, row in behaviors.iterrows():
    user_id = row['UserID']
    impressions = row['Impressions'].split()  # List of news items and click flags
    for impression in impressions:
        if '-' in impression:
            news_id, click_flag = impression.split('-')  # Split on the "-"
            try:
                click_flag = int(click_flag)
                data.append((user_id, news_id, click_flag))  # Store as (user_id, news_id, click_flag)
            except ValueError:
                # Handle cases where click_flag is not an integer
                continue

In [14]:
interaction_df = pd.DataFrame(data, columns=['user_id', 'news_id', 'click_flag'])

# Filter out interactions where click_flag is not positive (assuming click_flag=1 indicates a click)
interaction_df = interaction_df[interaction_df['click_flag'] > 0].reset_index(drop=True)

# Encode user_id and news_id as categorical values
interaction_df['user_id'] = interaction_df['user_id'].astype('category')
interaction_df['news_id'] = interaction_df['news_id'].astype('category')

# Create mappings for user_id and news_id
user_id_mapping = {cat: code for code, cat in enumerate(interaction_df['user_id'].cat.categories)}
news_id_mapping = {cat: code for code, cat in enumerate(interaction_df['news_id'].cat.categories)}
reverse_news_id_mapping = {code: cat for cat, code in news_id_mapping.items()}

# Encode user_id and news_id
interaction_df['user_id_encoded'] = interaction_df['user_id'].cat.codes
interaction_df['news_id_encoded'] = interaction_df['news_id'].cat.codes

In [18]:
user_item_matrix = sparse.csr_matrix(
    (interaction_df['click_flag'], 
     (interaction_df['user_id_encoded'], interaction_df['news_id_encoded']))
)
joblib.dump(user_item_matrix, 'models/user_item_matrix.pkl')
# Initialize and train ALS model
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
als_model.fit(user_item_matrix)

  check_blas_config()
100%|██████████| 20/20 [00:03<00:00,  5.43it/s]


In [12]:
def get_collaborative_recommendations(user_id, interaction_df, als_model, user_id_mapping, reverse_news_id_mapping, user_item_matrix, N=10):
    """
    Recommend N articles for a user based on collaborative filtering (ALS).
    """
    if user_id not in user_id_mapping:
        return f"UserID {user_id} not found in interaction data."

    # Get the encoded user ID
    user_id_encoded = user_id_mapping[user_id]
    
    # Check if the user has interacted with any items
    if user_id_encoded >= user_item_matrix.shape[0]:
        return f"UserID {user_id} has no interactions."

    # Get the top N recommendations for the user
    recommended_articles_encoded, _ = als_model.recommend(user_id_encoded, user_item_matrix[user_id_encoded], N=N, filter_already_liked_items=True)
    
    # Map back to original news IDs
    recommended_articles = [reverse_news_id_mapping[news_id] for news_id in recommended_articles_encoded]
    
    return recommended_articles

# Content-Based Recommendation Function
def get_content_recommendations(user_id, interaction_df, news_data_cleaned, cosine_sim_matrix, news_id_mapping, N=10):
    """
    Recommend N articles for a user based on content similarity.
    """
    if user_id not in interaction_df['user_id'].cat.categories:
        return f"UserID {user_id} not found in interaction data."
    
    # Get all news articles the user has interacted with
    user_history = interaction_df[interaction_df['user_id'] == user_id]
    
    # Filter out news articles that are not present in the cleaned news data
    valid_news_ids = set(news_data_cleaned[0].astype(str))
    user_history_filtered = user_history[user_history['news_id'].isin(valid_news_ids)]
    
    if user_history_filtered.empty:
        return f"No valid articles found for user {user_id}."
    
    # Create a mapping of news_id to its index in news_data_cleaned
    news_id_to_index = {news_id: idx for idx, news_id in enumerate(news_data_cleaned[0].astype(str))}
    
    # Get the indices of these articles in the cleaned news_data
    user_article_indices = [news_id_to_index[news_id] for news_id in user_history_filtered['news_id']]
    
    # Compute the average similarity score for all articles based on the user's clicked articles
    sim_scores = cosine_sim_matrix[user_article_indices].mean(axis=0)
    
    # Get the indices of the top N most similar articles
    top_article_indices = sim_scores.argsort()[-N:][::-1]
    
    # Get the corresponding article IDs for the top recommendations
    recommended_articles = news_data_cleaned.iloc[top_article_indices][0].astype(str).tolist()
    
    return recommended_articles

# Hybrid Recommendation System
def get_hybrid_recommendations(user_id, interaction_df, news_data_cleaned, cosine_sim_matrix, 
                               als_model, user_id_mapping, reverse_news_id_mapping, user_item_matrix, 
                               N=10, alpha=0.5):
    """
    Recommend N articles for a user using a hybrid approach (blending content-based and collaborative filtering).
    
    Args:
    - user_id: The target user ID.
    - interaction_df: DataFrame containing user interactions.
    - news_data_cleaned: DataFrame containing cleaned news articles.
    - cosine_sim_matrix: Precomputed cosine similarity matrix for content-based filtering.
    - als_model: Trained ALS collaborative filtering model.
    - user_id_mapping: Dictionary mapping user IDs to encoded IDs.
    - reverse_news_id_mapping: Dictionary mapping encoded news IDs to original news IDs.
    - user_item_matrix: Sparse matrix used by ALS.
    - N: Number of recommendations to return.
    - alpha: Weight factor to blend collaborative and content-based scores (0 <= alpha <= 1).
    
    Returns:
    - List of recommended article IDs.
    """
    # Get collaborative recommendations
    collaborative_recommendations = get_collaborative_recommendations(
        user_id, interaction_df, als_model, user_id_mapping, reverse_news_id_mapping, user_item_matrix, N
    )
    
    # Get content-based recommendations
    content_recommendations = get_content_recommendations(
        user_id, interaction_df, news_data_cleaned, cosine_sim_matrix, news_id_mapping, N
    )
    
    # Handle cases where recommendations might be strings (error messages)
    if isinstance(collaborative_recommendations, str):
        collaborative_recommendations = []
    if isinstance(content_recommendations, str):
        content_recommendations = []
    
    # Convert all recommendations to lists if they aren't already
    collaborative_recommendations = list(collaborative_recommendations)
    content_recommendations = list(content_recommendations)
    
    # Determine the number of recommendations to take from each method
    num_collaborative = int(N * alpha)
    num_content = N - num_collaborative
    
    # Slice the recommendation lists
    collaborative_slice = collaborative_recommendations[:num_collaborative]
    content_slice = content_recommendations[:num_content]
    
    # Combine the two sets of recommendations, ensuring no duplicates
    combined_recommendations = collaborative_slice + [item for item in content_slice if item not in collaborative_slice]
    
    # If combined_recommendations are fewer than N, fill the remaining with other recommendations
    if len(combined_recommendations) < N:
        additional_needed = N - len(combined_recommendations)
        # Combine all unique recommendations from both methods
        all_unique = list(set(collaborative_recommendations + content_recommendations))
        # Exclude already recommended items
        additional_recommendations = [item for item in all_unique if item not in combined_recommendations]
        # Add the additional recommendations
        combined_recommendations += additional_recommendations[:additional_needed]
    
    # Return the top N recommendations
    return combined_recommendations[:N]

In [13]:
user_id = 'U13740'

# Get hybrid recommendations for the user
hybrid_recommendations = get_hybrid_recommendations(
    user_id, 
    interaction_df, 
    news_data_cleaned, 
    cosine_sim_matrix, 
    als_model, 
    user_id_mapping, 
    reverse_news_id_mapping, 
    user_item_matrix, 
    N=10, 
    alpha=0.7
)

# Display recommendations
print(f"Hybrid recommendations for user {user_id}: {hybrid_recommendations}")

Hybrid recommendations for user U13740: ['N52622', 'N59981', 'N55204', 'N56193', 'N31947', 'N21707', 'N35937', 'N55689', 'N58133', 'N28910']
