BOT-2

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from datasketch import MinHashLSHForest, MinHash
import re
import logging

BOT-2 Function (content-based filtering and collaborative filtering)

In [8]:
class NewsRecommendationBot:
    """Hybrid news recommender mixing content-based and collaborative filtering.

    Content candidates come from a MinHash LSH forest built over each
    article's headline, category, subcategory and full text. Collaborative
    candidates are the articles clicked by the users whose reading history has
    the highest Jaccard overlap with the target user. All candidates are then
    ranked by a weighted mix of category affinity and overall popularity.

    MinHash signatures are keyed by row position while article lookups use the
    row labels of the article table; the two agree because the table is read
    with ``pd.read_csv`` without an index column (default ``RangeIndex``).
    """

    # Permutation count shared by every MinHash signature and the LSH forest.
    # Kept in one place so the two can never drift apart.
    NUM_PERM = 128
    # Neighbours requested per read article; the query article itself is
    # normally one of them and is removed later with the rest of the history.
    CONTENT_NEIGHBOURS = 2
    # How many of the most-overlapping users feed the collaborative step.
    SIMILAR_USERS = 5
    # Weights of the two ranking signals.
    CATEGORY_WEIGHT = 0.7
    POPULARITY_WEIGHT = 0.3

    # Anything that is neither a word character nor whitespace becomes a space.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, user_data_path: str, process_data_path: str):
        """Load both CSV files and build every index used for recommendations.

        Args:
            user_data_path: CSV of user interactions. The class reads its
                ``UserId``, ``article_id``, ``TimeSpent`` and ``Clicked``
                columns.
            process_data_path: CSV of articles. The class reads its
                ``headline``, ``category``, ``subcategory``, ``Entire_News``
                and ``News_Link`` columns.

        Raises:
            Exception: whatever is raised while reading or converting the CSV
                data is logged and re-raised unchanged.
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Simple stopwords list
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'were', 'will', 'with'
        }

        # Load and preprocess data
        self._load_and_preprocess_data(user_data_path, process_data_path)

        # Initialize components
        self.logger.info("Initializing components...")
        # Kept as a public attribute; no method of this class uses it.
        self.vectorizer = HashingVectorizer(n_features=1000)
        self.lsh_forest = MinHashLSHForest(num_perm=self.NUM_PERM)

        # Prepare data structures
        self.logger.info("Preparing data structures...")
        self._prepare_data()
        self.logger.info("Initialization complete!")

        # Print debug information
        self._print_debug_info()

    def _load_and_preprocess_data(self, user_data_path: str, process_data_path: str):
        """Read both CSV files and normalise ``UserId`` to integers.

        Rows whose ``UserId`` cannot be parsed as a number are dropped. The
        results are stored in ``self.user_data`` and ``self.process_data``.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        self.logger.info("Loading data...")
        try:
            # Load data
            interactions = pd.read_csv(user_data_path)
            self.process_data = pd.read_csv(process_data_path)

            # Unparseable ids become NaN so they can be filtered out.
            user_ids = pd.to_numeric(interactions['UserId'], errors='coerce')
            valid = user_ids.notna()

            # Explicit copy: the column assignment below must never target a
            # view of the frame that was just read.
            self.user_data = interactions.loc[valid].copy()
            self.user_data['UserId'] = user_ids[valid].astype(int)

            self.logger.info(
                "Loaded %s user interactions and %s articles",
                len(self.user_data), len(self.process_data),
            )

        except Exception as e:
            self.logger.error(f"Error loading data: {str(e)}")
            raise

    def _print_debug_info(self):
        """Log summary statistics about the loaded interaction and article data."""
        user_ids = self.user_data['UserId']
        self.logger.info("\nDebug Information:")
        self.logger.info("Number of unique users: %s", len(user_ids.unique()))
        self.logger.info("User ID range: %s to %s", user_ids.min(), user_ids.max())
        self.logger.info("User ID data type: %s", user_ids.dtype)
        self.logger.info("Number of articles: %s", len(self.process_data))
        self.logger.info("\nFirst few User IDs:")
        self.logger.info(user_ids.head())

    def _simple_tokenize(self, text):
        """Lower-case ``text``, strip punctuation and split it into tokens.

        Tokens of one or two characters and tokens listed in
        ``self.stop_words`` are discarded.

        Args:
            text: any object; it is converted with ``str`` first.

        Returns:
            The remaining tokens, in their original order.
        """
        cleaned = self._PUNCTUATION.sub(' ', str(text).lower())
        return [
            word for word in cleaned.split()
            if word not in self.stop_words and len(word) > 2
        ]

    def _prepare_data(self):
        """Build the MinHash index and the user-article interaction matrix.

        Populates ``self.article_content``, ``self.minhashes``,
        ``self.lsh_forest`` and ``self.user_article_matrix``.
        """
        # One text blob per article; missing fields count as empty strings.
        self.article_content = (
            self.process_data['headline'].fillna('') + ' ' +
            self.process_data['category'].fillna('') + ' ' +
            self.process_data['subcategory'].fillna('') + ' ' +
            self.process_data['Entire_News'].fillna('')
        )

        # Create MinHash for each article, keyed by row position.
        self.minhashes = {}
        total_articles = len(self.article_content)

        self.logger.info("Processing %s articles...", total_articles)
        for idx, content in enumerate(self.article_content):
            if (idx + 1) % 100 == 0:
                self.logger.info("Processed %s/%s articles", idx + 1, total_articles)

            minhash = MinHash(num_perm=self.NUM_PERM)
            for token in self._simple_tokenize(content):
                minhash.update(token.encode('utf-8'))
            self.minhashes[idx] = minhash
            # The forest stores string keys; get_recommendations converts
            # them back with int().
            self.lsh_forest.add(str(idx), minhash)

        # The forest cannot be queried until index() has been called.
        self.lsh_forest.index()

        # Create user-article interaction matrix (pandas' default aggregation
        # is applied when a user has several rows for the same article).
        self.logger.info("Creating user-article interaction matrix...")
        self.user_article_matrix = pd.pivot_table(
            self.user_data,
            values='TimeSpent',
            index='UserId',
            columns='article_id',
            fill_value=0
        )

    def get_recommendations(self, user_id, n_recommendations=10):
        """Return the top hybrid recommendations for one user.

        Candidates are the LSH neighbours of every article the user has read
        plus the articles clicked by the most similar users, minus the user's
        own history. Each candidate is scored as
        ``CATEGORY_WEIGHT * category_score + POPULARITY_WEIGHT * popularity``.

        Args:
            user_id: user identifier; anything ``int()`` accepts.
            n_recommendations: maximum number of items to return.

        Returns:
            A list of dicts with keys ``article_id``, ``headline``, ``link``,
            ``category`` and ``score``, best score first. Equal scores are
            ordered by ascending ``article_id`` so results are reproducible.

        Raises:
            ValueError: if the user has no rows in the interaction data (or
                ``user_id`` is not a valid integer string).
            Exception: any other failure is logged and re-raised unchanged.
        """
        try:
            # Convert user_id to int if it's not already
            user_id = int(user_id)

            # Check if user exists
            known_user_ids = self.user_data['UserId'].unique()
            if user_id not in known_user_ids:
                self.logger.error(
                    "User %s not found. Available user IDs: %s",
                    user_id, sorted(known_user_ids),
                )
                raise ValueError(f"User {user_id} not found in the dataset")

            # Get user's reading history
            user_articles = self.user_data[self.user_data['UserId'] == user_id]
            history_ids = set(user_articles['article_id'])

            # Content-based candidates. Each distinct article is queried once;
            # repeats would only add duplicates that the set below discards.
            content_recommendations = []
            for article_idx in user_articles['article_id'].unique():
                if article_idx in self.minhashes:
                    similar_articles = self.lsh_forest.query(
                        self.minhashes[article_idx], self.CONTENT_NEIGHBOURS
                    )
                    content_recommendations.extend(map(int, similar_articles))

            # Collaborative candidates: Jaccard overlap between reading
            # histories, computed from a single pass over the interactions.
            # sort=False keeps users in order of first appearance, which
            # decides ties in the stable sort below.
            similar_users = []
            articles_by_user = self.user_data.groupby('UserId', sort=False)['article_id']
            for other_user, other_ids in articles_by_user:
                if other_user == user_id:
                    continue
                other_articles = set(other_ids)
                similarity = (
                    len(other_articles & history_ids)
                    / len(other_articles | history_ids)
                )
                similar_users.append((other_user, similarity))

            # NOTE: there is no minimum similarity, so users with zero overlap
            # can still be selected when fewer than SIMILAR_USERS users share
            # an article with the target user.
            similar_users.sort(key=lambda pair: pair[1], reverse=True)
            top_users = [user for user, _ in similar_users[:self.SIMILAR_USERS]]

            neighbour_clicks = self.user_data[
                self.user_data['UserId'].isin(top_users) &
                (self.user_data['Clicked'] == 1)
            ]
            collaborative_recommendations = neighbour_clicks['article_id'].tolist()

            # Combine recommendations and drop what the user already read
            all_recommendations = set(content_recommendations + collaborative_recommendations)
            all_recommendations = all_recommendations - history_ids

            # Per-request lookups built once instead of once per candidate.
            interaction_counts = self.user_data['article_id'].value_counts().to_dict()
            total_interactions = len(self.user_data)
            history_size = len(user_articles)
            category_scores = {}

            scored_recommendations = []
            for article_id in all_recommendations:
                if article_id not in self.process_data.index:
                    continue

                article_data = self.process_data.loc[article_id]
                category = article_data['category']

                # Share of the user's interactions that fall in this category.
                if category not in category_scores:
                    same_category_ids = self.process_data.index[
                        self.process_data['category'] == category
                    ]
                    matches = user_articles['article_id'].isin(same_category_ids)
                    category_scores[category] = int(matches.sum()) / history_size
                category_score = category_scores[category]

                # Share of all interactions that involve this article.
                popularity_score = (
                    int(interaction_counts.get(article_id, 0)) / total_interactions
                )
                final_score = (
                    self.CATEGORY_WEIGHT * category_score
                    + self.POPULARITY_WEIGHT * popularity_score
                )

                scored_recommendations.append({
                    'article_id': article_id,
                    'headline': article_data['headline'],
                    'link': article_data['News_Link'],
                    'category': category,
                    'score': final_score
                })

            # Best score first; article_id breaks ties deterministically.
            scored_recommendations.sort(
                key=lambda rec: (-rec['score'], rec['article_id'])
            )
            return scored_recommendations[:n_recommendations]

        except Exception as e:
            self.logger.error(f"Error getting recommendations: {str(e)}")
            raise

In [9]:
# Build the recommender from the two CSV files and print the suggestions
# for a single user; any failure is reported as a one-line message.
try:
    bot = NewsRecommendationBot(
        user_data_path='UserProfileData.csv',
        process_data_path='Processes_data.csv',
    )

    # User whose recommendations are displayed (change to test another user)
    user_id = 6
    recommendations = bot.get_recommendations(user_id=user_id)

    if not recommendations:
        print("No recommendations found")
    else:
        print(f"\nRecommended Articles for User {user_id}:")
        for rank, item in enumerate(recommendations, start=1):
            print(f"\n{rank}. {item['headline']}")
            print(f"   Category: {item['category']}")
            print(f"   Link: {item['link']}")
            print(f"   Score: {item['score']:.3f}")

except Exception as err:
    print(f"Error: {err}")

INFO:__main__:Loading data...
INFO:__main__:Loaded 1310 user interactions and 2956 articles
INFO:__main__:Initializing components...
INFO:__main__:Preparing data structures...
INFO:__main__:Processing 2956 articles...
INFO:__main__:Processed 100/2956 articles
INFO:__main__:Processed 200/2956 articles
INFO:__main__:Processed 300/2956 articles
INFO:__main__:Processed 400/2956 articles
INFO:__main__:Processed 500/2956 articles
INFO:__main__:Processed 600/2956 articles
INFO:__main__:Processed 700/2956 articles
INFO:__main__:Processed 800/2956 articles
INFO:__main__:Processed 900/2956 articles
INFO:__main__:Processed 1000/2956 articles
INFO:__main__:Processed 1100/2956 articles
INFO:__main__:Processed 1200/2956 articles
INFO:__main__:Processed 1300/2956 articles
INFO:__main__:Processed 1400/2956 articles
INFO:__main__:Processed 1500/2956 articles
INFO:__main__:Processed 1600/2956 articles
INFO:__main__:Processed 1700/2956 articles
INFO:__main__:Processed 1800/2956 articles
INFO:__main__:Pro


Recommended Articles for User 6:

1. Switch Mobility to build advanced manufacturing facility in Spain
   Category: business
   Link: https://www.thehindu.com/business/switch-mobility-to-build-advanced-manufacturing-facility-in-spain/article65245728.ece
   Score: 0.211

2. IndiGo ends streak of quarterly losses; posts Rs 130 crore net profit in Q3 [details inside]
   Category: business
   Link: https://www.ibtimes.co.in/indigo-ends-streak-quarterly-losses-posts-rs-130-crore-net-profit-q3-details-inside-845407
   Score: 0.211

3. No rationale for lowering 30% tax on crypto profits: Revenue Secretary Tarun Bajaj
   Category: business
   Link: https://www.thehindu.com/business/budget/no-rationale-for-lowering-30-tax-on-crypto-profits-bajaj/article38372516.ece
   Score: 0.211

4. Air India: History reveals iconic rise but fall in wrong hands [Chronology]
   Category: business
   Link: https://www.ibtimes.co.in/air-india-history-reveals-iconic-rise-fall-wrong-hands-chronology-845154
   Sco