Bot-1 (New-User Bot)

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from datetime import datetime
import numpy as np

Bot-1 Trial Run

In [35]:
def get_article_links(csv_file, num_clusters=10, current_time=None):
    """
    Returns links to top articles from each cluster

    Articles are grouped with K-Means on TF-IDF vectors of their combined
    text, every article gets a score blending recency with ``Mean_Time``, and
    the best-scoring article of each non-empty cluster is returned.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file providing the columns ``DateTime``, ``category``,
        ``subcategory``, ``headline``, ``Entire_News``, ``Mean_Time`` and
        ``News_Link``.
    num_clusters : int, default 10
        Requested number of clusters. It is capped at the number of rows,
        because K-Means cannot build more clusters than there are samples.
    current_time : datetime, optional
        Reference instant used to compute article age. Defaults to
        ``datetime.now()``; pass a fixed value for reproducible rankings.
        NOTE(review): assumes ``DateTime`` parses to timezone-naive values
        comparable with this reference -- confirm against the data.

    Returns
    -------
    list of dict
        One dict per non-empty cluster, ordered by cluster number, with the
        keys ``cluster`` (1-based), ``category``, ``headline`` and ``link``.
        Empty when the CSV has no rows.
    """
    # Read and preprocess data
    df = pd.read_csv(csv_file)
    if df.empty:
        # Nothing to vectorize or cluster; the vectorizer would fail on no documents.
        return []
    df['DateTime'] = pd.to_datetime(df['DateTime'])

    # Create combined text for clustering. Missing cells become empty strings:
    # a single NaN would otherwise turn the whole concatenation into NaN,
    # which TfidfVectorizer rejects.
    text_parts = [
        df[column].fillna('').astype(str)
        for column in ('category', 'subcategory', 'headline', 'Entire_News')
    ]
    df['combined_text'] = text_parts[0].str.cat(text_parts[1:], sep=' ')

    # Vectorize text
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    vectors = vectorizer.fit_transform(df['combined_text'])

    # Perform clustering (never ask for more clusters than there are articles)
    n_clusters = min(num_clusters, len(df))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['cluster'] = kmeans.fit_predict(vectors)

    # Calculate scores
    reference_time = datetime.now() if current_time is None else current_time
    age = pd.Timestamp(reference_time) - df['DateTime']
    df['time_diff'] = age.dt.total_seconds() / (24 * 3600)  # age in days
    # Recency decays from 1 (just published) towards 0 (old), so age lowers
    # the score. The previous `-exp(-0.1 * age)` did the opposite: it gave the
    # freshest articles the largest penalty.
    df['time_score'] = np.exp(-0.1 * df['time_diff'])
    df['final_score'] = (0.4 * df['time_score']) + (0.6 * df['Mean_Time'])  # Positive weight for ratings

    # Get top article links from each cluster
    recommended_links = []
    for cluster in range(n_clusters):
        cluster_scores = df.loc[df['cluster'] == cluster, 'final_score'].dropna()
        if cluster_scores.empty:
            # K-Means can leave a cluster empty (e.g. duplicate articles), and a
            # cluster may hold only unscorable rows; idxmax() would raise here.
            continue
        top_article = df.loc[cluster_scores.idxmax()]
        recommended_links.append({
            'cluster': cluster + 1,
            'category': top_article['category'],
            'headline': top_article['headline'],
            'link': top_article['News_Link']
        })

    return recommended_links

# Demo run: cluster the processed articles and print one recommendation per cluster.
if __name__ == "__main__":
    csv_file = "Processes_data.csv"
    recommended_links = get_article_links(csv_file)

    # Report header followed by a 50-character rule.
    print("\nRecommended Article Links:", "-" * 50, sep="\n")

    # One three-line entry per recommended article, preceded by a blank line.
    for article in recommended_links:
        print(
            f"\nCluster {article['cluster']} ({article['category']})",
            f"Headline: {article['headline']}",
            f"Link: {article['link']}",
            sep="\n",
        )


Recommended Article Links:
--------------------------------------------------

Cluster 1 (business)
Headline: The ‘untold’ market story
Link: https://www.thehindu.com/business/markets/the-untold-market-story/article65263161.ece

Cluster 2 (business)
Headline: In Sri Lanka, an economic crisis foretold
Link: https://www.thehindu.com/news/international/in-sri-lanka-an-economic-crisis-foretold/article65282210.ece

Cluster 3 (sci-tech)
Headline: If you want to know what’s true, then math is a pretty good place to start, says Abel Prize winner Dennis P. Sullivan
Link: https://www.thehindu.com/sci-tech/science/if-you-want-to-know-whats-true-then-math-is-a-pretty-good-place-to-start-says-abel-prize-winner-dennis-p-sullivan/article65255519.ece

Cluster 4 (entertainment)
Headline: The indie class of 2022
Link: https://www.thehindu.com/entertainment/music/the-indie-class-of-2022/article38416776.ece

Cluster 5 (entertainment)
Headline: When Is Grammys 2022?: Date, Time, Nominations, Performers, H