<a href="https://colab.research.google.com/github/Abubakar-Aliyu-code/Machine-Learning-and-Data-Science-Projects/blob/main/ChatJourno.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from spacy import load
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import torch
import nltk
from yake import KeywordExtractor



from flask import Flask, request, jsonify

# Flask application object; the /chat route further down is registered on it.
app = Flask(__name__)

# spaCy pipeline loaded once at import time and shared by the helpers below
# (retrieve_relevant_news reads this module-level name directly).
nlp = load('en_core_web_sm')

# Fetches the WordNet corpus through NLTK every time the module is imported.
# NOTE(review): nothing visible in this file reads WordNet — confirm it is needed.
nltk.download('wordnet')


# Path of the news CSV that is read at start-up via load_news_data.
file_path = 'source.csv'

# Load the CSV file
def load_news_data(file_path):
    """Read the news CSV at ``file_path`` and return it as a DataFrame.

    ``file_path`` is handed to ``pandas.read_csv`` unchanged, with every
    other option left at its default.
    """
    return pd.read_csv(file_path)


# Preprocess the data
def preprocess_data(news_data):
    """Clean ``news_data`` in place and hand the same DataFrame back.

    Three steps, in order: duplicate rows are dropped, every column name has
    its surrounding whitespace stripped and each ``.`` turned into ``_``, and
    missing values are replaced by empty strings.
    """
    def tidy(column_name):
        # Normalise one header: trim it, then swap dots for underscores.
        return column_name.strip().replace('.', '_')

    news_data.drop_duplicates(inplace=True)
    news_data.columns = list(map(tidy, news_data.columns))
    news_data.fillna(value='', inplace=True)
    return news_data

# TF-IDF Vectorization
def compute_tfidf_vectorizer():
    """Create a fresh, unfitted ``TfidfVectorizer`` with ``stop_words='english'``."""
    return TfidfVectorizer(stop_words='english')

def compute_tfidf_matrix(tfidf_vectorizer, processed_news_data):
    """Fit ``tfidf_vectorizer`` on the ``CONTENTS`` column and return the result.

    The vectorizer is fitted as a side effect of ``fit_transform``; the value
    returned is whatever that call produces for the column.
    """
    article_texts = processed_news_data['CONTENTS']
    return tfidf_vectorizer.fit_transform(article_texts)

# Retrieve relevant news articles
def retrieve_relevant_news(user_query, tfidf_vectorizer, tfidf_matrix, processed_news_data, top_n=5):
    """Return the articles that best match ``user_query``.

    Articles are first narrowed to those whose ``CONTENTS`` text contains at
    least one entity (as found by the module-level spaCy pipeline ``nlp``)
    whose text equals a PERSON, GPE or ORG entity found in the query. The
    remaining articles are ranked by ``linear_kernel`` between the query's
    TF-IDF vector and their own.

    Args:
        user_query: Free-text query string.
        tfidf_vectorizer: Vectorizer whose ``transform`` is called on the
            query and on the filtered articles; it is expected to be fitted.
        tfidf_matrix: Not read by this function. Kept in the signature so
            existing callers that pass it keep working.
        processed_news_data: DataFrame with a ``CONTENTS`` column.
        top_n: Maximum number of rows to return; values below 1 yield no rows.

    Returns:
        Rows of ``processed_news_data``, best match first. The frame is empty
        when the query has no PERSON/GPE/ORG entity or no article mentions one.
    """
    # Parse the query once; only these three entity labels take part in matching.
    query_entities = {
        entity.text
        for entity in nlp(user_query).ents
        if entity.label_ in ('PERSON', 'GPE', 'ORG')
    }

    # With no query entity nothing can match, so avoid running spaCy over
    # every article just to build an all-False mask.
    if not query_entities:
        return processed_news_data.iloc[0:0]

    def entity_match(content):
        # True when the article shares at least one entity text with the query.
        content_entities = {entity.text for entity in nlp(content).ents}
        return not query_entities.isdisjoint(content_entities)

    entity_filtered_news = processed_news_data[processed_news_data['CONTENTS'].apply(entity_match)]

    if entity_filtered_news.empty:
        return entity_filtered_news

    # Vectorise only the surviving articles so row i of the matrix is row i
    # of entity_filtered_news, which is what the positional lookup below needs.
    tfidf_matrix_filtered = tfidf_vectorizer.transform(entity_filtered_news['CONTENTS'])
    user_tfidf = tfidf_vectorizer.transform([user_query])
    cosine_similarities = linear_kernel(user_tfidf, tfidf_matrix_filtered).flatten()

    # argsort is ascending: reverse it and keep the first top_n positions.
    # The positions always lie inside the filtered frame, so no bounds check.
    related_docs_indices = cosine_similarities.argsort()[::-1][:max(top_n, 0)]

    return entity_filtered_news.iloc[related_docs_indices]

# Summarize retrieved news articles
def summarize_text(text, nlp, max_sentence=3):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Every token that is not a stop word, punctuation or whitespace is counted
    case-insensitively over the whole text. A sentence's score is the sum of
    those counts over its tokens; sentences without any counted token are
    never selected.

    Args:
        text: The text to summarise.
        nlp: Callable spaCy pipeline used to tokenise ``text`` and split it
            into sentences.
        max_sentence: Upper bound on the number of sentences in the summary.

    Returns:
        The selected sentences in their original order, joined by single
        spaces. Returns an empty string when no sentence scores.
    """
    doc = nlp(text)

    def is_keyword(token):
        # ``token.text not in string.punctuation`` is a substring test, so by
        # itself it lets whitespace tokens and multi-character punctuation
        # such as "..." or "--" count as keywords. The spaCy flags reject
        # those; the original test is kept so nothing new is admitted.
        if token.is_punct or token.is_space:
            return False
        return token.text.lower() not in STOP_WORDS and token.text not in string.punctuation

    keyword_counts = Counter(token.text.lower() for token in doc if is_keyword(token))

    sentence_scores = {}
    for sent in doc.sents:
        score = sum(keyword_counts.get(word.text.lower(), 0) for word in sent)
        if score:
            sentence_scores[sent] = score

    # Pick the best sentences by score, then restore reading order.
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:max_sentence]
    top_sentences.sort(key=lambda sent: sent.start)

    return ' '.join(sent.text for sent in top_sentences)

def summarize_relevant_news(relevant_news, nlp):
    """Summarise every row of ``relevant_news``.

    Returns one dict per row, in row order, holding the row's ``NEWS TITLE``,
    ``NEWS LINK`` and ``SOURCE`` values under ``title``, ``link`` and
    ``source``, plus ``summary``: the result of ``summarize_text`` on the
    row's ``CONTENTS``.
    """
    return [
        {
            'title': article['NEWS TITLE'],
            'link': article['NEWS LINK'],
            'source': article['SOURCE'],
            'summary': summarize_text(article['CONTENTS'], nlp),
        }
        for _, article in relevant_news.iterrows()
    ]


# Load and clean the corpus, then fit the TF-IDF index on it. This runs once,
# at import time, and the three names are read by the /chat handler.
news_data = preprocess_data(load_news_data(file_path))
tfidf_vectorizer = compute_tfidf_vectorizer()
tfidf_matrix = compute_tfidf_matrix(tfidf_vectorizer, news_data)

@app.route('/chat', methods=['POST'])
def chat():
    """Handle ``POST /chat``: return the articles matching the posted query.

    Expects a JSON object body with a non-empty string under ``query``.

    Returns:
        ``{"entities": [...]}`` where each item is one matching row of
        ``news_data`` as a column-name -> value mapping (the list is empty
        when nothing matches), or HTTP 400 with an ``error`` message when the
        body is not JSON or carries no usable ``query``.
    """
    # silent=True yields None for a missing or malformed JSON body, so every
    # unusable request is handled by the single 400 branch below.
    payload = request.get_json(silent=True)
    user_query = payload.get('query') if isinstance(payload, dict) else None
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty string 'query'."}), 400

    relevant_news = retrieve_relevant_news(user_query, tfidf_vectorizer, tfidf_matrix, news_data)

    # jsonify cannot serialise a DataFrame; send its rows as plain dicts.
    response = {'entities': relevant_news.to_dict(orient='records')}
    return jsonify(response)

# Start the Flask server only when this file is executed directly, not when
# it is imported.
# NOTE(review): debug=True switches on Flask's debug mode — confirm this entry
# point is never used to serve a deployed instance.
if __name__ == '__main__':
    app.run(debug=True)


ModuleNotFoundError: No module named 'yake'