In [23]:
import json
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [24]:
def load_json(file_path):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Blank or whitespace-only lines are skipped silently. A non-blank line
    that is not valid JSON is reported on stdout and skipped, so a single
    corrupt row does not abort the whole load.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list: The decoded value of every valid line, in file order.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # An empty separator/trailing line is not a record; passing it to
            # json.loads would raise and print a misleading decode error.
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line: {line}")
                print(e)
    return data



In [25]:
def prepare_data(data):
    """Build one searchable text document per record.

    Each document is the record's ``category``, ``headline`` and
    ``short_description`` values joined by single spaces. A missing key
    contributes an empty string, as does a key whose value is ``None``
    (JSON ``null``); any other non-string value is converted with ``str``.

    Args:
        data: Iterable of dict-like records (anything offering ``.get``).

    Returns:
        tuple: ``(documents, index_to_record)`` where ``documents[i]`` is the
        text built from the i-th record and ``index_to_record[i]`` is that
        record's position in ``data``.
    """
    fields = ('category', 'headline', 'short_description')
    documents = []
    index_to_record = []
    for idx, record in enumerate(data):
        parts = []
        for key in fields:
            value = record.get(key, '')
            # str.join raises TypeError on None/numbers, so normalise every
            # field to text first; plain strings pass through unchanged.
            parts.append('' if value is None else str(value))
        documents.append(' '.join(parts))
        index_to_record.append(idx)
    return documents, index_to_record


In [26]:
def search_tfidf(documents, sentence, index_to_record):
    """Rank documents by TF-IDF similarity to a query sentence.

    A fresh ``TfidfVectorizer`` is fitted on ``documents`` on every call, the
    query is projected into the same vocabulary, and each document is scored
    by the dot product of its vector with the query vector. With the
    vectorizer's default L2 row normalisation this dot product is the cosine
    similarity.

    Args:
        documents: Sequence of text documents to search.
        sentence: Query text.
        index_to_record: Sequence mapping document position ``i`` to the
            identifier reported for that document; expected to have the same
            length as ``documents``.

    Returns:
        list: ``(record_id, score)`` tuples for every entry of
        ``index_to_record``, sorted by score in descending order. Ties keep
        their original relative order.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    query_vec = vectorizer.transform([sentence])
    # Use the sparse-aware matmul operator: np.dot does not understand
    # scipy.sparse objects and only happens to work for the legacy matrix
    # classes, whose `*` means matrix product.
    cosine_similarities = (tfidf_matrix @ query_vec.T).toarray().flatten()

    results = [(index_to_record[i], cosine_similarities[i]) for i in range(len(index_to_record))]
    # sorted() is stable, so equal scores stay in document order.
    results = sorted(results, key=lambda x: x[1], reverse=True)

    return results


In [27]:
def count_occurrences(data, sentence):
    """Count case-insensitive occurrences of a phrase across all records.

    The ``category``, ``headline`` and ``short_description`` fields of every
    record are searched with ``str.count``, so matches are non-overlapping
    substring matches.

    Args:
        data: Iterable of dict-like records (anything offering ``.get``).
        sentence: Phrase to look for.

    Returns:
        int: Total number of matches. ``0`` when ``sentence`` is empty.
    """
    sentence_lower = sentence.lower()
    # str.count('') returns len(text) + 1, which would turn an empty query
    # into a large, meaningless total.
    if not sentence_lower:
        return 0

    count = 0
    for record in data:
        for key in ('category', 'headline', 'short_description'):
            value = record.get(key)
            # Missing keys, JSON nulls and other non-text values have nothing
            # to search and would otherwise fail on .lower().
            if isinstance(value, str):
                count += value.lower().count(sentence_lower)

    return count


In [28]:
def main(file_path, sentence, top_k=5):
    """Search a JSON Lines dataset for a phrase and print a short report.

    Prints the total number of literal occurrences of ``sentence`` followed by
    the ``top_k`` records with the highest TF-IDF similarity to it.

    Args:
        file_path: Path of the dataset, passed to ``load_json``.
        sentence: Phrase to count and to use as the similarity query.
        top_k: Number of highest-scoring records to print. Defaults to 5.

    Returns:
        None. All results are written to stdout.
    """
    data = load_json(file_path)
    if not data:
        # Fitting the vectorizer on zero documents raises an opaque
        # "empty vocabulary" error, so report the real problem instead.
        print(f"No records loaded from '{file_path}'.")
        return

    documents, index_to_record = prepare_data(data)

    results = search_tfidf(documents, sentence, index_to_record)
    total_occurrences = count_occurrences(data, sentence)

    print(f"Total occurrences of '{sentence}': {total_occurrences}")
    print(" ")
    for record_idx, score in results[:top_k]:
        record = data[record_idx]
        category = record.get('category', 'N/A')
        headline = record.get('headline', 'N/A')
        short_desc = record.get('short_description', 'N/A')
        # Shown 1-based so the number matches the record's position in the file
        # when every line decoded successfully.
        print(f"Record: {record_idx + 1}")
        print(f"Category: {category}")
        print(f"Headline: {headline}")
        print(f"Short Description: {short_desc}")
        print(f"Similarity Score: {score}")
        print(" ")
        print(" ")




In [29]:
# Inputs for this run: the dataset to scan and the phrase to look up.
file_path = 'News_Category_Dataset_v3.json'
sentence = "donald trump"

# Print the occurrence total and the best-matching records.
main(file_path=file_path, sentence=sentence)


Total occurrences of 'donald trump': 6585
 
Record: 56049
Category: POLITICS
Headline: Don King Uses The N-Word In Speech Introducing Donald Trump
Short Description: “America needs Donald Trump. We need Donald Trump, especially black people."
Similarity Score: 0.6295646236077033
 
 
Record: 71706
Category: POLITICS
Headline: Donald Trump: 'Nobody Has More Respect For Women Than Donald Trump'
Short Description: Right.
Similarity Score: 0.6241176721849603
 
 
Record: 68599
Category: POLITICS
Headline: Here's What Obama Has To Say About Donald Trump
Short Description: Where's The Donald?
Similarity Score: 0.6054161258538254
 
 
Record: 93240
Category: ENTERTAINMENT
Headline: Which Donald Trump Are You?
Short Description: And you thought there was only one "The Donald."
Similarity Score: 0.5605350180368722
 
 
Record: 94979
Category: SPORTS
Headline: Donald Trump Agrees Hosting Golf Tournament On Trump's Golf Course A Bad Idea
Short Description: Even Donald Trump thinks people should be ba