# Search Engine of News Headlines

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from wordcloud import WordCloud
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# NOTE(review): with no `category=` / `module=` argument this installs an
# "ignore" filter that matches every warning, so deprecation notices from the
# libraries used below are hidden as well. Narrow the filter once the specific
# noisy warning is known.
warnings.filterwarnings('ignore')

In [27]:
# Location of the dataset; lines=True tells pandas the file holds one JSON
# record per line rather than a single JSON document.
DATA_PATH = 'News_Category_Dataset_v3.json'
df = pd.read_json(DATA_PATH, lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
def preprocess_text(text):
    """Normalise a piece of text before TF-IDF vectorisation.

    Every character that is neither a word character (letter, digit,
    underscore) nor whitespace is deleted, then the result is lower-cased.
    Punctuation is removed rather than replaced by a space, so a hyphenated
    token such as "N-Word" becomes the single token "nword".

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, which is how
        pandas represents an absent string) are treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text ('' for missing input).
    """
    # Missing cells arrive as None/NaN when this is used with Series.apply;
    # re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    return text.lower()

# Clean every entry of the 'headline' column and keep the result alongside
# the original in a new 'cleaned_headline' column.
df['cleaned_headline'] = df['headline'].map(preprocess_text)

In [8]:
# Initialize the TF-IDF Vectorizer; stop_words='english' makes it drop
# scikit-learn's built-in English stop-word list from the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the cleaned headlines and encode
# each headline as one row of the resulting matrix.
X = vectorizer.fit_transform(df['cleaned_headline'])

# X is a sparse matrix of TF-IDF features. search_query() reads `vectorizer`
# and `X` as globals at call time, so the two must always come from the same fit.
print(X.shape)  # (number of headlines, vocabulary size)

(209527, 68712)


In [9]:
# Vocabulary learned by the vectorizer, returned as a 1-D array of terms
feature_names = vectorizer.get_feature_names_out()
print("Number of features:", feature_names.shape[0])

Number of features: 68712


In [32]:
def search_query(query, top_n=10):
    """Return the headlines most similar to ``query``.

    The query is cleaned with ``preprocess_text``, projected into TF-IDF space
    with the module-level ``vectorizer`` and compared against every row of the
    module-level matrix ``X`` by cosine similarity. ``vectorizer``, ``X`` and
    ``df`` are read at call time, so they must still be the headline-fitted
    vectorizer, its matrix, and the frame the matrix was built from.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_n : int, default 10
        Maximum number of results to return.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A frame with the single column 'headline' and the matching similarity
        scores, both ordered from most to least similar. Rows whose similarity
        is 0 are omitted, so fewer than ``top_n`` rows (possibly none) are
        returned when the query shares little or no vocabulary with the
        headlines (e.g. an empty or stop-word-only query).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    # A slice of [-0:] would select the whole array, so reject it up front.
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    # Preprocess and vectorize the query
    query_cleaned = preprocess_text(query)
    query_vector = vectorizer.transform([query_cleaned])

    # Cosine similarity between the query and every headline (1-D array)
    scores = cosine_similarity(query_vector, X)[0]

    # Indices of the top_n highest scores, best first
    top_indices = scores.argsort()[-top_n:][::-1]

    # A score of 0 means no shared term at all; such rows are not matches and
    # would otherwise be reported as arbitrary "similar" headlines.
    top_indices = top_indices[scores[top_indices] > 0]

    top_headlines = df.iloc[top_indices]
    top_scores = scores[top_indices]

    return top_headlines[['headline']], top_scores

In [57]:
# Interactive demo of the headline search.
# For a non-interactive run, replace the input() call with a literal query,
# e.g. user_query = "Million Americans Roll".
user_query = input("Enter your search query: ")

# Run the search
top_headlines, top_scores = search_query(user_query)

# Report each hit with its 1-based rank and its cosine similarity
print("\nTop 10 similar headlines with similarity scores:")
ranked_hits = zip(top_headlines['headline'], top_scores)
for rank, (title, similarity) in enumerate(ranked_hits, start=1):
    print(f"{rank}. Headline: {title}\n   Similarity Score: {similarity:.4f}\n")

Enter your search query:  Donald Trump



Top 10 similar headlines with similarity scores:
1. Headline: You Can Be Donald Trump too
   Similarity Score: 1.0000

2. Headline: Which Donald Trump Are You?
   Similarity Score: 0.8193

3. Headline: Donald Trump: 'Nobody Has More Respect For Women Than Donald Trump'
   Similarity Score: 0.7651

4. Headline: Bobby Jindal Rails Against 'Egomaniac' Donald Trump
   Similarity Score: 0.6254

5. Headline: Donald Trump does not have a campaign
   Similarity Score: 0.6143

6. Headline: The Five Reasons Donald Trump Will Be Our Next President
   Similarity Score: 0.6116

7. Headline: Don King Uses The N-Word In Speech Introducing Donald Trump
   Similarity Score: 0.6049

8. Headline: Here's What Obama Has To Say About Donald Trump
   Similarity Score: 0.5865

9. Headline: Donald Trump Is Handling The Transition Exactly Like He Campaigned
   Similarity Score: 0.5649

10. Headline: Donald Trump -- Yes, Donald Trump -- Says High CEO Pay Is 'Disgraceful'
   Similarity Score: 0.5554



In [48]:
# Combine 'headline' and 'short_description' into a single text field
df['combined_text'] = df['headline'] + ' ' + df['short_description']

# Apply preprocessing to the combined text field
df['cleaned_combined_text'] = df['combined_text'].apply(preprocess_text)

# Fit and transform the cleaned combined text
X = vectorizer.fit_transform(df['cleaned_combined_text'])

In [50]:
def search_query_combined(query):
    # Preprocess and vectorize the query
    query_cleaned = preprocess_text(query)
    query_vector = vectorizer.transform([query_cleaned])
    
    # Compute cosine similarity between query and all combined text
    similarities = cosine_similarity(query_vector, X)
    
    # Get the indices of the top 10 most similar entries
    top_indices = similarities[0].argsort()[-10:][::-1]
    
    # Retrieve the top 10 similar headlines, their short descriptions, and similarity scores
    top_results = df.iloc[top_indices][['headline', 'short_description']]
    top_scores = similarities[0][top_indices]
    
    return top_results, top_scores

In [55]:
# Prompt for a query and run it through search_query_combined()
user_query = input("Enter your search query: ")

top_results, top_scores = search_query_combined(user_query)

# Report each hit with its 1-based rank, description and cosine similarity
print("\nTop 10 similar headlines and short descriptions with similarity scores: \n")
ranked_hits = zip(top_results['headline'], top_results['short_description'], top_scores)
for rank, (title, summary, similarity) in enumerate(ranked_hits, start=1):
    print(f"{rank}. Headline: {title}\n   Short Description: {summary}\n   Similarity Score: {similarity:.4f}\n")

Enter your search query:  Donald Trump



Top 10 similar headlines and short descriptions with similarity scores: 

1. Headline: You Can Be Donald Trump too
   Short Description: 
   Similarity Score: 1.0000

2. Headline: Which Donald Trump Are You?
   Short Description: And you thought there was only one "The Donald."
   Similarity Score: 0.8193

3. Headline: Donald Trump: 'Nobody Has More Respect For Women Than Donald Trump'
   Short Description: Right.
   Similarity Score: 0.7651

4. Headline: Bobby Jindal Rails Against 'Egomaniac' Donald Trump
   Short Description: "Donald Trump is for Donald Trump. He believes in nothing other than himself."
   Similarity Score: 0.6254

5. Headline: Donald Trump does not have a campaign
   Short Description: Donald Trump is a candidate without a campaign – and it’s becoming a serious problem.
   Similarity Score: 0.6143

6. Headline: The Five Reasons Donald Trump Will Be Our Next President
   Short Description: 
   Similarity Score: 0.6116

7. Headline: Don King Uses The N-Word In Speech