In [46]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
# Read the newline-delimited JSON dataset into `data`.
dataset_path = 'News_Category_Dataset_v3.json'
data = pd.read_json(dataset_path, lines=True)

In [48]:
# `pd.read_json` already returns a DataFrame, so take an explicit working copy.
# Re-wrapping with `pd.DataFrame(data)` does not copy by default, which would
# leave `df` and `data` tied to the same underlying data while later cells
# add columns to `df`.
df = data.copy()

In [49]:
# Combine headline and short_description into one text field for vectorization.
# Missing values are replaced with empty strings first: concatenating a string
# with NaN yields NaN, and a NaN document makes the TF-IDF vectorizer fail.
df['text'] = df['headline'].fillna('') + ' ' + df['short_description'].fillna('')


### USER INPUT

In [50]:
# Free-text query that the documents are ranked against
user_input: str = 'Donald Trump'

In [51]:
# Learn the TF-IDF vocabulary and weights from the corpus, then encode the
# query with the same fitted vectorizer so both share one feature space.
corpus = df['text']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
user_vector = vectorizer.transform([user_input])


In [52]:
# Score the single query vector against every document vector, then flatten
# the one-row result into a 1-D array with one similarity per document.
similarity_matrix = cosine_similarity(user_vector, tfidf_matrix)
cosine_similarities = similarity_matrix.flatten()


In [53]:
# Number of best matches to keep
TOP_N = 10

# Rank documents from most to least similar. Sorting the negated scores with a
# stable sort keeps tied documents in their original row order.
ranked_indices = (-cosine_similarities).argsort(kind='stable')

# Keep the best TOP_N, then drop documents whose similarity is zero: they have
# nothing in common with the query and would only pad the results arbitrarily.
top_indices = ranked_indices[:TOP_N]
top_indices = top_indices[cosine_similarities[top_indices] > 0]

# Copy the selected rows so adding the score column does not touch `df`.
top_similarities = df.iloc[top_indices].copy()
top_similarities['similarity'] = cosine_similarities[top_indices]


In [56]:
# Prepare results DataFrame: keep only the display columns and renumber the
# rows from 0. Chaining reset_index returns a new frame instead of mutating a
# column subset in place.
results_df = (
    top_similarities[['headline', 'short_description', 'similarity']]
    .reset_index(drop=True)
)

# Display results DataFrame
results_df

Unnamed: 0,headline,short_description,similarity
0,Don King Uses The N-Word In Speech Introducing...,“America needs Donald Trump. We need Donald Tr...,0.62997
1,Donald Trump: 'Nobody Has More Respect For Wom...,Right.,0.628576
2,You Can Be Donald Trump too,,0.616277
3,Here's What Obama Has To Say About Donald Trump,Where's The Donald?,0.615663
4,Which Donald Trump Are You?,"And you thought there was only one ""The Donald.""",0.573574
5,Donald Trump Agrees Hosting Golf Tournament On...,Even Donald Trump thinks people should be back...,0.562125
6,Bobby Jindal Rails Against 'Egomaniac' Donald ...,"""Donald Trump is for Donald Trump. He believes...",0.55671
7,"Donald Trump -- Yes, Donald Trump -- Says High...",This guy.,0.538259
8,Trump's Collapse,No one believes more in Donald Trump than Trum...,0.523514
9,Donald Trump does not have a campaign,Donald Trump is a candidate without a campaign...,0.505345
