In [1]:
# Uncomment the line below if you are using Google Colab.
# %pip install -U sentence-transformers

In [2]:
# Third-party imports, grouped together: the sentence-embedding model class with
# its utility helpers, and pandas for tabular data.
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Allow up to 200 characters per cell so long headlines are not cut off when a
# DataFrame is displayed. The fully qualified option name is used because the
# shorthand 'max_colwidth' is resolved by pattern matching and can become
# ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 200)

# Load the `all-MiniLM-L6-v2` model used for every embedding in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

2023-12-05 14:39:42.699669: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Read "Resources/news_headlines.csv" (a path relative to the working directory) into a DataFrame.
# Later cells rely on its "headline" and "category" columns.
news_headlines_df = pd.read_csv("Resources/news_headlines.csv")
# Leave the DataFrame as the last expression so the notebook displays it.
news_headlines_df

Unnamed: 0,headline,category
0,How To Spend More Time With Your Family While Working Remotely,Business
1,NCAA Football Playoffs Should be Like the NFL,Sports
2,"Hacker Pleads Guilty To Stealing Over 100,000 Passwords for Reddit",Technology
3,Lawmakers Want To Boost School Funding To Address Teacher Walkouts,Politics
4,The Best Sub Shops in the Caribbean You Should Visit This Summer,Travel
5,The Dark Side Of The Bitcoin Mining,Technology
6,Treasury Secretary is Confirmed Today,Politics
7,The 5 Best Restaurants In The World,Travel
8,How to Build a Brand for Your Small Business,Business
9,NY Giants Quarterback Injured After Being Punched By Teammate,Sports


In [4]:
# Pull the "headline" column out of the DataFrame as a plain Python list,
# keeping the DataFrame's row order.
headline_column = news_headlines_df["headline"]
news_headlines = headline_column.to_list()
# Show the resulting list as the cell output.
news_headlines

['How To Spend More Time With Your Family While Working Remotely',
 'NCAA Football Playoffs Should be Like the NFL',
 'Hacker Pleads Guilty To Stealing Over 100,000 Passwords for Reddit',
 'Lawmakers Want To Boost School Funding To Address Teacher Walkouts',
 'The Best Sub Shops in the Caribbean You Should Visit This Summer',
 'The Dark Side Of The Bitcoin Mining',
 'Treasury Secretary is Confirmed Today',
 'The 5 Best Restaurants In The World',
 'How to Build a Brand for Your Small Business',
 'NY Giants Quarterback Injured After Being Punched By Teammate']

In [5]:
# Encode every known headline with the model loaded above.
# The result is iterated later one embedding at a time, in the same order as `news_headlines`.
news_headlines_embeddings = model.encode(news_headlines)

In [6]:
# The unseen headline that we want to categorize.
new_headline = "Top 10 Hacks for Traveling Like a Pro."
# Encode it with the same model. It is passed as a one-element list, so the result is
# presumably a batch containing a single embedding (the score is later read with [0][0]).
new_headline_embedding = model.encode([new_headline])

In [7]:
# Pair each known headline with its cosine similarity to the new headline.
# Every score is stored exactly as `util.cos_sim` returns it, so code that reads
# the tuples afterwards sees the same objects as before.
similarities = [
    (known_headline, util.cos_sim(known_embedding, new_headline_embedding))
    for known_headline, known_embedding in zip(news_headlines, news_headlines_embeddings)
]

# Rank the (headline, score) pairs from most similar to least similar.
similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)

In [9]:
# Show which headline is being categorized.
print(f"News headline to categorize: {new_headline}")
print()

# Build the headline -> category lookup once, rather than filtering the whole
# DataFrame with a boolean mask for every ranked headline. `drop_duplicates`
# keeps the first row of any repeated headline, so the first occurrence wins.
category_by_headline = (
    news_headlines_df
    .drop_duplicates(subset="headline")
    .set_index("headline")["category"]
    .to_dict()
)

# Walk the ranked (headline, score) pairs, numbering the ranks from 1.
for rank, (headline, similarity_score) in enumerate(similarities, start=1):
    # Look up the category recorded for this headline.
    category = category_by_headline[headline]
    # Print the rank, category, and the news headline.
    print(f"Rank {rank}: Category: {category}, Headline: {headline}")
    # The score is a two-dimensional result holding a single value; [0][0] extracts it.
    print(f"Similarity score: {similarity_score[0][0]}")
    print()

News headline to categorize: Top 10 Hacks for Traveling Like a Pro.

Rank 1: Category: Technology, Headline: Hacker Pleads Guilty To Stealing Over 100,000 Passwords for Reddit
Similarity score: 0.3083580732345581

Rank 2: Category: Travel, Headline: The 5 Best Restaurants In The World
Similarity score: 0.2657448947429657

Rank 3: Category: Travel, Headline: The Best Sub Shops in the Caribbean You Should Visit This Summer
Similarity score: 0.20851679146289825

Rank 4: Category: Business, Headline: How To Spend More Time With Your Family While Working Remotely
Similarity score: 0.15140998363494873

Rank 5: Category: Technology, Headline: The Dark Side Of The Bitcoin Mining
Similarity score: 0.11326367408037186

Rank 6: Category: Sports, Headline: NCAA Football Playoffs Should be Like the NFL
Similarity score: 0.09503547847270966

Rank 7: Category: Business, Headline: How to Build a Brand for Your Small Business
Similarity score: 0.061265695840120316

Rank 8: Category: Politics, Headline:

**Question:** What category is the new headline? 

**Answer:** "Travel".

**Question:** Why did you choose this category?

**Answer:** The headline "Top 10 Hacks for Traveling Like a Pro" is most similar to a "Technology" headline, most likely because "Hacks" resembles "Hacker". However, we should classify it as "Travel", since the second- and third-most-similar headlines are both "Travel" and the new headline is about traveling.