In [1]:
# Mount Google Drive into the Colab filesystem so files kept on Drive can be
# read through ordinary paths under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from nltk.metrics import jaccard_distance
from Levenshtein import distance
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import euclidean_distances

import nltk
# `word_tokenize` needs the Punkt tokenizer data. Recent NLTK releases load it
# from the `punkt_tab` package while older ones use `punkt`, so fetch both.
# On a release whose index lacks one of the packages, `nltk.download` reports
# the problem and returns False rather than raising, so this is safe either way.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
!pip install Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Levenshtein
  Downloading Levenshtein-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/174.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein)
  Downloading rapidfuzz-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.21.0 rapidfuzz-3.0.0


# Data Loading

In [5]:
# Location of the JSON-lines dataset on the mounted Drive.
NEWS_DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/ML/News_Category_Dataset_v3.json'

# `lines=True`: the file holds one JSON record per line.
df = pd.read_json(NEWS_DATASET_PATH, lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [6]:
# Build the text that every similarity measure is run on: the headline and the
# short description joined by a single space. (Further cleaning may be needed
# depending on the dataset.)
# Missing values are replaced by '' first: `str + NaN` evaluates to NaN, and a
# NaN document would make TfidfVectorizer raise and break the `.lower()` calls
# in the later cells.
df['processed_text'] = (
    df['headline'].fillna('') + ' ' + df['short_description'].fillna('')
)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date,processed_text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,Woman Who Called Cops On Black Bird-Watcher Lo...


# TF-IDF Vectorization


In [7]:
# Learn the TF-IDF vocabulary / IDF weights from the combined text column and
# vectorise that same column in one pass (one matrix row per DataFrame row).
corpus_texts = df['processed_text']
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_texts)

In [8]:
# Query sentence that each similarity measure is compared against.
given_data = "It was a bad day."

# Project the query into the TF-IDF space already fitted on the corpus.
# `transform` expects an iterable of documents, hence the one-element list.
query_documents = [given_data]
given_data_tfidf = tfidf_vectorizer.transform(query_documents)

# Cosine Similarity

In [9]:
# Cosine similarity between the query vector and every corpus vector,
# flattened to a 1-D array with one score per row of `tfidf_matrix`.
cosine_similarities = cosine_similarity(given_data_tfidf, tfidf_matrix).flatten()

# Position of the best-scoring document. `argmax` finds it in a single pass
# (no need to sort every score just to read the last one) and, on ties,
# deterministically returns the first occurrence.
most_similar_index_cosine = int(cosine_similarities.argmax())

# Row of the DataFrame holding the most similar document.
most_similar_data_cosine = df.iloc[most_similar_index_cosine]

# Lower-cased token set of the query. Not used by the cosine computation
# itself; it is the query-side input of the Jaccard similarity step.
given_data_words = set(word_tokenize(given_data.lower()))

# Jaccard Similarity

In [10]:
# Lower-cased token set of the query. Recomputed here so that this cell only
# depends on `given_data` and `df`, not on a variable created as a by-product
# of the cosine-similarity cell.
given_data_words = set(word_tokenize(given_data.lower()))

# Jaccard similarity (1 - Jaccard distance) between the query token set and the
# token set of every document. Iterating the text column directly avoids
# `iterrows()`, which builds a full Series per row only to read one field.
jaccard_similarities = [
    1 - jaccard_distance(given_data_words, set(word_tokenize(text.lower())))
    for text in df['processed_text']
]

# Position of the highest similarity (first occurrence on ties).
most_similar_index_jaccard = pd.Series(jaccard_similarities).idxmax()

# Row of the DataFrame holding the most similar document.
most_similar_data_jaccard = df.iloc[most_similar_index_jaccard]


# Levenshtein Distance

In [11]:
# Lower-case the query once; `distance` compares strings character by
# character, so wrapping each text in `list(...)` is unnecessary and only
# allocates a throw-away list per document.
given_data_lower = given_data.lower()

# Edit distance between the query and every document (smaller = closer).
# NOTE: raw edit distance grows with the length difference of the two strings,
# so this measure favours documents about as short as the query regardless of
# their wording.
levenshtein_distances = [
    distance(given_data_lower, text.lower()) for text in df['processed_text']
]

# Position of the smallest distance (first occurrence on ties).
most_similar_index_levenshtein = pd.Series(levenshtein_distances).idxmin()

# Row of the DataFrame holding the closest document.
most_similar_data_levenshtein = df.iloc[most_similar_index_levenshtein]

# Comparing Similarities

In [16]:
# Report the best match found by each similarity measure.
# The header previously announced "four" algorithms although only three are
# computed and printed here.
similarity_results = [
    ("Cosine Similarity", most_similar_data_cosine),
    ("Jaccard Similarity", most_similar_data_jaccard),
    ("Levenshtein Distance", most_similar_data_levenshtein),
]

print("Most Similar Data based on three different Similarity Algorithms:\n")
# One "<label>:" line followed by the matched text, with a blank line between
# consecutive entries.
print("\n\n".join(
    f"{label}:\n{match['processed_text']}" for label, match in similarity_results
))

Most Similar Data based on four different Similarity Algorithms:

Cosine Similarity:
4 Ways To Beat Your Bad Day When something goes wrong, it’s tempting to write the whole day off as “bad.” But the ceiling of your day’s potential lies

Jaccard Similarity:
The Tan Suit It was a bad day to wear the tan suit.

Levenshtein Distance:
Times Sunday 
