# How similar are document titles to bodies?

### Load packages, define funcs

In [22]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
from nltk.tokenize import sent_tokenize
import numpy as np

from tqdm import tqdm
tqdm.pandas()

import pandas as pd

import boilerpy3
from boilerpy3 import extractors
extractor = extractors.ArticleExtractor()

In [61]:
pd.set_option("max_colwidth", 500)

In [36]:
def embed_things(text):
    sents = sent_tokenize(text)
    if len(sents) == 1:
        embedding = model.encode(sents[0])
    else:
        embedding = np.mean([model.encode(x) for x in sents], axis=0)
        
    return embedding

## Prove the idea, on a single document

In [None]:
doc = extractor.get_doc_from_url(r"https://www.nbcnews.com/news/us-news/ohio-derailment-waste-prompts-health-concerns-far-away-rcna72987")

In [13]:
title = doc.title
content = doc.content

In [14]:
content = content.replace("\n", " ")
content = content.replace("\xa0", " ")

In [73]:
print("Title:", title,"\n")
print("Body Text:", content[:250]+"...")

Title: Waste from Ohio derailment prompts health concerns up to 1,300 miles away 

Body Text: March 1, 2023, 11:57 PM UTC By Elizabeth Chuck , Gabe Gutierrez and Halle Lukasiewicz EAST LIVERPOOL, Ohio — For 30 years, this small city along the Ohio River has been home to the Heritage Thermal Services incinerator, a controversial hazardous wast...


In [38]:
content_embedding = embed_things(content)
title_embedding = embed_things(title)

In [43]:
util.cos_sim(title_embedding, content_embedding)[0]

tensor([0.4838])

## Cool, let's apply it to a full dataset

In [63]:
df = pd.read_csv(r"D:\Work\Data\medium_articles.csv")
df = df.sample(10000)

In [64]:
df['title_emb'] = df['title'].progress_apply(embed_things)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:50<00:00, 90.45it/s]


In [65]:
df['body_emb'] = df['text'].progress_apply(embed_things)

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [1:45:08<00:00,  1.59it/s]


In [66]:
df['title similarity to document'] = df.apply(lambda x: float(util.cos_sim(x.title_emb, x.body_emb)[0]), axis=1)

In [68]:
df[['title', 'url','title similarity to document']].sort_values(by='title similarity to document', ascending=False).head(50)

Unnamed: 0,title,url,title similarity to document
111726,Exactly Ryan! Thanks for taking the time to respond :),https://medium.com/@lizporter2019/exactly-ryan-thanks-for-taking-the-time-to-respond-c1141b6620f8,1.0
112338,I want to be part of Writers’ Blokke. @youdecode,https://medium.com/@youdecode/i-want-to-be-part-of-writers-blokke-youdecode-81fd0c90fb0e,1.0
76628,I’m just gonna write down answers to prompts that I have. If that’s cool with everyone,https://medium.com/@lilycook80/im-just-gonna-write-down-answers-to-prompts-that-i-have-if-that-s-cool-with-everyone-6fe0c97bf344,1.0
120840,Code refactoring tools like Resharper help a lot in getting familiar with the new features of C#…,https://medium.com/@bvda.remote/code-refactoring-tools-like-resharper-help-a-lot-in-getting-familiar-with-the-new-features-of-c-74e05c98cd8,0.975105
106204,I have added you as a writer! Thanks for joining Relationship Stories. Happy writing,https://medium.com/@agneslaurens/i-have-added-you-as-a-writer-thanks-for-joining-relationship-stories-happy-writing-ca8522e72181,0.969273
119864,How to remove background in PicsArt | PicsArt tutorial | Sub Edit Official | 2020,https://medium.com/@subedit-com/how-to-remove-background-in-picsart-picsart-tutorial-sub-edit-official-2020-d4f0f41eb83d,0.953619
88577,Anti-Cancer Medicine Supplier Delhi | Pharma Distributors in South | Pharma Distributors in Delhi | Cancer Medicine in India,https://medium.com/@aarkpharma/anti-cancer-medicine-supplier-delhi-pharma-distributors-in-south-pharma-distributors-in-delhi-1ce2b06aef37,0.929857
47840,Low Vitamin D Levels as a Risk Factor for Greater Covid-19 Severity,https://medium.com/microbial-instincts/lack-of-vitamin-d-as-an-independent-risk-factor-for-covid-19-death-82365d0520fa,0.926729
64099,Here Are Seven Memes To Remind Us Of Key Lessons From History,https://medium.com/lessons-from-history/five-memes-to-remind-us-of-key-lessons-from-history-2f1c3a33462e,0.919422
18195,5 Useful JavaScript Time and Date Manipulation Libraries,https://medium.com/javascript-in-plain-english/here-are-5-useful-time-and-date-manipulation-libraries-4fc6ecb8220a,0.913972
