# How similar are document titles to bodies?

### Load packages, define functions

In [22]:
# Third-party imports, grouped ahead of any object construction.
import boilerpy3
import numpy as np
import pandas as pd
from boilerpy3 import extractors
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Enables the `progress_apply` method used on pandas objects further down.
tqdm.pandas()

# Sentence-embedding model shared by every embedding call in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Article extractor used to pull the title and body text out of a web page.
extractor = extractors.ArticleExtractor()

In [61]:
# Show up to 500 characters per cell so long titles and URLs are not truncated.
# The fully-qualified option key is used because abbreviated keys are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option("display.max_colwidth", 500)

In [36]:
def embed_things(text):
    """Embed a piece of text as the mean of its sentence embeddings.

    The text is split into sentences with ``sent_tokenize``, all sentences are
    encoded by the module-level ``model`` in a single batched call, and the
    resulting vectors are averaged element-wise.

    Parameters
    ----------
    text : str or None or float
        The text to embed. ``None`` and float ``NaN`` (what pandas produces for
        an empty CSV cell) are treated as an empty string.

    Returns
    -------
    numpy.ndarray
        A 1-D embedding vector. For a single-sentence text this is simply that
        sentence's embedding.
    """
    # Missing values would otherwise reach the tokenizer as non-strings.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""

    # Empty / whitespace-only text tokenizes to zero sentences; averaging an
    # empty list would produce a NaN scalar instead of a vector, so fall back
    # to encoding the raw text as one "sentence".
    sents = sent_tokenize(text) or [text]

    # One batched encode call (one row per sentence) instead of one model
    # invocation per sentence; the mean over axis 0 collapses the rows.
    return np.mean(model.encode(sents), axis=0)

## Prove the idea on a single document

In [None]:
# Download one news article and extract it (requires network access).
doc = extractor.get_doc_from_url(
    r"https://www.nbcnews.com/news/us-news/ohio-derailment-waste-prompts-health-concerns-far-away-rcna72987"
)

In [13]:
# Pull the headline and the body text out of the extracted document.
title, content = doc.title, doc.content

In [14]:
# Flatten the body onto one line: newlines and non-breaking spaces become plain spaces.
content = content.replace("\n", " ").replace("\xa0", " ")

In [38]:
# Embed the body and the headline with the same sentence-averaging function.
content_embedding, title_embedding = embed_things(content), embed_things(title)

In [43]:
# Cosine similarity between the headline embedding and the body embedding.
util.cos_sim(
    title_embedding,
    content_embedding,
)[0]

tensor([0.4838])

## Cool, let's apply it to a full dataset

In [48]:
# NOTE(review): machine-specific absolute path — point this at your local copy of the dataset.
df = pd.read_csv(r"D:\Work\Data\medium_articles.csv")
# Work on a 100-article sample to keep embedding time short. The fixed seed makes
# the sample (and every similarity score derived from it) reproducible on re-run.
df = df.sample(100, random_state=42)

In [52]:
# Embed every article title, with a progress bar.
df["title_emb"] = df["title"].progress_apply(embed_things)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 115.75it/s]


In [53]:
# Embed every article body, with a progress bar.
df["body_emb"] = df["text"].progress_apply(embed_things)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:43<00:00,  2.29it/s]


In [57]:
# Per-row cosine similarity between the title embedding and the body embedding,
# unwrapped from the returned tensor into a plain Python float.
df["title similarity to document"] = df.apply(
    lambda row: float(util.cos_sim(row["title_emb"], row["body_emb"])[0]),
    axis=1,
)

In [62]:
# Rank the articles from most to least title/body similarity.
(
    df[["title", "url", "title similarity to document"]]
    .sort_values(by="title similarity to document", ascending=False)
)

Unnamed: 0,title,url,title similarity to document
95453,Why is wine so expensive in Thailand?,https://medium.com/@thaivisa/why-is-wine-so-expensive-in-thailand-7209d57b1b62,0.862271
31049,"UK Transport Costs are Rising, and Only Electric Vehicles can Save the Day",https://medium.com/energitokennews/uk-transport-costs-are-rising-and-only-electric-vehicles-can-save-the-day-eb787287642b,0.787915
32571,Safety Climate: A Brief Overview,https://medium.com/horizonperformance/safety-climate-a-brief-overview-aaaf3556ca14,0.787591
18316,Important update: buy and sell digital products on Ubcoin Market!,https://medium.com/ubcoin-blog/important-update-buy-and-sell-digital-products-on-ubcoin-market-a336cecca286,0.786431
164997,Dead Craze Or Just Starting: Expired Domain Finders,https://medium.com/hackernoon/dead-craze-or-just-starting-expired-domain-finders-60187897c4cf,0.758347
...,...,...,...
40189,How to Host the Holidays at an Airbnb Home,https://medium.com/airbnbmag/destination-gathering-10806732c81b,0.163656
81689,‘The Date’,https://medium.com/@srashtigupta/the-date-a0481b8a9ee8,0.149780
133406,Δεν ήταν μακρινός ο Παράδεισος..,https://medium.com/@efstratiospapanis/%CE%B4%CE%B5%CE%BD-%CE%AE%CF%84%CE%B1%CE%BD-%CE%BC%CE%B1%CE%BA%CF%81%CE%B9%CE%BD%CF%8C%CF%82-%CE%BF-%CF%80%CE%B1%CF%81%CE%AC%CE%B4%CE%B5%CE%B9%CF%83%CE%BF%CF%82-9ab378d42864,0.129821
133303,Trad,https://medium.com/@marcialiss17/trad-7944303bd1,0.103386
