# How similar are document titles to bodies?

### Load packages and define helper functions

In [22]:
# Third-party imports
import boilerpy3
import numpy as np
import pandas as pd
from boilerpy3 import extractors
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Sentence-embedding model shared by the rest of the notebook
model = SentenceTransformer('all-MiniLM-L6-v2')

# boilerpy3 extractor used to fetch the title and content of a web page
extractor = extractors.ArticleExtractor()

In [61]:
# Show up to 500 characters per cell so long titles and URLs are not truncated
pd.options.display.max_colwidth = 500

In [36]:
def embed_things(text, encoder=None, splitter=None):
    """Embed a piece of text as the mean of its sentence embeddings.

    The text is split into sentences, all sentences are encoded in a single
    batched call, and the sentence vectors are averaged element-wise.

    Parameters
    ----------
    text : str
        Text to embed.
    encoder : optional
        Object exposing ``encode(list_of_str)`` that returns one vector per
        input string. Defaults to the notebook-level ``model``.
    splitter : optional
        Callable mapping a string to a list of sentence strings. Defaults to
        the ``sent_tokenize`` function imported by the notebook.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the sentence vectors (1-D when the encoder
        returns one 1-D vector per sentence).
    """
    if encoder is None:
        encoder = model
    if splitter is None:
        splitter = sent_tokenize

    sents = splitter(text)
    if not sents:
        # The splitter found no sentences (e.g. empty text). Embed the raw
        # text instead, so the caller still gets a real vector rather than the
        # NaN scalar that averaging an empty list would produce.
        sents = [text]

    # One batched encode call instead of one call per sentence.
    vectors = np.asarray(encoder.encode(sents))
    return vectors.mean(axis=0)

## Prove the idea on a single document

In [None]:
# Fetch and parse a single news article to try the approach on
article_url = r"https://www.nbcnews.com/news/us-news/ohio-derailment-waste-prompts-health-concerns-far-away-rcna72987"
doc = extractor.get_doc_from_url(article_url)

In [13]:
# Pull the headline and the extracted article text out of the parsed document
title, content = doc.title, doc.content

In [14]:
# Flatten newlines and non-breaking spaces into plain spaces
content = content.replace("\n", " ").replace("\xa0", " ")

In [73]:
# Preview the headline and the first 250 characters of the body
print(f"Title: {title} \n")
print(f"Body Text: {content[:250]}...")

Title: Waste from Ohio derailment prompts health concerns up to 1,300 miles away 

Body Text: March 1, 2023, 11:57 PM UTC By Elizabeth Chuck , Gabe Gutierrez and Halle Lukasiewicz EAST LIVERPOOL, Ohio — For 30 years, this small city along the Ohio River has been home to the Heritage Thermal Services incinerator, a controversial hazardous wast...


In [38]:
# Embed the article body first, then its title
content_embedding, title_embedding = (
    embed_things(piece) for piece in (content, title)
)

In [43]:
# Cosine similarity between the title embedding and the body embedding
title_body_similarity = util.cos_sim(title_embedding, content_embedding)
title_body_similarity[0]

tensor([0.4838])

## Cool, let's apply it to a full dataset

In [63]:
# Source CSV; the cells below read its `title`, `text` and `url` columns
DATA_PATH = r"D:\Work\Data\medium_articles.csv"
N_SAMPLES = 10000  # rows to keep; embedding article bodies is slow
SEED = 42  # fixed so the same rows are drawn on every run

df = pd.read_csv(DATA_PATH)
# Cap the sample size so a file with fewer rows than N_SAMPLES does not raise
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

In [64]:
# Embed every title, showing a progress bar
title_vectors = df['title'].progress_apply(embed_things)
df['title_emb'] = title_vectors

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:50<00:00, 90.45it/s]


In [65]:
# Embed every article body, showing a progress bar
body_vectors = df['text'].progress_apply(embed_things)
df['body_emb'] = body_vectors

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [1:45:08<00:00,  1.59it/s]


In [66]:
def _title_body_similarity(row):
    """Cosine similarity between one row's title and body embeddings, as a float."""
    return float(util.cos_sim(row.title_emb, row.body_emb)[0])

df['title similarity to document'] = df.apply(_title_body_similarity, axis=1)

In [74]:
# Title length in characters
title_lengths = df['title'].str.len()
df['title len'] = title_lengths

In [75]:
# Body length in characters
body_lengths = df['text'].str.len()
df['body len'] = body_lengths

In [77]:
# Title length relative to body length
df['title to body ratio'] = df['title len'].div(df['body len'])

## Highest similarity, no other constraints

In [86]:
# The 25 articles whose title is most similar to the body
display_cols = ['title', 'title similarity to document', 'title to body ratio', 'body len', 'url']
by_similarity = df.loc[:, display_cols].sort_values(by='title similarity to document', ascending=False)
by_similarity.head(25)

Unnamed: 0,title,title similarity to document,title to body ratio,body len,url
111726,Exactly Ryan! Thanks for taking the time to respond :),1.0,1.0,54,https://medium.com/@lizporter2019/exactly-ryan-thanks-for-taking-the-time-to-respond-c1141b6620f8
112338,I want to be part of Writers’ Blokke. @youdecode,1.0,1.0,48,https://medium.com/@youdecode/i-want-to-be-part-of-writers-blokke-youdecode-81fd0c90fb0e
76628,I’m just gonna write down answers to prompts that I have. If that’s cool with everyone,1.0,1.0,86,https://medium.com/@lilycook80/im-just-gonna-write-down-answers-to-prompts-that-i-have-if-that-s-cool-with-everyone-6fe0c97bf344
120840,Code refactoring tools like Resharper help a lot in getting familiar with the new features of C#…,0.975105,0.843478,115,https://medium.com/@bvda.remote/code-refactoring-tools-like-resharper-help-a-lot-in-getting-familiar-with-the-new-features-of-c-74e05c98cd8
106204,I have added you as a writer! Thanks for joining Relationship Stories. Happy writing,0.969273,0.988235,85,https://medium.com/@agneslaurens/i-have-added-you-as-a-writer-thanks-for-joining-relationship-stories-happy-writing-ca8522e72181
119864,How to remove background in PicsArt | PicsArt tutorial | Sub Edit Official | 2020,0.953619,0.18,450,https://medium.com/@subedit-com/how-to-remove-background-in-picsart-picsart-tutorial-sub-edit-official-2020-d4f0f41eb83d
88577,Anti-Cancer Medicine Supplier Delhi | Pharma Distributors in South | Pharma Distributors in Delhi | Cancer Medicine in India,0.929857,0.317949,390,https://medium.com/@aarkpharma/anti-cancer-medicine-supplier-delhi-pharma-distributors-in-south-pharma-distributors-in-delhi-1ce2b06aef37
47840,Low Vitamin D Levels as a Risk Factor for Greater Covid-19 Severity,0.926729,0.05049,1327,https://medium.com/microbial-instincts/lack-of-vitamin-d-as-an-independent-risk-factor-for-covid-19-death-82365d0520fa
64099,Here Are Seven Memes To Remind Us Of Key Lessons From History,0.919422,0.075123,812,https://medium.com/lessons-from-history/five-memes-to-remind-us-of-key-lessons-from-history-2f1c3a33462e
18195,5 Useful JavaScript Time and Date Manipulation Libraries,0.913972,0.173375,323,https://medium.com/javascript-in-plain-english/here-are-5-useful-time-and-date-manipulation-libraries-4fc6ecb8220a


## How similar are titles to body text on long documents?

In [85]:
# The 25 longest articles, longest first
display_cols = ['title', 'title similarity to document', 'title to body ratio', 'body len', 'url']
longest_first = df.loc[:, display_cols].sort_values(by='body len', ascending=False)
longest_first.head(25)

Unnamed: 0,title,title similarity to document,title to body ratio,body len,url
118642,Persian History — from 1000 BC to 2000 AD,0.539648,0.00041,100000,https://medium.com/@arash-monzavi-kia/persian-history-a-summary-from-3000-bc-to-2000-ad-57ae71909ecb
74055,"Sequencing the World’s Regulatory Information, w/ Manos SCHIZAS (#35)",0.420035,0.001161,59435,https://medium.com/aperture-hub/sequencing-the-worlds-regulatory-information-w-manos-schizas-35-c65e0d411404
22959,Preserving Our American Democracy in an Era of Repression and Regression,0.363527,0.00134,53745,https://21stcenturycivics.medium.com/preserving-our-american-democracy-in-an-era-of-repression-and-regression-c8918a7f6b79
94536,#Podcast #LeBreakdown When the Moroccan King normalises with israel: “Yawn”,0.468296,0.001435,52277,https://medium.com/@yasserlouati/podcast-lebreakdown-when-the-moroccan-king-normalises-with-israel-yawn-ad023d65bacd
86934,Myth-Making in the Age of Science: Costa Rica’s COVID-19 Narrative,0.691092,0.001279,51590,https://medium.com/@kenmorris-5247/myth-making-in-the-age-of-science-costa-ricas-covid-19-narrative-eea579a2bb51
41109,The Greatest 20th Century President: Jimmy Carter’s Accomplishments,0.642234,0.001332,50294,https://benklesc.medium.com/the-greatest-20th-century-president-jimmy-carters-accomplishments-938ea254b8a6
23667,Conversations with Communists,0.376526,0.000586,49466,https://medium.com/handwaving-freakoutery/conversations-with-communists-34069834ad77
13341,Adam Curry Interview with Ryan Dennis [Transcript],0.295657,0.001014,49290,https://medium.com/ico-alert/dench-musics-adam-curry-podcast-interview-transcript-11ac5100abf8
12366,No Fighting In This (Agile) Dojo with M. David Green,0.405987,0.001055,49284,https://medium.com/programming-leadership/no-fighting-in-this-agile-dojo-with-m-david-green-73a5018d8c65
174502,"Machine Learning, Trust, and the Whole Transparency Thing",0.672423,0.001161,49092,https://medium.com/@o.t.a.janssen/machine-learning-and-the-whole-transparency-thing-ac85577be382


## How similar are titles to body text on short documents?

In [84]:
# The 25 shortest articles, shortest first
display_cols = ['title', 'title similarity to document', 'title to body ratio', 'body len', 'url']
shortest_first = df.loc[:, display_cols].sort_values(by='body len', ascending=True)
shortest_first.head(25)

Unnamed: 0,title,title similarity to document,title to body ratio,body len,url
119557,Principal Components Regression (PRC) ใช้ PCA ในงาน Regression,0.046272,4.133333,15,https://medium.com/@lengyi/principal-components-regression-prc-99119862e35f
45979,禱告守望香港 - 抗疫爭戰。突發禱文 4. 突發禱文 4,0.017479,1.866667,15,https://medium.com/edens-core/%E7%A6%B1%E5%91%8A%E5%AE%88%E6%9C%9B%E9%A6%99%E6%B8%AF-%E6%8A%97%E7%96%AB%E7%88%AD%E6%88%B0-9ed1e4529734
68005,Mithat Bereket ile Yakın Tarihin Tanıklığı — Gez Göz Pusula Sergisi,0.095943,3.35,20,https://medium.com/pusulatv/mithat-bereket-ile-yak%C4%B1n-tarihin-tan%C4%B1kl%C4%B1%C4%9F%C4%B1-gez-g%C3%B6z-pusula-sergisi-42955d11ccb3
61586,El caso de Rocky Aoki,0.280547,1.05,20,https://medium.com/inteligencia-log%C3%ADstica/el-caso-de-rocky-aoki-435dabd9206f
64661,Die taz: Sebuah Catatan Harian (7-Habis) 27 November 1991: Akhir dari Sebuah Awal,0.10586,4.05,20,https://medium.com/literasi/die-taz-sebuah-catatan-harian-7-habis-27-november-1991-akhir-dari-sebuah-awal-426557d80a43
63364,Sevencoin Exchange 進展報告 03/04–03/10,0.239209,1.75,20,https://medium.com/7sevencoin/sevencoin-exchange-%E9%80%B2%E5%B1%95%E5%A0%B1%E5%91%8A-03-04-03-10-17fa7ef0cfb5
125900,CAR BATTERY PROBLEMS: WHEN IS IT TIME FOR A NEW BATTERY FULL INFO,0.084177,3.095238,21,https://medium.com/@ikhokher415/car-battery-problems-when-is-it-time-for-a-new-battery-full-info-62125fd13489
80856,Best smart speakers for the price,0.059862,1.571429,21,https://medium.com/@kelly79743208/best-smart-speakers-for-the-price-11057afcd0eb
68377,Is COVID19 a New Type of Bio-Energetic Disease State? We May Find Out the Hard Way March 2021.,0.174523,4.47619,21,https://medium.com/@nkalex/is-covid19-a-new-type-of-bio-energetic-disease-state-we-may-find-out-the-hard-way-march-2021-1078b2960938
101834,Los contratos inteligentes de IOTA ya están acá,0.191231,2.136364,22,https://medium.com/@animus-coop/los-contratos-inteligentes-de-iota-ya-est%C3%A1n-ac%C3%A1-7e229d5537fc
