In [65]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import pandas as pd
from tqdm.auto import tqdm
import re

page = 0
stop_articles = 100
all_articles = []
urls_visited = set()

pbar = tqdm(total=stop_articles)
while True:
    url = f"https://www.broadinstitute.org/news?page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    articles = soup.find_all("a", href=True)

    for article in articles:
        if article["href"].startswith("/news/"):
            url = f"https://www.broadinstitute.org{article['href']}"
            if url in urls_visited:
                continue
            urls_visited.add(url)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            title = soup.find("div", class_="hero-section__title").span.h1.text
            author = soup.find("div", class_="hero-section__author").div.text
            date = datetime.strptime(soup.find("div", class_="hero-section__date").find("time").get("datetime"), "%Y-%m-%dT%H:%M:%S%z").strftime("%Y-%m-%d")
            try:
                article = soup.find("div", {"class": re.compile(r"clearfix text-formatted field field--name-field-text field--type-text-long field--label-hidden field__item")})
                all_articles.append({"title": title, "author": author, "date": date, "article": article.text})
                pbar.update(1)
            except Exception as e:
                texts = soup.find_all("div", {"class": re.compile(r"clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item")})
                for text in texts:
                    if text.find("div", class_="summary-only") is None:
                        all_articles.append({"title": title.strip(), "author": author.strip(), "date": date, "article": text.text.strip()})
                        pbar.update(1)
                        break

    page += 1
    if len(all_articles) >= stop_articles:
        break

df = pd.DataFrame(all_articles)
df

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,title,author,date,article
0,A therapy candidate for fatal prion diseases t...,"\n \n \n By Greta Friar, Whitehead In...",2024-06-27,Prion diseases lead to rapid neurodegeneration...
1,Simple test for flu could improve diagnosis an...,\n \n \n By Allessandra DiCorato\n,2024-06-18,Fewer than one percent of people who get the f...
2,Gut microbiome changes align with increased ri...,"\n \n \n By Jennifer Welsh, Brigham a...",2024-06-25,The largest and most ethnically and geographic...
3,#WhyIScience Q&A: A biochemist uses mass spect...,\n \n \n By Claire Hendershot\r\n\n,2024-06-11,"When Moe Haines first moved from Beirut, Leban..."
4,Improved prime editing system makes gene-sized...,\n \n \n By Allessandra DiCorato\n,2024-06-10,Scientists at the Broad Institute of MIT and H...
...,...,...,...,...
98,Alumni of the Broad’s summer research programs...,\n \n \n By Broad Communications\r\n\n,2023-05-24,The alumni of the Broad Institute’s summer res...
99,Broad’s summer research programs for high scho...,\n \n \n By Allessandra DiCorato\n,2023-05-24,High school junior Michelle Mantilla was ready...
100,"Cells can use uridine, a component of RNA, as ...",\n \n \n By Allessandra DiCorato\n,2023-05-17,"Our bodies burn carbohydrates, proteins, and f..."
101,Viruses in the guts of centenarians may help t...,\n \n \n By Allessandra DiCorato\n,2023-05-15,New research suggests that centenarians — peop...


In [66]:
# attributes we need: title, description, keywords, dataset type, collection period, organism, genes, tissue/cell type, condition, technique, instrument platform, software, usage restrictions, related datasets
# use LLM to extract these attributes from the title, abstract, and keywords

from keybert import KeyBERT

model = KeyBERT('distilbert-base-nli-mean-tokens')

def extract_keywords(text):
    keywords = model.extract_keywords(text)
    # remove confidence
    keywords = [keyword[0] for keyword in keywords]
    return keywords

# extract attributes from the title, abstract, and keywords
df = df.dropna(subset="article")
df["title_attributes"] = df["title"].apply(extract_keywords)
df["article_attributes"] = df["article"].apply(extract_keywords)
df

Unnamed: 0,title,author,date,article,title_attributes,article_attributes
0,A therapy candidate for fatal prion diseases t...,"\n \n \n By Greta Friar, Whitehead In...",2024-06-27,Prion diseases lead to rapid neurodegeneration...,"[therapy, disease, diseases, fatal, candidate]","[neurodegenerative, neurodegeneration, scienti..."
1,Simple test for flu could improve diagnosis an...,\n \n \n By Allessandra DiCorato\n,2024-06-18,Fewer than one percent of people who get the f...,"[flu, surveillance, diagnosis, improve, test]","[influenza, influenzas, scientists, clinicians..."
2,Gut microbiome changes align with increased ri...,"\n \n \n By Jennifer Welsh, Brigham a...",2024-06-25,The largest and most ethnically and geographic...,"[diabetes, microbiome, increased, changes, risk]","[diabetes, bacteriophages, probiotics, genetic..."
3,#WhyIScience Q&A: A biochemist uses mass spect...,\n \n \n By Claire Hendershot\r\n\n,2024-06-11,"When Moe Haines first moved from Beirut, Leban...","[biochemist, cancer, spectrometry, proteins, m...","[biochemistry, startup, biology, genomics, pha..."
4,Improved prime editing system makes gene-sized...,\n \n \n By Allessandra DiCorato\n,2024-06-10,Scientists at the Broad Institute of MIT and H...,"[therapeutic, improved, editing, edits, gene]","[biotech, biomedical, scientists, harvard, med..."
...,...,...,...,...,...,...
98,Alumni of the Broad’s summer research programs...,\n \n \n By Broad Communications\r\n\n,2023-05-24,The alumni of the Broad Institute’s summer res...,"[alumni, professional, summer, personal, resea...","[scientists, alumni, universities, scholars, s..."
99,Broad’s summer research programs for high scho...,\n \n \n By Allessandra DiCorato\n,2023-05-24,High school junior Michelle Mantilla was ready...,"[scientists, students, summer, college, 300]","[harvard, biology, diabetes, scientists, mexican]"
100,"Cells can use uridine, a component of RNA, as ...",\n \n \n By Allessandra DiCorato\n,2023-05-17,"Our bodies burn carbohydrates, proteins, and f...","[rna, uridine, cells, energy, component]","[diabetes, cancer, cancers, obesity, atp]"
101,Viruses in the guts of centenarians may help t...,\n \n \n By Allessandra DiCorato\n,2023-05-15,New research suggests that centenarians — peop...,"[viruses, pathogens, help, centenarians, guts]","[microbiology, bacteria, bacteriophages, immun..."


In [71]:
df_dropped = df.copy()
df_dropped["Title"] = df["title"]
df_dropped["Description"] = df["article"].replace(r"\n", " ", regex=True).replace(r"\s+", " ", regex=True)
df_dropped["Keywords"] = (df["title_attributes"] + df["article_attributes"]).apply(lambda x: " ".join(list(set(word.replace(" ", "_") for word in x))) + " broad news")
df_dropped["Dataset Type"] = ""
df_dropped["Collection Period"] = df["date"].astype(str)
df_dropped["Organism"] = ""
df_dropped["Genes"] = ""
df_dropped["Tissue/Cell Type"] = ""
df_dropped["Condition"] = ""
df_dropped["Technique"] = ""
df_dropped["Instrument Platform"] = ""
df_dropped["Software"] = ""
df_dropped["Usage Restrictions"] = ""
df_dropped["Related Datasets"] = ""
df_dropped = df_dropped[['Title', 'Description', 'Keywords', 'Dataset Type', 'Collection Period', 'Organism', 'Genes', 'Tissue/Cell Type', 'Condition', 'Technique', 'Instrument Platform', 'Software', 'Usage Restrictions', 'Related Datasets']]
df_dropped

Unnamed: 0,Title,Description,Keywords,Dataset Type,Collection Period,Organism,Genes,Tissue/Cell Type,Condition,Technique,Instrument Platform,Software,Usage Restrictions,Related Datasets
0,A therapy candidate for fatal prion diseases t...,Prion diseases lead to rapid neurodegeneration...,candidate neurodegenerative diseases fatal bio...,,2024-06-27,,,,,,,,,
1,Simple test for flu could improve diagnosis an...,Fewer than one percent of people who get the f...,influenza diagnosis improve influenzas surveil...,,2024-06-18,,,,,,,,,
2,Gut microbiome changes align with increased ri...,The largest and most ethnically and geographic...,increased bacteria microbiome diabetes bacteri...,,2024-06-25,,,,,,,,,
3,#WhyIScience Q&A: A biochemist uses mass spect...,"When Moe Haines first moved from Beirut, Leban...",proteins genomics biology spectrometry cancer ...,,2024-06-11,,,,,,,,,
4,Improved prime editing system makes gene-sized...,Scientists at the Broad Institute of MIT and H...,edits therapeutic editing gene biotech medicin...,,2024-06-10,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Alumni of the Broad’s summer research programs...,The alumni of the Broad Institute’s summer res...,research professional alumni summer scholars s...,,2023-05-24,,,,,,,,,
99,Broad’s summer research programs for high scho...,High school junior Michelle Mantilla was ready...,college biology 300 summer harvard students di...,,2023-05-24,,,,,,,,,
100,"Cells can use uridine, a component of RNA, as ...","Our bodies burn carbohydrates, proteins, and f...",cancers cells rna cancer obesity diabetes atp ...,,2023-05-17,,,,,,,,,
101,Viruses in the guts of centenarians may help t...,New research suggests that centenarians — peop...,immunology viruses help pathogens bacteria bac...,,2023-05-15,,,,,,,,,


In [72]:
df_dropped.to_csv("broad_newsletter.csv", index=False)

In [73]:
df_dropped["Filename"] = ""
df_dropped["User"] = df["author"].str.strip().str.replace(r"^\s*By\s+", "", regex=True)
df_dropped["Post ID"] = df_dropped.index + len(pd.read_csv("posts.tsv", sep="\t")) + 1
df_dropped["Likes"] = 0
df_dropped.to_csv("broad_newsletter_post.tsv", index=False, sep="\t")

In [74]:
import numpy as np

len(np.unique([keyword for keyword_list in df_dropped["Keywords"].str.split(" ").values for keyword in keyword_list]))

469