In [78]:
import datetime
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

df_list = []
for season in ["fall", "spring"]:
    for year in range(2010, 2025):
        url_base = "https://www.broadinstitute.org"
        url = f"{url_base}/talks/{season}-{year}/mia"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to get {url}")
            continue
        soup = BeautifulSoup(response.text, "html.parser")

        talks = []
        row = 0
        for link in soup.find_all("td", {'headers': re.compile(r'view-.*')}):
            if row == len(talks):
                talks.append({})
            if 'talks-date' in link.get('headers')[0]:
                talks[row]["date"] = link.text.strip()
            elif 'field-speaker' in link.get('headers')[0]:
                talks[row]["speakers"] = [speaker_affil.split("\n\n")[0] for speaker_affil in link.text.strip().split("\n\n\n \n")]
            elif 'title' in link.get('headers')[0]:
                talks[row]["title"] = link.text.strip().rstrip("\n [Video]")
                talks[row]["link"] = url_base + link.find("a").get("href")
                row += 1

        df = pd.DataFrame(talks)

        def get_abstract(link):
            response = requests.get(link)
            soup = BeautifulSoup(response.text, "html.parser")
            abstract = soup.find("div", class_="block block-layout-builder block-field-blocknodetalksbody")
            if abstract is not None:
                return abstract.text.strip()
            else:
                return None

        df["abstract"] = df["link"].apply(get_abstract)
        df["date"] = pd.to_datetime(df["date"], format="%b %d")
        df["date"] = df["date"].apply(lambda x: x.replace(year=year))
        df_list.append(df)

df = pd.concat(df_list)
df = df.sort_values("date")
df

Failed to get https://www.broadinstitute.org/talks/fall-2010/mia
Failed to get https://www.broadinstitute.org/talks/fall-2011/mia
Failed to get https://www.broadinstitute.org/talks/fall-2012/mia
Failed to get https://www.broadinstitute.org/talks/fall-2013/mia
Failed to get https://www.broadinstitute.org/talks/fall-2014/mia
Failed to get https://www.broadinstitute.org/talks/fall-2015/mia
Failed to get https://www.broadinstitute.org/talks/fall-2016/mia
Failed to get https://www.broadinstitute.org/talks/fall-2017/mia
Failed to get https://www.broadinstitute.org/talks/fall-2018/mia
Failed to get https://www.broadinstitute.org/talks/fall-2024/mia
Failed to get https://www.broadinstitute.org/talks/spring-2010/mia
Failed to get https://www.broadinstitute.org/talks/spring-2011/mia
Failed to get https://www.broadinstitute.org/talks/spring-2012/mia
Failed to get https://www.broadinstitute.org/talks/spring-2013/mia
Failed to get https://www.broadinstitute.org/talks/spring-2014/mia
Failed to get h

Unnamed: 0,date,speakers,title,link,abstract
0,2019-02-14,[Pascal Notin],Hybrid protein language models for fitness pre...,https://www.broadinstitute.org/talks/fitness-m...,The ability to accurately model the fitness la...
1,2019-02-14,"[Noor Youssef, Sarah Faye Gurev]",Unsupervised viral antibody escape prediction ...,https://www.broadinstitute.org/talks/using-pas...,Effective pandemic preparedness relies on pred...
2,2019-03-06,"[Sandeep Kambhampati, Philipp Schneider, Kai C...",Postdoc flash talks,https://www.broadinstitute.org/talks/postdoc-f...,
3,2019-03-13,[Žiga Avsec],Accurate proteome-wide missense variant effect...,https://www.broadinstitute.org/talks/alpha-mis...,The vast majority of missense variants observe...
4,2019-03-13,[Jun Cheng],Alpha Missens,https://www.broadinstitute.org/talks/alpha-mis...,The vast majority of missense variants observe...
...,...,...,...,...,...
12,2024-05-01,[Simon Kozlov],Combining protein language and structure model...,https://www.broadinstitute.org/talks/combining...,All known organisms need all 20 canonical amin...
13,2024-05-08,[Matthew McPartlon],"Protein Design with Deep Learning: Progress, C...",https://www.broadinstitute.org/talks/protein-d...,The human proteome comprises tens of thousands...
14,2024-05-08,[Joshua Meier],Unlocking Generative AI for Drug Discovery wit...,https://www.broadinstitute.org/talks/tbd-10,Generative AI has the potential to greatly inc...
15,2024-05-29,[Marinka Zitnik],Geometric deep learning and generative models ...,https://www.broadinstitute.org/talks/tbd-7,Computational therapeutic target discovery req...


In [79]:
# attributes we need: title, description, keywords, dataset type, collection period, organism, genes, tissue/cell type, condition, technique, instrument platform, software, usage restrictions, related datasets
# use LLM to extract these attributes from the title, abstract, and keywords

from keybert import KeyBERT

model = KeyBERT('distilbert-base-nli-mean-tokens')

def extract_keywords(text):
    keywords = model.extract_keywords(text)
    # remove confidence
    keywords = [keyword[0] for keyword in keywords]
    return keywords

# extract attributes from the title, abstract, and keywords
df = df.dropna(subset="abstract")
df["title_attributes"] = df["title"].apply(extract_keywords)
df["abstract_attributes"] = df["abstract"].apply(extract_keywords)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title_attributes"] = df["title"].apply(extract_keywords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["abstract_attributes"] = df["abstract"].apply(extract_keywords)


Unnamed: 0,date,speakers,title,link,abstract,title_attributes,abstract_attributes
0,2019-02-14,[Pascal Notin],Hybrid protein language models for fitness pre...,https://www.broadinstitute.org/talks/fitness-m...,The ability to accurately model the fitness la...,"[protein, prediction, fitness, design, hybrid]","[biotherapeutic, hard, massive, proteins, fitn..."
1,2019-02-14,"[Noor Youssef, Sarah Faye Gurev]",Unsupervised viral antibody escape prediction ...,https://www.broadinstitute.org/talks/using-pas...,Effective pandemic preparedness relies on pred...,"[vaccines, antibody, viral, future, prediction]","[vaccines, vaccine, viruses, antibodies, antib..."
3,2019-03-13,[Žiga Avsec],Accurate proteome-wide missense variant effect...,https://www.broadinstitute.org/talks/alpha-mis...,The vast majority of missense variants observe...,"[prediction, accurate, proteome, alpha, missense]","[genetic, pathogenicity, pathogenic, genes, ge..."
4,2019-03-13,[Jun Cheng],Alpha Missens,https://www.broadinstitute.org/talks/alpha-mis...,The vast majority of missense variants observe...,"[missens, alpha]","[genetic, pathogenicity, pathogenic, genes, ge..."
5,2019-03-20,[Yiqun Chen],Testing data-driven hypotheses post-clustering,https://www.broadinstitute.org/talks/valid-hyp...,This primer talk is motivated by the practice ...,"[hypotheses, testing, clustering, data, post]","[biomedical, researchers, research, genes, rna]"
...,...,...,...,...,...,...,...
12,2024-05-01,[Simon Kozlov],Combining protein language and structure model...,https://www.broadinstitute.org/talks/combining...,All known organisms need all 20 canonical amin...,"[coli, protein, redesign, amino, alphabet]","[coli, proteinmpnn, genes, 20, protein]"
13,2024-05-08,[Matthew McPartlon],"Protein Design with Deep Learning: Progress, C...",https://www.broadinstitute.org/talks/protein-d...,The human proteome comprises tens of thousands...,"[protein, learning, design, deep, challenges]","[proteins, protein, evolutionary, evolution, p..."
14,2024-05-08,[Joshua Meier],Unlocking Generative AI for Drug Discovery wit...,https://www.broadinstitute.org/talks/tbd-10,Generative AI has the potential to greatly inc...,"[unlocking, drug, discovery, generative, models]","[libraries, drug, learning, intensive, screening]"
15,2024-05-29,[Marinka Zitnik],Geometric deep learning and generative models ...,https://www.broadinstitute.org/talks/tbd-7,Computational therapeutic target discovery req...,"[protein, learning, discovery, generative, geo...","[biology, biomolecular, oncology, proteins, pr..."


In [80]:
df_dropped = df.copy()
df_dropped["Title"] = df["title"]
df_dropped["Description"] = df["abstract"].replace(r"\n", " ", regex=True)
df_dropped["Keywords"] = (df["title_attributes"] + df["abstract_attributes"]).apply(lambda x: " ".join(list(set(word.replace(" ", "_") for word in x))) + " mia meeting")
df_dropped["Dataset Type"] = ""
df_dropped["Collection Period"] = df["date"].astype(str)
df_dropped["Organism"] = ""
df_dropped["Genes"] = ""
df_dropped["Tissue/Cell Type"] = ""
df_dropped["Condition"] = ""
df_dropped["Technique"] = ""
df_dropped["Instrument Platform"] = ""
df_dropped["Software"] = ""
df_dropped["Usage Restrictions"] = ""
df_dropped["Related Datasets"] = ""
df_dropped["Link"] = df["link"]
df_dropped = df_dropped[['Title', 'Description', 'Keywords', 'Dataset Type', 'Collection Period', 'Organism', 'Genes', 'Tissue/Cell Type', 'Condition', 'Technique', 'Instrument Platform', 'Software', 'Usage Restrictions', 'Related Datasets', 'Link']]
df_dropped

Unnamed: 0,Title,Description,Keywords,Dataset Type,Collection Period,Organism,Genes,Tissue/Cell Type,Condition,Technique,Instrument Platform,Software,Usage Restrictions,Related Datasets,Link
0,Hybrid protein language models for fitness pre...,The ability to accurately model the fitness la...,biotherapeutic fitness hybrid protein design p...,,2019-02-14,,,,,,,,,,https://www.broadinstitute.org/talks/fitness-m...
1,Unsupervised viral antibody escape prediction ...,Effective pandemic preparedness relies on pred...,antibodies viruses future vaccine prediction a...,,2019-02-14,,,,,,,,,,https://www.broadinstitute.org/talks/using-pas...
3,Accurate proteome-wide missense variant effect...,The vast majority of missense variants observe...,proteome genes genome alpha missense accurate ...,,2019-03-13,,,,,,,,,,https://www.broadinstitute.org/talks/alpha-mis...
4,Alpha Missens,The vast majority of missense variants observe...,genes genome alpha missens genetic pathogenici...,,2019-03-13,,,,,,,,,,https://www.broadinstitute.org/talks/alpha-mis...
5,Testing data-driven hypotheses post-clustering,This primer talk is motivated by the practice ...,genes data research post biomedical researcher...,,2019-03-20,,,,,,,,,,https://www.broadinstitute.org/talks/valid-hyp...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,Combining protein language and structure model...,All known organisms need all 20 canonical amin...,genes amino protein proteinmpnn coli redesign ...,,2024-05-01,,,,,,,,,,https://www.broadinstitute.org/talks/combining...
13,"Protein Design with Deep Learning: Progress, C...",The human proteome comprises tens of thousands...,proteome protein challenges deep design protei...,,2024-05-08,,,,,,,,,,https://www.broadinstitute.org/talks/protein-d...
14,Unlocking Generative AI for Drug Discovery wit...,Generative AI has the potential to greatly inc...,intensive learning unlocking screening models ...,,2024-05-08,,,,,,,,,,https://www.broadinstitute.org/talks/tbd-10
15,Geometric deep learning and generative models ...,Computational therapeutic target discovery req...,oncology protein learning biology biomolecular...,,2024-05-29,,,,,,,,,,https://www.broadinstitute.org/talks/tbd-7


In [81]:
df_dropped.to_csv("mia_talks.csv", index=False)

In [82]:
df_dropped["Filename"] = ""
df_dropped["User"] = df["speakers"].apply(lambda x: ", ".join(x))
df_dropped["Post ID"] = df_dropped.index + len(pd.read_csv("posts.tsv", sep="\t")) + 1
df_dropped["Likes"] = 0
df_dropped.to_csv("mia_talks_post.tsv", index=False, sep="\t")

In [83]:
import numpy as np

len(np.unique([keyword for keyword_list in df_dropped["Keywords"].str.split(" ").values for keyword in keyword_list]))

101