# PubMed Query:

`(Wendell Lim[author] AND Francisco[affil]) OR (Lim WA[author] AND Francisco[affil])`

In [20]:
# Load the saved HTML page that lists every paper.
# The encoding is spelled out because open() otherwise falls back to the
# platform's locale encoding, which makes the notebook machine-dependent.
# NOTE(review): assumes papers.html is UTF-8 -- confirm if decoding fails.
with open('papers.html', encoding='utf-8') as f:
    html = f.read()


In [21]:
html_lines = html.split('\n')

# Break the page into one text block per paper.
# A line containing the substring 'title' opens a new block; every following
# line is appended to it until the next 'title' line or the end of the page.
# Lines that come before the first 'title' line belong to no block.
#
# This is a single forward pass, so there is no look-ahead index to run off the
# end of the list, and the final line of the page is kept in the last block.
organized_lines = []
current_parts = None  # pieces of the block being built; None until the first 'title' line

for line in html_lines:
    if 'title' in line:
        # Close the previous block (if any) before starting the next one.
        if current_parts is not None:
            organized_lines.append(''.join(current_parts))
        current_parts = [line]
    elif current_parts is not None:
        current_parts.append(line)

# Flush the block that was still open when the page ended.
if current_parts is not None:
    organized_lines.append(''.join(current_parts))


In [22]:
from bs4 import BeautifulSoup
import pandas as pd
# Extract a {'title', 'filename'} record from every block that links to a PDF.
papers = []

for line in organized_lines:
    soup = BeautifulSoup(line, 'html.parser')

    # A block is opened by any line containing the substring 'title', so it is
    # not guaranteed to hold a <div class="title">; skip it instead of crashing.
    title_div = soup.find('div', class_='title')
    if title_div is None:
        continue
    title = title_div.get_text()

    # The first anchor in the block is taken as the paper's link. Blocks with
    # no anchor, or an anchor without an href, have nothing to record.
    link = soup.find('a')
    href = link.get('href') if link is not None else None
    if not href:
        continue

    # Keep only the last path component, and only when it looks like a PDF.
    pdf = href.split('/')[-1]
    if 'pdf' in pdf:
        papers.append({'title': title, 'filename': pdf})


In [23]:
# Tabulate the scraped (title, filename) records and show how many there are
edf = pd.DataFrame(data=papers)
edf.shape[0]

64

In [24]:
# Load the PubMed data for the lab's papers into a DataFrame
df = pd.read_csv(filepath_or_buffer='limlab_papers_all.csv')


In [25]:
import re

def clean_title(title):
    """Normalise a title into a list of lowercase words without punctuation.

    Parameters
    ----------
    title : str
        The raw title. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as having no words.

    Returns
    -------
    list of str
        The words of the title in order of appearance; empty when ``title``
        is not a string or contains no word characters.
    """
    if not isinstance(title, str):
        return []
    # [^\w\s] matches every character that is neither a word character nor
    # whitespace, i.e. punctuation; it is deleted rather than replaced by a
    # space, so "T-cell" becomes the single word "tcell".
    return re.sub(r'[^\w\s]', '', title.lower()).split()


def compare_titles(title1, title2, threshold=0.8):
    """Decide whether two titles are similar enough to be the same paper.

    Both titles are reduced to their sets of distinct cleaned words. They match
    when the number of shared words is at least ``threshold`` times the size of
    the smaller set.

    Parameters
    ----------
    title1, title2 : str
        The titles to compare.
    threshold : float, optional
        Fraction of the shorter title's distinct words that must also occur in
        the other title. Defaults to 0.8.

    Returns
    -------
    bool
        True when the titles are considered a match. A title with no words
        never matches anything.
    """
    words1 = set(clean_title(title1))
    words2 = set(clean_title(title2))

    # Without this guard the required overlap would be 0 and an empty title
    # would "match" every other title.
    if not words1 or not words2:
        return False

    # Both sides of the comparison count distinct words. Measuring the overlap
    # (a set) against a length that counts repeats would let a title with
    # repeated words fail to match itself.
    shared = words1 & words2
    return len(shared) >= threshold * min(len(words1), len(words2))

In [26]:
# Copy the scraped PDF filename onto every PubMed record whose title matches.
#
# The column is created up front so that it exists even when nothing matches
# (a later cell reads row['filename']) and so that re-running this cell does
# not keep matches left over from a previous run.
df['filename'] = None

for i, row in df.iterrows():
    # A record whose Title is not text (e.g. a missing value) cannot be
    # compared by title, so it is left without a filename.
    if not isinstance(row['Title'], str):
        continue

    for _, jrow in edf.iterrows():
        if compare_titles(row['Title'], jrow['title']):
            # No break here: when several scraped titles match the same
            # record, the last matching one is the one that is kept.
            df.loc[i, 'filename'] = jrow['filename']


In [27]:
# Display the PubMed table, which now carries the 'filename' column filled in for matched titles
df

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal,Publication Year,Create Date,PMCID,NIHMS ID,DOI,filename
0,36608674,Harnessing synthetic biology to engineer organ...,"Trentesaux C, Yamada T, Klein OD, Lim WA.",Cell Stem Cell. 2023 Jan 5;30(1):10-19. doi: 1...,Trentesaux C,Cell Stem Cell,2023,1/7/23,,,10.1016/j.stem.2022.12.013,
1,36520915,Synthetic cytokine circuits that drive T cells...,"Allen GM, Frankel NW, Reddy NR, Bhargava HK, Y...",Science. 2022 Dec 16;378(6625):eaba1624. doi: ...,Allen GM,Science,2022,12/15/22,,,10.1126/science.aba1624,allen_2022.pdf
2,36509107,Programming Multicellular Assembly with Synthe...,"Stevens AJ, Harris AR, Gerdts J, Kim KH, Trent...",Nature. 2022 Dec 12. doi: 10.1038/s41586-022-0...,Stevens AJ,Nature,2022,12/12/22,,,10.1038/s41586-022-05622-z,stevens_2022.pdf
3,36480602,Decoding CAR T cell phenotype using combinator...,"Daniels KG, Wang S, Simic MS, Bhargava HK, Cap...",Science. 2022 Dec 16;378(6625):1194-1200. doi:...,Daniels KG,Science,2022,12/8/22,,,10.1126/science.abq0225,daniels_2022.pdf
4,36423287,The emerging era of cell engineering: Harnessi...,Lim WA.,Science. 2022 Nov 25;378(6622):848-852. doi: 1...,Lim WA,Science,2022,11/24/22,,,10.1126/science.add9665,lim_2022.pdf
...,...,...,...,...,...,...,...,...,...,...,...,...
108,10903934,Improving SH3 domain ligand selectivity using ...,"Nguyen JT, Porter M, Amoui M, Miller WT, Zucke...",Chem Biol. 2000 Jul;7(7):463-73. doi: 10.1016/...,Nguyen JT,Chem Biol,2000,7/25/00,,,10.1016/s1074-5521(00)00130-7,
109,10488080,PSD-95 assembles a ternary complex with the N-...,"Christopherson KS, Hillier BJ, Lim WA, Bredt DS.",J Biol Chem. 1999 Sep 24;274(39):27467-73. doi...,Christopherson KS,J Biol Chem,1999,9/17/99,,,10.1074/jbc.274.39.27467,
110,10338211,Structure of the enabled/VASP homology 1 domai...,"Prehoda KE, Lee DJ, Lim WA.",Cell. 1999 May 14;97(4):471-80. doi: 10.1016/s...,Prehoda KE,Cell,1999,5/25/99,,,10.1016/s0092-8674(00)80757-6,
111,10221915,Unexpected modes of PDZ domain scaffolding rev...,"Hillier BJ, Christopherson KS, Prehoda KE, Bre...",Science. 1999 Apr 30;284(5415):812-5.,Hillier BJ,Science,1999,4/30/99,,,,bjh_1999_comm.pdf


In [28]:
# Fallback PDF name for a paper: "<Publication Year>_<first word of First Author>.pdf"
def filename(row):
    """Build the alternative filename for one row of the PubMed table.

    ``row`` must provide 'Publication Year' and 'First Author'. The year is
    converted with ``str`` and only the text before the first space of the
    author entry is used.
    """
    year = str(row['Publication Year'])
    first_token = row['First Author'].split(' ')[0]
    return '_'.join([year, first_token]) + '.pdf'

# Apply filename() to each row to get one fallback name per paper
df['alt_filename'] = df.apply(filename, axis='columns')

In [29]:
!ls

1. Parse PDF Data.ipynb         limlab_papers_all.csv
2. Generate Jade from CSV.ipynb out.jade
final_papers copy.csv           papers.html
final_papers.csv                [34mpdfs[m[m


In [30]:
import glob
import os

# List the PDFs in the pdfs/ folder by bare file name.
# os.path.basename strips the directory part whatever its length or separator,
# so the result no longer depends on the folder name being five characters long.
filenames = [os.path.basename(path) for path in glob.glob('pdfs/*.pdf')]

In [31]:
# Decide which PDF, if any, exists in the folder for each paper: the scraped
# 'filename' is preferred, the generated 'alt_filename' is the fallback, and
# None marks papers for which neither name is present.
available = set(filenames)  # set membership instead of scanning the list for every row

# The answers are collected first and assigned as a whole column. Writing one
# cell at a time with df.loc would let the new column's dtype depend on which
# branch happens to run first, and would leave the column missing altogether
# for an empty table.
actual_files = []
for _, row in df.iterrows():
    if row['filename'] in available:
        actual_files.append(row['filename'])
    elif row['alt_filename'] in available:
        actual_files.append(row['alt_filename'])
    else:
        actual_files.append(None)

df['actual_file'] = actual_files

In [32]:
# Save the annotated table; index=False keeps the DataFrame index out of the file
df.to_csv(path_or_buf='final_papers.csv', index=False)