In [2]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from urllib.parse import quote

import pandas as pd
import requests
from tqdm.notebook import tqdm

# Input and output CSV file names (relative to the working directory)
input_csv, output_csv = 'drug_docs_ph1to4_all.csv', 'drug_docs_with_refs.csv'

# Read the entire input table into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_csv)

In [3]:
def get_references(doi):
    """Fetch the DOIs cited by a work from the Crossref REST API.

    Parameters
    ----------
    doi : str or missing value
        DOI of the citing work. Missing values (anything ``pd.isna`` accepts)
        and blank strings are skipped without issuing a request.

    Returns
    -------
    str
        The DOIs found in the work's ``message.reference`` list, joined by a
        single space. ``''`` is returned when the DOI is missing or blank, the
        request fails or times out, the response status is not 200, the body
        is not the expected JSON shape, or no reference carries a DOI.

    Notes
    -----
    Lookup failures and "no references" both yield ``''``; callers cannot
    tell them apart from the return value alone.
    """
    if pd.isna(doi):
        return ''

    doi = str(doi).strip()
    if not doi:
        # A blank DOI would otherwise hit the /works/ listing endpoint.
        return ''

    try:
        # Percent-encode the DOI so characters such as '#' or '?' stay part of
        # the path instead of being parsed as a fragment/query delimiter.
        # '/' is left as-is (quote's default), matching the original URL shape.
        url = f'https://api.crossref.org/works/{quote(doi)}'
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return ''
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network errors, timeouts and undecodable JSON are treated as
        # "no references" (best effort). KeyboardInterrupt and genuine
        # programming errors are no longer swallowed.
        return ''

    # Validate the payload shape explicitly rather than relying on a
    # catch-all handler to hide unexpected structures.
    message = data.get('message') if isinstance(data, dict) else None
    refs = message.get('reference') if isinstance(message, dict) else None
    if not isinstance(refs, list):
        return ''

    dois = [str(ref['DOI']) for ref in refs if isinstance(ref, dict) and ref.get('DOI')]
    return ' '.join(dois)

def process_row(idx, doi):
    """Look up the references for one row and pair them with its index.

    Returns a ``(idx, references)`` tuple, where ``references`` is whatever
    ``get_references(doi)`` returns, so results gathered out of order can be
    matched back to their row.
    """
    references = get_references(doi)
    return (idx, references)

In [4]:
from math import ceil

batch_size = 200   # rows fetched per batch; each finished batch is appended to output_csv
max_workers = 16   # number of concurrent worker threads issuing lookups

# Detect existing output CSV to resume.
# Rows already in the output file are assumed to correspond positionally to the
# first rows of `df`, so the number of rows on disk is the next row to process.
# A zero-byte file (e.g. left by an interrupted first write) is treated as absent,
# because pd.read_csv raises on a completely empty file.
if os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
    df_out = pd.read_csv(output_csv)
    start_row = len(df_out)
    print(f"Resuming from row {start_row}")
else:
    df_out = pd.DataFrame(columns=df.columns.tolist() + ['references'])
    start_row = 0
    print("Starting from row 0")

num_rows = len(df)
num_batches = ceil(num_rows / batch_size)  # total batches over the whole input (informational)

if start_row > num_rows:
    # The checkpoint holds more rows than the input, so positional resume is meaningless.
    raise ValueError(
        f"{output_csv} has {start_row} rows but {input_csv} has only {num_rows}; "
        "the output file does not belong to this input"
    )

# Context managers guarantee the progress bar is closed and the worker threads are
# shut down even when the loop is interrupted. The pool is created once and reused
# for every batch instead of being rebuilt per batch.
with tqdm(total=num_rows, initial=start_row, unit='rows') as pbar, \
        ThreadPoolExecutor(max_workers=max_workers) as executor:
    for batch_start in range(start_row, num_rows, batch_size):
        batch_end = min(batch_start + batch_size, num_rows)
        batch_rows = df.iloc[batch_start:batch_end]

        futures = [
            executor.submit(process_row, i, doi)
            for i, doi in zip(batch_rows.index, batch_rows['doi'])
        ]

        results = []
        try:
            for future in as_completed(futures):
                results.append(future.result())
                pbar.update(1)
        except BaseException:
            # On interrupt or error, drop queued lookups so shutdown only waits
            # for requests that are already in flight, then re-raise.
            for pending in futures:
                pending.cancel()
            raise

        # Update DataFrame (results arrive in completion order, so write by index label)
        for idx, refs in results:
            df.at[idx, 'references'] = refs

        # Append batch to output CSV. Only the very first batch of a fresh run
        # creates the file and writes the header; every later batch appends.
        df_batch = df.iloc[batch_start:batch_end]
        is_first_write = batch_start == 0
        df_batch.to_csv(
            output_csv,
            index=False,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
        )

Resuming from row 135000


 69%|######8   | 135000/195654 [00:00<?, ?rows/s]

KeyboardInterrupt: 

In [5]:
import duckdb
import re

# Load patterns: one regular expression per non-blank line of the synonyms file
with open('hit_synonyms.txt', 'r') as f:
    patterns = [line.strip() for line in f if line.strip()]

# An empty alternation is the empty regex, which matches every title and would
# silently classify every paper as a hit paper.
if not patterns:
    raise ValueError('hit_synonyms.txt contains no patterns')

# Create regex pattern (alternation of all lines)
combined_pattern = '|'.join(patterns)

# Load data into DuckDB (fresh in-memory database)
conn = duckdb.connect()
conn.execute("CREATE TABLE papers AS SELECT * FROM read_csv_auto('drug_docs_with_refs.csv')")

# Find citation pairs: (hit paper, drug paper) rows for the same molregno where the
# drug paper's reference list contains the hit paper's DOI and the hit paper's
# title matches one of the patterns.
#
# - The regex is passed as a bound parameter (?) instead of being spliced into the
#   SQL text, so quotes inside a pattern cannot break or alter the statement.
# - References are split on spaces and compared as whole tokens. The previous
#   LIKE '%doi%' treated '_' and '%' inside DOIs as wildcards and also matched
#   when one DOI was merely a substring of another.
# - Both sides are lower-cased because DOIs are case-insensitive.
query = """
SELECT DISTINCT
    hit_paper.molregno as hit_molregno,
    hit_paper.doi as hit_paper,
    hit_paper.title as hit_title,
    drg_paper.molregno as drug_molregno,
    drg_paper.doi as drug_paper,
    drg_paper.title as drug_title
FROM papers drg_paper
JOIN papers hit_paper
    ON drg_paper.molregno = hit_paper.molregno
    AND list_contains(string_split(lower(drg_paper.references), ' '), lower(hit_paper.doi))
WHERE regexp_matches(hit_paper.title, ?, 'i')
    AND drg_paper.doi != hit_paper.doi
ORDER BY hit_paper.molregno
"""

citation_pairs = conn.execute(query, [combined_pattern]).df()
print(f'Found {len(citation_pairs)} citation pairs')
citation_pairs

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Found 2509 citation pairs


Unnamed: 0,hit_molregno,hit_paper,hit_title,drug_molregno,drug_paper,drug_title
0,115,10.1021/jm801400g,Use of acetylcholine binding protein in the se...,115,10.1021/jm300030r,Synthesis and nicotinic receptor activity of c...
1,115,10.1021/jm801400g,Use of acetylcholine binding protein in the se...,115,10.1016/j.bmc.2011.08.028,Acetylcholine binding protein (AChBP) as templ...
2,115,10.1021/jm801400g,Use of acetylcholine binding protein in the se...,115,10.1021/jm100834y,Surface plasmon resonance biosensor based frag...
3,115,10.1021/jm801400g,Use of acetylcholine binding protein in the se...,115,10.1016/j.bmcl.2011.12.008,"Structure-based design, synthesis and structur..."
4,146,10.1016/j.ejmech.2016.06.042,Engineering another class of anti-tubercular l...,146,10.1016/j.ejmech.2020.112967,An appraisal of anti-mycobacterial activity wi...
...,...,...,...,...,...,...
2504,374052,10.1021/jm970530e,Physicochemical high throughput screening: par...,374052,10.1021/jm060230+,Parallel artificial membrane permeability assa...
2505,374052,10.1021/jm970530e,Physicochemical high throughput screening: par...,374052,10.1021/jm0309001,Surface activity profiling of drugs applied to...
2506,374052,10.1021/jm001101a,Experimental and computational screening model...,374052,10.1016/j.bmc.2007.03.040,QSAR study on permeability of hydrophobic comp...
2507,374052,10.1021/jm970530e,Physicochemical high throughput screening: par...,374052,10.1016/j.bmc.2009.08.022,Computational modeling of novel inhibitors tar...
