<a href="https://colab.research.google.com/github/AkhilaMadasthu/Data-Analysis-on-Job-Market/blob/main/PubMedA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install external libraries
!pip install xmltodict tqdm




In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem under /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import re
from tqdm import tqdm

In [None]:
# Step 1: Search PubMed (ESearch) for the PMIDs that match a query
query = "cancer immunotherapy"
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
esearch_params = {
    "db": "pubmed",
    "term": query,
    "retmode": "json",
    "retmax": 10,  # Increase this as needed
}
# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(esearch_url, params=esearch_params, timeout=30)
# Fail here with a clear HTTP error instead of an obscure JSON/KeyError below.
response.raise_for_status()
# A payload without "esearchresult"/"idlist" is treated as "no hits".
id_list = response.json().get("esearchresult", {}).get("idlist", [])
# Comma-separated PMIDs, the format EFetch expects for its "id" parameter.
id_string = ",".join(id_list)

In [None]:
# Step 2: Fetch article details using EFetch
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
efetch_params = {
    "db": "pubmed",
    "id": id_string,
    "retmode": "xml"
}
if id_string:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    efetch_response = requests.get(efetch_url, params=efetch_params, timeout=30)
    # Surface HTTP errors here rather than as an XML ParseError on an error page.
    efetch_response.raise_for_status()
    # Parse the raw bytes so the XML parser applies the encoding declared in the
    # document itself; `.text` relies on the encoding guessed by requests, which
    # can corrupt non-ASCII author names and affiliations.
    root = ET.fromstring(efetch_response.content)
else:
    # The search returned no PMIDs: skip the request and expose an empty
    # document so the parsing step simply finds no articles.
    efetch_response = None
    root = ET.Element("PubmedArticleSet")


In [None]:
# Step 3: Define matching function

# Keywords that hint at a pharmaceutical / biotech company affiliation.
_PHARMA_KEYWORDS = (
    "pharma", "pharmaceutical", "biotech", "biotechnology", "therapeutics",
    "inc", "ltd", "gmbh", "s.a.", "pvt", "company", "corporation",
    "biosciences", "biopharma", "lifesciences", "life sciences", "labs",
    "technology", "industries",
)


def _keyword_regex(keyword):
    """Return the regex fragment that matches one keyword as a whole word."""
    escaped = re.escape(keyword)
    if "." in keyword:
        # Dotted abbreviation: refuse to match inside a longer dotted
        # abbreviation, e.g. the "S.A." hidden in "U.S.A.".
        return r"(?<![a-z]\.)" + escaped
    # Plain word: tolerate a simple plural ("Pharmaceuticals", "Companys"...).
    return escaped + r"s?"


# One compiled, case-insensitive pattern. The surrounding look-arounds require
# the keyword to start and end at a word boundary, so "inc" no longer matches
# "Province" or "Cincinnati" and "pharma" no longer matches "Pharmacy".
_PHARMA_PATTERN = re.compile(
    r"(?<![a-z0-9])(?:"
    + "|".join(_keyword_regex(keyword) for keyword in _PHARMA_KEYWORDS)
    + r")(?![a-z0-9])",
    re.IGNORECASE,
)


def is_pharma_or_biotech(text):
    """Tell whether an affiliation string looks like a pharma/biotech company.

    The check is a keyword heuristic: it returns True when `text` contains at
    least one of the company keywords as a whole word (case-insensitive).
    Generic words such as "technology" still match academic names like
    "Institute of Technology", so results should be treated as candidates.

    Args:
        text: Affiliation text; may be empty or None.

    Returns:
        bool: True if a keyword is found, False otherwise (including for
        empty or None input).
    """
    if not text:
        return False
    return _PHARMA_PATTERN.search(text) is not None


In [None]:
# Step 4: Parse and filter articles
#
# For every PubmedArticle in `root`, collect the authors whose affiliation is
# flagged by is_pharma_or_biotech() and keep the article if at least one
# affiliation matched. The result is a list of flat dicts, one per article.

# Matches an e-mail address embedded in free-text affiliation strings.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")


def _node_text(node):
    """Return all text inside an XML element, including text nested in child tags.

    `Element.text` / `findtext` stop at the first child element, which truncates
    titles or affiliations that contain inline markup such as <i> or <sup>.
    """
    if node is None:
        return ""
    return "".join(node.itertext()).strip()


results = []

for article in root.findall(".//PubmedArticle"):
    pmid = article.findtext(".//PMID")
    title = _node_text(article.find(".//ArticleTitle")) or None

    # PubDate holds either a <Year> or a free-form <MedlineDate>; fall back to
    # the latter so the column never ends up as None.
    pub_date_node = article.find(".//PubDate")
    pub_date = "N/A"
    if pub_date_node is not None:
        pub_date = (
            pub_date_node.findtext("Year")
            or pub_date_node.findtext("MedlineDate")
            or "N/A"
        )

    non_academic_authors = []
    company_affiliations = []
    email = None

    for author in article.findall(".//Author"):
        last = author.findtext("LastName") or ""
        fore = author.findtext("ForeName") or ""
        # Group authors carry a CollectiveName instead of Fore/LastName.
        full_name = (
            f"{fore} {last}".strip()
            or (author.findtext("CollectiveName") or "").strip()
        )

        for aff in author.findall(".//AffiliationInfo/Affiliation"):
            aff_text = _node_text(aff)

            if is_pharma_or_biotech(aff_text):
                # De-duplicate while preserving order: an author may list several
                # matching affiliations, and co-authors often share one.
                if full_name and full_name not in non_academic_authors:
                    non_academic_authors.append(full_name)
                if aff_text not in company_affiliations:
                    company_affiliations.append(aff_text)

            # Find email (the last address seen in the article wins).
            email_match = EMAIL_PATTERN.search(aff_text)
            if email_match:
                # The pattern's final character class allows "." and "-", so it
                # swallows the sentence-ending period; strip that punctuation.
                email = email_match.group().rstrip(".-")

    # Keep the article when any affiliation matched, even if the matching
    # author had no usable name.
    if company_affiliations:
        results.append({
            "PubmedID": pmid,
            "Title": title,
            "Publication Date": pub_date,
            "Non-academic Author(s)": "; ".join(non_academic_authors),
            "Company Affiliation(s)": "; ".join(company_affiliations),
            "Corresponding Author Email": email if email else "N/A"
        })


In [None]:
# Step 5: Save to CSV
# Fix the column set explicitly: with an empty `results` list pandas would
# otherwise build a frame with no columns, and every later cell that selects
# "Company Affiliation(s)" would raise KeyError.
RESULT_COLUMNS = [
    "PubmedID",
    "Title",
    "Publication Date",
    "Non-academic Author(s)",
    "Company Affiliation(s)",
    "Corresponding Author Email",
]
df = pd.DataFrame(results, columns=RESULT_COLUMNS)
# index=False keeps the positional row index out of the file.
df.to_csv("pharma_affiliated_papers.csv", index=False)
df


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40747233,Systemic treatment of liver cancer: Current st...,2025,Chun-Bo Li; Yu-Ting Ning; Nai-Ying Shen; Ben W...,"Department of General Surgery, No. 215 Hospita...",luogangxueshu@163.com.
1,40747229,Molecular tumor boards in pancreatic cancer wi...,2025,Yan Yan; Jing Tang,"Department of Clinical Pharmacy, Beijing Tsing...",tj_0208@163.com.
2,40747115,Self-propelled gas nanomotor-integrated micron...,2025,Chungchi Lee; Shanghui Huang; Huiling Liu; Xin...,Key Laboratory of Biomaterials of Guangdong Hi...,
3,40747030,Second Generation Tiancimycin-Based Antibody-D...,2025,Alexander F Kiefer; Yuan Jin; Andrew D Steele;...,"Department of Chemistry, The Herbert Wertheim ...",


In [None]:
# Show only rows that carry a real company affiliation. Comparing against the
# literal string "None" alone never excludes missing or empty values, so treat
# NaN, blank strings and the "None"/"N/A" placeholders as "no affiliation".
affiliation_text = df["Company Affiliation(s)"].fillna("").astype(str).str.strip()
df[~affiliation_text.isin(["", "None", "N/A"])]


Unnamed: 0,PubmedID,Title,Publication Date,Non-academic Author(s),Company Affiliation(s),Corresponding Author Email
0,40747233,Systemic treatment of liver cancer: Current st...,2025,Chun-Bo Li; Yu-Ting Ning; Nai-Ying Shen; Ben W...,"Department of General Surgery, No. 215 Hospita...",luogangxueshu@163.com.
1,40747229,Molecular tumor boards in pancreatic cancer wi...,2025,Yan Yan; Jing Tang,"Department of Clinical Pharmacy, Beijing Tsing...",tj_0208@163.com.
2,40747115,Self-propelled gas nanomotor-integrated micron...,2025,Chungchi Lee; Shanghui Huang; Huiling Liu; Xin...,Key Laboratory of Biomaterials of Guangdong Hi...,
3,40747030,Second Generation Tiancimycin-Based Antibody-D...,2025,Alexander F Kiefer; Yuan Jin; Andrew D Steele;...,"Department of Chemistry, The Herbert Wertheim ...",


In [None]:
# Title / affiliation view of the rows that carry a real company affiliation.
# Comparing against the literal string "None" alone never excludes missing or
# empty values, so treat NaN, blank strings and the "None"/"N/A" placeholders
# as "no affiliation".
affiliation_text = df["Company Affiliation(s)"].fillna("").astype(str).str.strip()
df.loc[~affiliation_text.isin(["", "None", "N/A"]), ["Title", "Company Affiliation(s)"]]


Unnamed: 0,Title,Company Affiliation(s)
0,Systemic treatment of liver cancer: Current st...,"Department of General Surgery, No. 215 Hospita..."
1,Molecular tumor boards in pancreatic cancer wi...,"Department of Clinical Pharmacy, Beijing Tsing..."
2,Self-propelled gas nanomotor-integrated micron...,Key Laboratory of Biomaterials of Guangdong Hi...
3,Second Generation Tiancimycin-Based Antibody-D...,"Department of Chemistry, The Herbert Wertheim ..."
