In [2]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element
import pandas as pd


def _text_or_empty(parent: Element, path: str) -> str:
    """Return the text of the first element matching *path* under *parent*.

    Yields "" when no element matches or when the matched element has no text,
    so callers never have to deal with ``None``.
    """
    node = parent.find(path)
    if node is None or node.text is None:
        return ""
    return node.text


def get_author_info_from_article_num(root: Element, article_num: int) -> dict:
    """Extract article metadata and per-author details for one PubmedArticle.

    Args:
        root: Element under which the ``PubmedArticle`` elements are searched.
        article_num: Zero-based position of the article among all
            ``PubmedArticle`` descendants of *root*.

    Returns:
        A dict with the keys ``title``, ``pmid``, ``year``, ``keyword_list``,
        ``mesh_list`` and ``authors_info``. The three scalar fields are ""
        when the corresponding element is missing or empty. ``authors_info``
        is a list of dicts with the keys ``forename``, ``lastname``,
        ``initials``, ``identity`` (text of the GRID identifier, or "") and
        ``affiliation`` (list of affiliation texts, possibly empty).

    Raises:
        IndexError: If *article_num* is outside the range of available articles.
    """
    article = root.findall(".//PubmedArticle")[article_num]

    # Missing elements must not abort the extraction, so every scalar field
    # goes through the None-safe helper.
    title = _text_or_empty(article, ".//ArticleTitle")
    pmid = _text_or_empty(article, ".//PMID")
    # Read and guard the same element: the year comes from PubDate only.
    year = _text_or_empty(article, ".//PubDate/Year")

    keyword_list = [keyword.text for keyword in article.findall(".//KeywordList/Keyword")]

    mesh_list = [mesh.text for mesh in article.findall(".//MeshHeading/DescriptorName[@UI]")]

    authors_info = []

    for author in article.findall(".//AuthorList/Author"):
        authors_info.append({
            "forename": _text_or_empty(author, ".//ForeName"),
            "lastname": _text_or_empty(author, ".//LastName"),
            "initials": _text_or_empty(author, ".//Initials"),
            "identity": _text_or_empty(author, ".//AffiliationInfo/Identifier[@Source='GRID']"),
            "affiliation": [aff.text for aff in author.findall(".//Affiliation")],
        })

    return {
        "title": title,
        "pmid": pmid,
        "year": year,
        "keyword_list": keyword_list,
        "mesh_list": mesh_list,
        "authors_info": authors_info,
    }


def get_all_data_for_each_article(root, article_cap):
    """Build the list of article records for positions 0 .. article_cap - 1.

    Each record is whatever ``get_author_info_from_article_num`` returns for
    that position under *root*; the records are returned in positional order.
    A non-positive *article_cap* yields an empty list.
    """
    return [
        get_author_info_from_article_num(root, position)
        for position in range(article_cap)
    ]


def flatten_article_data(article_data):
    """Expand nested article records into one flat dict per affiliation.

    For every article, every author of that article, and every affiliation of
    that author, one row is produced that carries the article-level fields,
    the author-level fields and the single affiliation string. An author whose
    ``affiliation`` list is empty therefore contributes no rows, and neither
    does an article without authors.

    The list-valued fields (``keyword_list``, ``mesh_list``) are referenced,
    not copied, so all rows of one article share the same list objects.
    """
    article_keys = ("title", "pmid", "year", "keyword_list", "mesh_list")
    author_keys = ("forename", "lastname", "initials", "identity")

    def _rows():
        for record in article_data:
            for person in record["authors_info"]:
                for place in person["affiliation"]:
                    # Insertion order fixes the column order downstream:
                    # article fields, then author fields, then affiliation.
                    row = {key: record[key] for key in article_keys}
                    row.update((key, person[key]) for key in author_keys)
                    row["affiliation"] = place
                    yield row

    return list(_rows())

# Driver: parse the XML export, extract the first few articles, flatten them
# to one row per (article, author, affiliation) and show the result.
# The input path is relative, so it is resolved against the working directory.
tree = ET.parse("pubmed_result_sjogren.xml")

root = tree.getroot()

# Full run over every PubmedArticle in the file (disabled; the capped call
# below is the one that executes):
# data = get_all_data_for_each_article(root, len(root.findall(".//PubmedArticle")))

# Only the first 5 articles are processed here.
data = get_all_data_for_each_article(root, 5)

flattened_data = flatten_article_data(data)

# Dump the raw list of row dicts.
print(flattened_data)

# Each row dict becomes one DataFrame row; the dict keys become the columns.
df = pd.DataFrame(flattened_data)

print(df)

[{'forename': 'Ourania D', 'lastname': 'Argyropoulou', 'initials': 'OD', 'identity': '', 'affiliation': 'Department of Pathophysiology, School of Medicine, National and Kapodistrian University of Athens, Athens, Greece.'}, {'forename': 'Daphne M', 'lastname': 'Peelen', 'initials': 'DM', 'identity': '', 'affiliation': 'Department of Rheumatology, Amsterdam Rheumatology & Immunology Center.'}, {'forename': 'Ben G J C', 'lastname': 'Zwezerijnen', 'initials': 'BGJC', 'identity': '', 'affiliation': 'Department of Radiology and Nuclear Medicine.'}, {'forename': 'Esther J', 'lastname': 'Nossent', 'initials': 'EJ', 'identity': '', 'affiliation': 'Department of Pulmonary Medicine and Amsterdam Cardiovascular Sciences, Amsterdam UMC, Vrije Universiteit, Amsterdam, The Netherlands.'}, {'forename': 'Lilian J', 'lastname': 'Meijboom', 'initials': 'LJ', 'identity': '', 'affiliation': 'Department of Radiology and Nuclear Medicine.'}, {'forename': 'Otto S', 'lastname': 'Hoekstra', 'initials': 'OS', 'i