# Merge Publications

This script reads the XML files in `../dat/xml`, each containing the publications of one professor, extracts the relevant data, and merges it into a single CSV file, `../dat/Publications.csv`. The CSV file includes columns for professor name, publication type, venue, title, authors, and year.


In [1]:
import os
import re
import csv
from tqdm import tqdm
import xml.etree.ElementTree as ET

In [2]:
# Folder that is scanned for the input '.xml' files.
xml_folder = './../dat/xml'
# CSV file that the extracted publication rows are written to.
publications_file = './../dat/Publications.csv'

In [3]:
def get_publication_type(pub_element):
    """Classify a publication element into one of three type labels.

    The rules are applied in this order:

    1. Any element carrying a 'publtype' attribute (whatever its value) is
       'Informal or Other Publication'.
    2. An element with a direct 'journal' child is a 'Journal Article'.
    3. An element with a direct 'booktitle' child is a
       'Conference or Workshop Paper'.
    4. Everything else is 'Informal or Other Publication'.

    Args:
        pub_element: An ElementTree element describing one publication.

    Returns:
        The type label as a string.
    """
    informal_label = 'Informal or Other Publication'

    # The mere presence of the attribute decides the type; its value is ignored.
    if 'publtype' in pub_element.keys():
        return informal_label

    # 'journal' is tested first, so it wins when both venue children exist.
    if pub_element.find('journal') is not None:
        return 'Journal Article'
    if pub_element.find('booktitle') is not None:
        return 'Conference or Workshop Paper'

    return informal_label

def _element_text(element):
    """Return all text inside ``element``, including text nested in child tags."""
    # ``element.text`` stops at the first child tag and is None for an empty
    # element, so a value containing inline markup (e.g. <i>, <sub>) would be
    # truncated. itertext() walks the whole subtree and always yields strings.
    return ''.join(element.itertext())


def extract_data_from_xml(xml_file):
    """Extract the publications listed in one XML file.

    The root element must carry a 'name' attribute (the professor); the
    publications are the 'article' and 'inproceedings' elements that are
    direct children of an 'r' element. Other entry kinds are skipped.
    NOTE(review): this layout looks like a DBLP person export - confirm.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list of dicts, one per publication, with the keys 'Professor',
        'Publication Type', 'Venue', 'Title', 'Authors' and 'Year'. Missing
        venue/title/year elements give an empty string; authors are joined
        with ';'.

    Raises:
        KeyError: If the root element has no 'name' attribute.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()

    # Trailing digits are removed from the professor and author names
    # (e.g. 'Wei Wang 0001' -> 'Wei Wang').
    professor_name = re.sub(r'\d+$', '', root.attrib['name']).strip()

    publications = []
    for entry in root.findall('.//r/*'):
        if entry.tag not in ('article', 'inproceedings'):
            continue

        # The venue is the first of 'journal' / 'booktitle' that is present.
        venue = ''
        for venue_tag in ('journal', 'booktitle'):
            venue_element = entry.find(venue_tag)
            if venue_element is not None:
                venue = _element_text(venue_element)
                break

        title_element = entry.find('title')
        year_element = entry.find('year')

        # _element_text never returns None, so an empty <author/> no longer
        # makes re.sub raise TypeError.
        authors = ';'.join(
            re.sub(r'\d+$', '', _element_text(author)).strip()
            for author in entry.findall('author')
        )

        publications.append({
            'Professor': professor_name,
            'Publication Type': get_publication_type(entry),
            'Venue': venue,
            'Title': _element_text(title_element) if title_element is not None else '',
            'Authors': authors,
            'Year': _element_text(year_element) if year_element is not None else '',
        })

    return publications

In [4]:
# Collect the publications from every '.xml' file in `xml_folder`.
publications = []

# os.listdir() returns entries in arbitrary order; sorting the names keeps the
# row order of the output file stable between runs.
xml_files = sorted(name for name in os.listdir(xml_folder) if name.endswith('.xml'))
for xml_file in tqdm(xml_files):
    xml_path = os.path.join(xml_folder, xml_file)
    publications.extend(extract_data_from_xml(xml_path))

# Take the columns from the first record. When nothing was extracted, fall back
# to the known column set so that a header-only file is written instead of
# failing with IndexError on publications[0].
fieldnames = list(publications[0]) if publications else [
    'Professor', 'Publication Type', 'Venue', 'Title', 'Authors', 'Year',
]

# newline='' is required by the csv module so it controls line endings itself.
with open(publications_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(publications)

100%|██████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 92.86it/s]
