In [3]:
import os
import pandas as pd
from tqdm import tqdm
import xml.etree.ElementTree as ET
import re
from pathlib import Path

file_dir = Path(os.getcwd()).resolve()
meta_path = file_dir / 'jstor_data' / 'metadata'

In [6]:
files = meta_path.iterdir()
cols = ['id', 'type', 'title', 'author', 'year', 'lang', 'file_x', 'file_y']
df = pd.DataFrame(columns=cols)

for i, f in tqdm(enumerate(files), desc='Reading metadata files'):
    tree = ET.parse(f)
    root = tree.getroot()
    f_string = str(f)
    m = re.search('metadata/(.+?).xml', f_string)
    if m:
        id = m.group(1)
    else:
        id = 'ID not found'
    type = root.attrib['article-type']
    # title handling
    title_group = root.find('front/article-meta/title-group')
    if title_group is not None and len(title_group.getchildren()) > 0:
        title = list(title_group.itertext())[1]
    else:
        title = ''
    # author handling
    contrib_group = root.find('front/article-meta/contrib-group')
    if contrib_group is not None and len(contrib_group.getchildren()) > 0:
        author = ' '.join([list(c.itertext())[0] for c in root.find('front/article-meta/contrib-group/contrib/string-name')])
    else:
        author = ''
    lang = list(root.find('front/article-meta/custom-meta-group/custom-meta/meta-value').itertext())[0]
    year = int(list(root.find('front/article-meta/pub-date/year').itertext())[0])
    file_x = str(f).split("metadata/")[1]
    file_y = file_x.replace('.xml', '-ngram1.txt')
    df.loc[i] = [id, type, title, author, year, lang, file_x, file_y]
print(f"\nCollected {df.shape[0]} articles")

Reading metadata files: 1644it [00:11, 146.39it/s]
Collected 1644 articles



In [7]:
df.head()

Unnamed: 0,id,type,title,author,year,lang,file_x,file_y
0,journal-article-10.2307_1486423,research-article,A Sociological Portrait of German Jewish Immig...,Stephen G. Mostov,1978,eng,journal-article-10.2307_1486423.xml,journal-article-10.2307_1486423-ngram1.txt
1,journal-article-10.2307_1486465,research-article,"Lawrence Perlman's ""Buber's Anti-Kantianism"": ...",Steven T. Katz,1990,eng,journal-article-10.2307_1486465.xml,journal-article-10.2307_1486465-ngram1.txt
2,journal-article-10.2307_1486694,misc,Collected Studies,,1996,eng,journal-article-10.2307_1486694.xml,journal-article-10.2307_1486694-ngram1.txt
3,journal-article-10.2307_4131787,book-review,,Gordon M. Freeman,2003,eng,journal-article-10.2307_4131787.xml,journal-article-10.2307_4131787-ngram1.txt
4,journal-article-10.2307_1566672,book-review,\n,James S. Diamond,2000,eng,journal-article-10.2307_1566672.xml,journal-article-10.2307_1566672-ngram1.txt


In [8]:
df.to_csv("articles_raw.csv", index=False)