This notebook starts from the JSTOR Data for Research (DfR) metadata and OCR files and builds the dataframe that is used for topic modelling.

In [None]:
import pandas as pd
import re
from tqdm import tqdm
import xml.etree.ElementTree as ET
from pathlib import Path
import os
import sys
import numpy as np
import json
import datetime
import pickle

import nltk
from nltk import ne_chunk_sents, ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
nltk.download('punkt')
import spacy

from collections import Counter
import operator

from nltk.corpus import stopwords

# from nltk.stem.porter import PorterStemmer
# stemmer = PorterStemmer()

from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [None]:
def get_citations(doc_type, doc_root):
    """Extract the footnote citations of a parsed article.

    Only documents whose type is ``'research-article'`` are scanned; any
    other type yields an empty list.

    Args:
        doc_type (str): Value of the article's ``article-type`` attribute.
        doc_root (xml.etree.ElementTree.Element): Root element of the
            metadata XML document.

    Returns:
        list[tuple[str, str, str, str]]: One ``(author surname, source
        title, year, leading reference text)`` tuple per ``mixed-citation``
        element found under ``back/fn-group/fn/p``. A field that is missing
        or has no text is returned as ``''``.
    """
    if doc_type != 'research-article':
        return []

    def _text(node, path):
        # findtext() gives None when the element is absent and '' when it is
        # present but empty; normalise both to ''.
        return node.findtext(path) or ''

    return [
        (
            _text(cit, 'person-group/string-name/surname'),
            _text(cit, 'source'),
            _text(cit, 'year'),
            # Element.text is only the text before the first child element
            # and is None (it does not raise) when there is none.
            cit.text or '',
        )
        for cit in doc_root.findall('back/fn-group/fn/p/mixed-citation')
    ]

In [None]:
def xml2csv(src_path):
    """Build the initial dataset from the XML metadata files in ``src_path``.

    Every entry of the directory is parsed as XML. The resulting dataframe
    has the columns ``id``, ``type``, ``title``, ``auth1``, ``year``,
    ``lang`` and ``citations``.

    Args:
        src_path (String): path to directory of XML files to pull metadata from

    Returns:
        pandas.DataFrame: one row per metadata file. ``id`` is the list
        produced by splitting the file path at its last ``".x"`` (element 0
        is the path without the extension), ``citations`` is the list
        returned by ``get_citations``.

    Raises:
        xml.etree.ElementTree.ParseError: if a directory entry is not
            well-formed XML.
        KeyError / AttributeError / ValueError: if a document lacks the
            ``article-type`` attribute, the language meta value, or a
            numeric publication year.
    """
    src_path = Path(src_path).resolve()
    cols = ['id', 'type', 'title', 'auth1', 'year', 'lang','citations']
    rows = []
    for f in tqdm(src_path.iterdir(), desc='Reading metadata files'):
        root = ET.parse(f).getroot()
        # Keep the whole path: splitting on "metadata/" and taking element 0
        # threw the file name away on POSIX paths. rsplit with maxsplit=1
        # only cuts at the extension even if ".x" occurs earlier in the path.
        doc_id = str(f).rsplit(".x", 1)
        doc_type = root.attrib['article-type']

        # title handling
        # len(element) counts child elements; Element.getchildren() was
        # removed in Python 3.9.
        title_group = root.find('front/article-meta/title-group')
        title = ''
        if title_group is not None and len(title_group) > 0:
            title_texts = list(title_group.itertext())
            if len(title_texts) > 1:
                # Element 0 is normally the whitespace before the first child.
                title = title_texts[1]
            elif title_texts:
                # Compact XML without that whitespace: the only text is the title.
                title = title_texts[0]

        # author handling: first text chunk of each child of the first
        # string-name element (e.g. given names, surname)
        string_name = root.find('front/article-meta/contrib-group/contrib/string-name')
        if string_name is not None:
            auth1 = ' '.join(next(part.itertext(), '') for part in string_name)
        else:
            auth1 = ''

        lang = list(root.find('front/article-meta/custom-meta-group/custom-meta/meta-value').itertext())[0]
        year = int(list(root.find('front/article-meta/pub-date/year').itertext())[0])

        # citation handling
        citations = get_citations(doc_type, root)
        rows.append((doc_id, doc_type, title, auth1, year, lang, citations))

    # Build the frame once; growing it with df.loc[i] = ... copies it on every row.
    df = pd.DataFrame(rows, columns=cols)
    print(f"\nCollected {df.shape[0]} articles")
    return df

In [None]:
# clean auth1 values by splitting merged names
def format_names(name):
    """Insert the missing spaces into an author name whose parts were merged.

    A name is considered well formed when it has exactly one more ASCII
    capital letter than it has spaces; such names, and any name containing a
    character in the range U+0590..U+05EA, are returned unchanged. Otherwise
    the name is rebuilt from its capital-initial components.

    Arguments:
        name {String} -- Merged fore and surnames

    Returns:
        String -- the re-spaced name (text before the first capital letter
        is not part of any component and is therefore dropped)
    """
    # Names containing characters from this block are passed through as-is.
    for ch in name:
        if "\u0590" <= ch <= "\u05EA":
            return name

    capital_count = len(re.findall('[A-Z]', name))
    space_count = name.count(' ')
    if capital_count - space_count == 1:
        return name

    # One component per capital letter, trimmed of surrounding whitespace.
    pieces = [piece.strip() for piece in re.findall('[A-Z][^A-Z]*', name)]
    rebuilt = " ".join(pieces)
    # Re-attach hyphenated parts, then close the gap after a capital "I".
    rebuilt = rebuilt.replace("- ", "-")
    return rebuilt.replace("I ", "I")

In [None]:
def remove_misc_articles(df):
    """Split a dataframe into its non-'misc' rows and its 'misc' rows.

    The input dataframe is left untouched.

    Args:
        df (Pandas dataframe): Dataframe with a 'type' column

    Returns:
        [Tuple]: (copy of df without the rows whose type is 'misc',
                  dataframe holding only the 'misc' rows) -- the cleaned
                  frame comes first.
    """
    misc_labels = df.index[df['type'] == 'misc']
    misc_only = df.loc[misc_labels]
    # drop() without inplace returns a new frame, so df itself is not modified.
    without_misc = df.drop(misc_labels, axis=0)
    return (without_misc, misc_only)

In [None]:
def add_text (df1):
    """Load the OCR text of every row into the 'text' column.

    For each row, the first element of the 'id' list is rewritten in place
    by replacing 'metadata' with 'ocr'; the file at that path plus '.txt' is
    then read as UTF-8 and stored in 'text'.

    Args:
        df1 (Pandas dataframe): frame whose 'id' cells are lists with the
            metadata path (without extension) as element 0. Modified in place.

    Returns:
        Pandas dataframe: the same df1 object, with 'id' rewritten and
        'text' filled.

    Raises:
        FileNotFoundError: if the OCR file of a row does not exist.
    """
    # Walk every index label; starting at 1 left the first row without text.
    for i in df1.index:
        id_parts = df1.loc[i,'id']
        id_parts[0] = id_parts[0].replace('metadata','ocr')
        with open(id_parts[0]+'.txt','r',encoding='utf8') as infile:
            df1.loc[i,'text'] = infile.read()
    return df1

In [None]:
# Build the reference dataframe: read the metadata, tidy the author names,
# drop the 'misc' records, attach the OCR text and save the result.
refdf=xml2csv('DIRECTORY WITH DFR METADATA FILES')
# Series.apply returns a new Series; assign it back so the cleaned names are kept.
refdf['auth1']=refdf['auth1'].apply(format_names)
# remove_misc_articles returns (cleaned frame, misc frame); element 0 is the cleaned one.
ref_df=remove_misc_articles(refdf)
ref_df1=ref_df[0]
# Renumber the rows after the misc rows were dropped (the old index is kept as a column).
ref_df1=ref_df1.reset_index()
ref_df1['text']=''
reffinal_df=add_text(ref_df1)

reffinal_df.to_csv('referenceDF')


In [None]:
# Notebook display of the assembled dataframe.
reffinal_df

In [None]:
def _strip_ocr_markup(raw):
    """Remove separator runs, layout whitespace and plain_text tags from one text."""
    cleaned = raw.strip().replace('**********', '')
    cleaned = cleaned.replace('_______', ' ')
    cleaned = cleaned.replace('\n                    ', '')
    cleaned = cleaned.replace('         ', ' ')
    cleaned = cleaned.replace('<plain_text>', ' ')
    return cleaned.replace('</plain_text>', ' ')


reffinal_df['text'] = [_strip_ocr_markup(x) for x in reffinal_df['text']]
# Collapse every run of non-alphanumeric characters into a single space.
# The result is assigned back: calling replace(..., inplace=True) on the
# selected column is chained mutation and leaves the dataframe unchanged
# when pandas Copy-on-Write is active.
reffinal_df['text'] = reffinal_df['text'].replace('[^A-Za-z0-9]+',' ',regex=True)


In [None]:
def get_continuous_chunks(named_entities,text):
    """Append the distinct named entities found in ``text`` to ``named_entities``.

    The text is tokenised, POS-tagged and chunked with NLTK. Consecutive
    entity subtrees are merged into one entity string. Each entity is added
    at most once per call, in order of first appearance.

    Args:
        named_entities (list): accumulator, extended in place.
        text (str): text to scan.

    Returns:
        None
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    found = []
    seen = set()
    current_chunk = []

    def _flush():
        # Close the entity being built. The buffer is always emptied, even
        # when the entity is a repeat; otherwise its tokens would be glued
        # onto the next entity.
        if current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in seen:
                seen.add(named_entity)
                found.append(named_entity)
            current_chunk.clear()

    for node in chunked:
        if isinstance(node, Tree):
            # Entity subtree: collect its tokens.
            current_chunk.append(" ".join(token for token, pos in node.leaves()))
        else:
            # Plain (token, tag) pair: any entity in progress ends here.
            _flush()
    # An entity that ends the text has no following token to trigger the flush.
    _flush()
    named_entities += found

In [None]:
# Fetch the NLTK resources used for tagging and chunking, then collect the
# named entities of every article into one flat list.
for resource in ('averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

named_entities = []
article = 0
for a in reffinal_df['text']:
    get_continuous_chunks(named_entities, a)
    article = a  # keeps the text processed last
print(named_entities)

In [None]:
# Number of collected entity mentions (an entity is listed once per article it occurs in).
len(named_entities)

In [None]:
# Persist the flat entity list.
with open('list_of_named_entities.pickle', 'wb') as file:
    file.write(pickle.dumps(named_entities))

In [None]:
# Tally how often each entity string was collected; show the number of distinct entities.
named_entities_counts = Counter()
named_entities_counts.update(named_entities)
len(named_entities_counts)

In [None]:
# Turn the counter into a list of (entity, count) pairs, most frequent first,
# and save that list.
named_entities_counts = named_entities_counts.most_common()

with open('dict_of_named_entities_counts.pickle', 'wb') as file:
    file.write(pickle.dumps(named_entities_counts))

In [None]:
# Create final list of the (up to) 1000 most occurring named entities to remove from text.
# Slicing instead of indexing 0..999 avoids an IndexError when fewer than
# 1000 distinct entities were found.
common_entities = [entity for entity, _count in named_entities_counts[:1000]]
common_entities

In [None]:
# Manual step: copy the list printed by the previous cell into the brackets
# below, then delete every entry you want to KEEP in the text. Whatever
# remains in this list is removed from the articles later on.
entities_to_remove=[
    'YOUR WORDS HERE'
]

In [None]:
# Number of entities selected for removal.
len(entities_to_remove)

In [None]:
# Replace the list with an alphabetically sorted copy and display it.
entities_to_remove = list(entities_to_remove)
entities_to_remove.sort()
entities_to_remove

In [None]:
# Persist the removal list.
with open('entities_to_remove.pickle', 'wb') as file:
    file.write(pickle.dumps(entities_to_remove))

In [None]:
def remove_entities(article):
    """Delete every occurrence of the entities in ``entities_to_remove``.

    An entity is removed only where it stands as a whole word or phrase,
    i.e. where it is not directly preceded or followed by a letter, digit or
    underscore. This covers the contexts the previous if/elif chain listed
    (spaces, punctuation, quotes, brackets) as well as the start and end of
    the text. Words that merely contain an entity are left intact. One
    space following a removed entity is removed with it.

    Args:
        article (str): text to clean.

    Returns:
        str: the text without the listed entities.
    """
    # Reads the module-level list at call time. Empty strings are skipped
    # because they would match at every position.
    entities = [entity for entity in entities_to_remove if entity]
    if not entities:
        return article
    # Longest first, so a multi-word entity wins over an entity that is its prefix.
    alternation = '|'.join(re.escape(entity) for entity in sorted(entities, key=len, reverse=True))
    pattern = r'(?<!\w)(?:' + alternation + r')(?!\w) ?'
    return re.sub(pattern, '', article)

In [None]:
# Entity-free version of every article text.
reffinal_df['text_noent'] = reffinal_df['text'].map(remove_entities)

In [None]:
# Checkpoint: dataframe with cleaned text and entity-free text.
with open('raw_data_cleaned_named_ent_removed.pickle', 'wb') as file:
    file.write(pickle.dumps(reffinal_df))

In [None]:
# Split each entity-free text into tokens.
reffinal_df['tokenized_text'] = reffinal_df['text_noent'].map(word_tokenize)
# Keep purely alphabetic tokens only (drops punctuation and numbers).
reffinal_df['tokenized_nopunc'] = reffinal_df['tokenized_text'].map(
    lambda tokens: [tok for tok in tokens if tok.isalpha()]
)
# Lower-case every remaining token.
reffinal_df['tokenized_nopunc_lower'] = reffinal_df['tokenized_nopunc'].map(
    lambda tokens: [tok.lower() for tok in tokens]
)

# Note: to keep numbers as well, strip the characters of string.punctuation
# from the text instead of filtering tokens with isalpha().

In [None]:
# Check
# Spot check: lower-cased tokens of the row at position 200 (needs at least 201 rows).
reffinal_df.iloc[200]['tokenized_nopunc_lower']

In [None]:
# Corpus-specific stop words. Refine this list for your own data; you may
# also want to load it from a file.
# Entries are lower case because they are matched against lower-cased tokens
# (the former 'BOOK', 'I', 'REVIEWS' and 'University' could never match).
# Every entry is followed by a comma: adjacent string literals are silently
# concatenated, which had produced 'REVIEWSreviews' and 'worldwunt'.
custom_stop_words = ['ab', 'al', 'alten', 'america', 'atlanta', 'au', 'av', 'avrov', 'b', 'ba', 'bauer', 'berlin', 'book',
                    'boston', 'brill', 'brown', 'c', 'cad', 'cambridge', 'cf', 'ch', 'chap', 'chapter', 'charles',
                    'chicago', 'chs', 'cit', 'cite', 'claremont', 'college', 'craig', 'cum', 'd', 'dans', 'de', 'dennis',
                    'diese', 'dissertation', 'dm', 'dtr', 'ed', 'eds', 'eerdmans', 'ek', 'elisabeth', 'en', 'et',
                    'ev', 'ez', 'f', 'far', 'ff', 'fiir', 'g', 'gar', 'george', 'geschichte', 'gott', 'gottes',
                    'grand', 'h', 'ha', 'hall', 'hartford', 'hat', 'haven', 'henry', 'i', 'ia', 'ibid', 'io',
                    'isbn', 'iv', 'ivye', 'ix', 'jeremias', 'jesu', 'k', 'ka', 'kai', 'kal', 'kat', 'kee', 'ki', 'kim',
                    'kirche', 'klein', 'knox', 'l', 'la', 'le', 'leiden', 'leipzig', 'les', 'life', 'line', 'loc', 'louisville', 'm',
                    'ma', 'madison', 'marie', 'marshall', 'mohr', 'n', 'na', 'neuen', 'ni', 'nu', 'nur', 'o', 'ol',
                    'om', 'op', 'ov', 'ovadd', 'ovk', 'oxford', 'paper', 'pp', 'paulus', 'ph', 'philadelphia', 'point', 'post',
                    'pres', 'president', 'press', 'pro', 'prof', 'professor', 'quod', 'r', 'ra', 'rab', 'rapids', 'refer', 'review',
                    'reviews', 'ro', 'robert', 'robinson', 'rov', 's', 'sa', 'schmidt', 'schriften', 'scott', 'sec',
                    'section', 'seiner', 'sheffield', 'siebeck', 'stanely', 'studien', 't', 'text', 'thee', 'theologie',
                    'they', 'thing', 'thou', 'thy', 'tiibingen', 'tion', 'tov', 'tr', 'tv', 'u', 'um', 'univ', 'university', 'unto', 'v',
                    'van', 'verse', 'view', 'vol', 'volume', 'vs', 'vss', 'vv', 'w', 'william', 'world', 'wunt',
                    'y', 'yap', 'ye', 'york', 'zeit', '-PRON-', 'jews', 'jewish', 'judaism', 'page_sequence', 'page', 'book', 'text', 'doe',
                    'books', 'publish', 'include', 'say', 'die', 'der', 'des', 'das', 'und', 'ha', 'ha-', 'new', 'ica', 'ceede', 'sequence',
                    'ibn', 'ben', 'say', 'br', 'ts', 'aj', 'thing', 'iii', 'nx', 'va', 'pr', 'give', 'way', 'nn', 'im', 'ny', 'mn', 'rn', 'nm',
                    'ri', 'nl', 'gt']
    

In [None]:
# Combine spaCy's default English stop words with the custom list into one
# alphabetically sorted list. (The nltk_* names are historical: the words
# come from spaCy.)
nlp = spacy.load('en_core_web_sm')
nltk_stop = nlp.Defaults.stop_words
nltk_list = [*nltk_stop]
en_stop = sorted([*nltk_list, *custom_stop_words])

    

In [None]:
# Drop stop words. Membership is tested against a set built once: scanning
# the en_stop list for every token is linear in the list length.
_en_stop_lookup = set(en_stop)
reffinal_df['tokenized_nopunc_lower_nostop'] = [[word for word in x if word not in _en_stop_lookup] for x in reffinal_df['tokenized_nopunc_lower']]

In [None]:
# Check
# Spot check: number of tokens left in the row at position 500 after stop-word removal.
len(reffinal_df.iloc[500]['tokenized_nopunc_lower_nostop'])

In [None]:
# A second, general-purpose stop-word list applied after en_stop; just
# another way to keep adding words to remove from the text.
# Every entry is followed by a comma: adjacent string literals are silently
# concatenated, which had produced 'cancannot' and 'triedweek'.
# The misspelt entries 'involes' and 'alledgedly' are now 'involves' and
# 'allegedly'.
# 'people' and 'old' were commented out in the original list and are
# therefore NOT stop words.
extra_stop_words = [
    'big', 'small', 'low', 'high',
    'none', 'may', 'among', 'within', 'don', 't', 'day', 'etc', 'around',
    'frequent', 'including', 'even', 'can', 'likely', 'will', 'like',
    'today', 'bit', 'put', 'aim', 's', 'got', 'really', 'huge', 'see',
    'almost', 'already', 'much', 'recent', 'many', 'change', 'changes',
    'someone', 'said', 'says', 'gives', 'give',
    'new', 'say',
    'least', 'first', 'last', 'second',
    'one', 'two',
    'go', 'goes', 'take', 'going', 'taking', 'just',
    'can', 'cannot',
    'keep', 'keeps', 'also', 'done', 'good', 'get', 'without', 'told',
    'might', 'time', 'unable', 'able', 'know', 'end', 'now', 'want',
    'didn', 'back', 'doesn', 'couldn', 'since', 'shouldn', 'seen', 'works',
    'zero', 'every', 'each', 'other', 'ever', 'neither', 'll', 'mr', 'ms',
    'mrs', 'think', 'tomorrow', 'way', 'still', 'know', 'later', 'fine',
    'let', 'went', 'night', 've', 'must', 'act', 're',
    'c', 'b', 'a',
    'done', 'began', 'ones', 'm', 'soon', 'word', 'along', 'main', 'q',
    'lot',
    'e', 'd',
    'entire', 'year', 'mean', 'means', 'important', 'always', 'something',
    'rather', 'either', 'makes', 'make', 'uses', 'use', 'enough',
    'w', 'd',
    'never', 'giving', 'o', 'involve', 'involves', 'involving', 'little',
    'inside', 'sat',
    'third', 'fourth', 'fifth', 'sixth',
    'next', 'given',
    'million', 'billion', 'millions', 'billions',
    'option', 'options', 'full', 'complete', 'need', 'needs', 'set',
    'manage', 'sets', 'manages',
    'bring', 'brings', 'brought',
    'try', 'tries', 'tried',
    'week', 'former',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'spent', 'spend', 'spends',
    'month', 'months',
    'send', 'sends', 'sent',
    'went',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
    'allow', 'process',
    'times', 'nearly',
    'looking', 'looks', 'look',
    'thinly', 'becoming',
    'stay', 'stays',
    'took', 'takes', 'take',
    'types', 'type',
    'thought', 'though',
    'idea',
    'clear', 'clearly',
    'behind', 'half', 'us', 'less',
    'claim', 'claims',
    'long', 'short',
    'smaller', 'larger', 'bigger', 'largest', 'biggest', 'smallest',
    'longer', 'shorter', 'short', 'long',
    'extreme', 'severe',
    'largely', 'anymore', 'years', 'spoke',
    'give', 'gave', 'given', 'gives',
    'reportedly', 'supposedly', 'allegedly',
    'please',
    'received', 'receive', 'receives',
    'longtime', 'best', 'existing',
    'putting', 'put', 'puts',
    'whose', 'yesterday',
    # added later
    'thing', 'week', 'another', 'month', 'day', 'come',
]

In [None]:
# Drop the extra stop words. A set built once gives constant-time lookups;
# the list contains duplicates and was scanned linearly for every token.
_extra_stop_lookup = set(extra_stop_words)
reffinal_df['tokenized_nopunc_lower_nostop_extra'] = [[word for word in x if word not in _extra_stop_lookup] for x in reffinal_df['tokenized_nopunc_lower_nostop']]

In [None]:
# Lemmatise every remaining token with the WordNet lemmatiser (default part of speech).
reffinal_df['tokenized_nopunc_lower_nostop_extra_lemmatized'] = reffinal_df['tokenized_nopunc_lower_nostop_extra'].map(
    lambda words: [lemma.lemmatize(word) for word in words]
)

In [None]:
# Save the full dataframe with every intermediate column.
with open('clean_data_full.pickle', 'wb') as file:
    file.write(pickle.dumps(reffinal_df))

# Keep only the final token column, renamed to 'article_text', and save that too.
raw_small = reffinal_df[['tokenized_nopunc_lower_nostop_extra_lemmatized']].rename(
    columns={'tokenized_nopunc_lower_nostop_extra_lemmatized': 'article_text'}
)

with open('clean_data_small.pickle', 'wb') as file:
    file.write(pickle.dumps(raw_small))


In [None]:
# Split the corpus by article type and keep, for each subset, only the final
# token column renamed to 'article_text'. Citations are saved separately
# together with the article id.
def _article_text_frame(frame):
    """Return a one-column frame holding the lemmatised tokens as 'article_text'."""
    return frame[['tokenized_nopunc_lower_nostop_extra_lemmatized']].rename(
        columns={'tokenized_nopunc_lower_nostop_extra_lemmatized': 'article_text'}
    )


research_article_interim = reffinal_df[reffinal_df['type'] == 'research-article']
book_review_interim = reffinal_df[reffinal_df['type'] == 'book-review']
research_article_small = _article_text_frame(research_article_interim)
book_review_small = _article_text_frame(book_review_interim)
citations_small = reffinal_df.filter(items=['id', 'citations'], axis=1)

for _target, _frame in (
    ('clean_data_research_small.pickle', research_article_small),
    ('clean_data_book_small.pickle', book_review_small),
    ('clean_data_citations_small.pickle', citations_small),
):
    with open(_target, 'wb') as file:
        pickle.dump(_frame, file)

In [None]:
# Also export the id/citations table as CSV (the index is written as the first column).
citations_small.to_csv('citations.csv')