In [205]:
import pandas as pd
import re
from tqdm import tqdm
import xml.etree.ElementTree as ET
from pathlib import Path
import os
import sys
import numpy as np
import json
import datetime
import pickle

import nltk
from nltk import ne_chunk_sents, ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
nltk.download('punkt')
import spacy

from collections import Counter
import operator

from nltk.corpus import stopwords

# from nltk.stem.porter import PorterStemmer
# stemmer = PorterStemmer()

from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msatlow\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [216]:
def get_citations(doc_type, doc_root):
    if doc_type != 'research-article':
        return []
    xml_cits = doc_root.findall('back/fn-group/fn/p/mixed-citation')
    citations = []
    for i in range(len(xml_cits)):
        try:
            cit_author = xml_cits[i].find('person-group/string-name/surname').text
        except AttributeError:
            cit_author = ''
        try:
            cit_title = xml_cits[i].find('source').text
        except AttributeError:
            cit_title = ''
        try:
            cit_year = xml_cits[i].find('year').text
        except AttributeError:
            cit_year = ''
        try:
            cit_reference = xml_cits[i].text
        except AttributeError:
            cit_reference = ''
        source = (cit_author, cit_title, cit_year, cit_reference)
        citations.append(source)
    return citations

In [217]:
def xml2csv(src_path):
    """Creates an initial dataset from XML files found in src_path. Columns of the CSV include
    id, author, title, year, type, and language. Returns a pandas dataframe.

    Args:
        src_path (String): path to directory of XML files to pull metadata from
    """
    src_path = Path(src_path).resolve()
    files = src_path.iterdir()
    cols = ['id', 'type', 'title', 'auth1', 'year', 'lang','citations']
    df = pd.DataFrame(columns=cols)
    for i, f in tqdm(enumerate(files), desc='Reading metadata files'):    
        tree = ET.parse(f)
        root = tree.getroot()
        id = str(f).split("metadata/")[0].split(".x")
        type = root.attrib['article-type']
        # title handling
        title_group = root.find('front/article-meta/title-group')
        if title_group is not None and len(title_group.getchildren()) > 0:
            title = list(title_group.itertext())[1]
        else:
            title = ''
        # author handling
        contrib_group = root.find('front/article-meta/contrib-group')
        if contrib_group is not None and len(contrib_group.getchildren()) > 0:
            auth1 = ' '.join([list(c.itertext())[0] for c in root.find('front/article-meta/contrib-group/contrib/string-name')])
        else:
            auth1 = ''
        lang = list(root.find('front/article-meta/custom-meta-group/custom-meta/meta-value').itertext())[0]
        year = int(list(root.find('front/article-meta/pub-date/year').itertext())[0])
        # citation handling
        citations = get_citations(type, root)
        df.loc[i] = [id, type, title, auth1, year, lang, citations]
    print(f"\nCollected {df.shape[0]} articles")
    return df

In [208]:
# clean auth1 values by splitting merged names
def format_names(name):
    """Splits merged strings representing author names into forename and surname.
    Does not modify correctly formatted names.

    Arguments:
        name {String} -- Merged fore and surnames
    """
    n_caps = len(re.findall('[A-Z]', name))
    n_spaces = len(re.findall(' ', name))
    if any("\u0590" <= c <= "\u05EA" for c in name):
        # pass formatting for non-English names
        return name
    if n_caps - n_spaces != 1:
        comps = re.findall('[A-Z][^A-Z]*', name)
        # remove whitespace before or after components
        comps = [c.strip() for c in comps]
        f_name = " ".join(comps).replace("- ", "-").replace("I ", "I")
        return f_name
    else:
        return name

In [209]:
def remove_misc_articles(df):
    """Removes articles with the type 'misc' and stores them in a
    separate dataframe. Returns a tuple of the misc dataframe
    and a copy of df with the misc article rows removed.

    Args:
        df (Pandas dataframe): Dataframe from which to remove misc rows

    Returns:
        [Tuple]: (misc dataframe, copy of original dataframe with misc removed)
    """
    clean_df = df.copy()
    misc_indices = df[df['type'] == 'misc'].index
    misc_df = df.loc[misc_indices]
    clean_df.drop(misc_indices, axis=0, inplace=True)
    return (clean_df, misc_df)

In [210]:
def add_text (df1):
    ocr_dir='/Users/msatlow/Dropbox/AJSProject/data/jstor_data/ocr/'
    for i in range (1,len(df1)):
        df1.loc[i,'id'][0]=df1.loc[i,'id'][0].replace('metadata','ocr')
        text_id=df1.loc[i,'id'][0]+'.txt'
        with open (text_id,'r',encoding='utf8') as infile:
            f=infile.read()
            df1.loc[i,'text']=f
    return df1

In [225]:
refdf=xml2csv('/Users/msatlow/Dropbox/AJSProject/data/jstor_data/metadata')
refdf['auth1'].apply(format_names)
ref_df=remove_misc_articles(refdf)
ref_df1=ref_df[0]
ref_df1=ref_df1.reset_index()
ref_df1['text']=''
reffinal_df=add_text(ref_df1)

reffinal_df.to_csv('referenceDF')


Reading metadata files: 1644it [00:11, 148.29it/s]



Collected 1644 articles


In [230]:
reffinal_df

Unnamed: 0,index,id,type,title,auth1,year,lang,citations,text
0,1,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,The Beginnings of Modern Hebrew Literature: Pe...,Arnold J. Band,1988,eng,"[(Bate, The Burden of the Past and the English...",
1,2,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,Sholem Aleichem: Mythologist of the Mundane,David G. Roskies,1988,eng,"[(Roskies, Prooftexts, 1986, \nDavid G. Roskie...","<plain_text><page sequence=""1"">SHOLEM ALEICHEM..."
2,3,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,"""A Word for My Blood:"" A Reading of Kadya Molo...",Kathryn Hellerstein,1988,eng,"[(Molodowsky, Fun mayn elterzeydns yerushe, , ...","<plain_text><page sequence=""1"">""A WORD FOR MY ..."
3,4,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,Warding off Chaos: Repetition and Obsession in...,Janet Hadda,1988,eng,"[(Thorner, International Journal of Psycho-Ana...","<plain_text><page sequence=""1"">WARDING OFF CHA..."
4,5,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,Binary Oppositions in the Poetry of Amir Gilboa,Warren Bargad,1988,eng,"[(Tamir-Ghez, PTL, 1978, \nNomi Tamir-Ghez, ""B...","<plain_text><page sequence=""1"">BINARY OPPOSITI..."
5,6,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,"""Kill Your Ordinary Common Sense and Maybe You...",David C. Jacobson,1988,eng,"[(Appelfeld, Masot beguf rishon, 1979, \nAharo...","<plain_text><page sequence=""1"">""KILL YOUR ORDI..."
6,7,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,Ghost-Writing: Philip Roth's Portrait of the A...,Donald Kartiganer,1988,eng,"[(, Reading Myself and Others, 1985, \nReading...","<plain_text><page sequence=""1"">GHOST-WRITING: ..."
7,8,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,"Reinventing Bruno Schulz: Cynthia Ozick's ""The...",Naomi Sokoloff,1988,eng,"[(Wieniewska, The Street of Crocodiles, 1977, ...","<plain_text><page sequence=""1"">REINVENTING BRU..."
8,13,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,Mishnaic Literary History and the History of a...,Martin S. Jaffee,1986,eng,"[(Epstein, Introduction to the Text of the Mis...","<plain_text><page sequence=""1"">MISHNAIC LITERA..."
9,14,[C:\Users\msatlow\Dropbox\AJSProject\data\jsto...,research-article,The Post: Rav Ashi Amoraim: Transition or Cont...,Richard Kalmin,1986,eng,"[(Brüll, Jahrbücher für jüdische Geschichte un...","<plain_text><page sequence=""1"">THE POST-RAV AS..."


In [231]:
reffinal_df['text'] = [''.join(x.strip().split('**********')) for x in reffinal_df['text']]
reffinal_df['text'] = [' '.join(x.split('_______')) for x in reffinal_df['text']]
reffinal_df['text'] = [''.join(x.split('\n                    ')) for x in reffinal_df['text']]
reffinal_df['text'] = [' '.join(x.split('         ')) for x in reffinal_df['text']]
reffinal_df['text'] = [' '.join(x.split('<plain_text>')) for x in reffinal_df['text']]
reffinal_df['text'] = [' '.join(x.split('</plain_text>')) for x in reffinal_df['text']]
reffinal_df['text'].replace('[^A-Za-z0-9]+',' ',regex=True,inplace=True)


In [138]:
def get_continuous_chunks(named_entities,text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
            else:
                continue
    named_entities += continuous_chunk

In [145]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
named_entities = []
article = 0
for a in reffinal_df['text']:
    get_continuous_chunks(named_entities,a)
    article = a
print(named_entities)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\msatlow\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\msatlow\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\msatlow\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [146]:
len(named_entities)

269840

In [147]:
with open('list_of_named_entities.pickle', 'wb') as file:
    pickle.dump(named_entities, file)

In [148]:
named_entities_counts = Counter(named_entities)
len(named_entities_counts)

187442

In [149]:
named_entities_counts = sorted(named_entities_counts.items(), key=operator.itemgetter(1),reverse=True)
 
with open('dict_of_named_entities_counts.pickle', 'wb') as file:
    pickle.dump(named_entities_counts, file)

In [150]:
# Create final list of 1000 most occurring named entities to remove from text
common_entities = []
for i in np.arange(0,1000):
    common_entities.append(
#         [
            named_entities_counts[i][0]
#                                ,named_entities_counts[i][1]]
    )
common_entities

['Jewish',
 'Israel',
 'Hebrew',
 'Jews',
 'BOOK',
 'New York',
 'University',
 'Christian',
 'English',
 'Judaism',
 'German',
 'Jerusalem',
 'God',
 'Torah',
 'European',
 'Europe',
 'Jewish Jewish',
 'AJS',
 'America',
 'Bible',
 'American',
 'Talmud',
 'New',
 'Jewish Studies',
 'Moses',
 'French',
 'Germany',
 'Greek',
 'California',
 'Book',
 'Palestine',
 'Israeli',
 'Yiddish',
 'David',
 'Abraham',
 'United States',
 'Roman',
 'Modern',
 'Mishnah',
 'Middle',
 'Lord',
 'Palestinian',
 'Maimonides',
 'Egypt',
 'Western',
 'Zionist',
 'Jewish Christian',
 'Chicago',
 'Russian',
 'Jacob',
 'Sefer',
 'France',
 'See',
 'Spain',
 'Arabic',
 'Babylonian',
 'Muslim',
 'Biblical',
 'Spanish',
 'Ibid',
 'Land',
 'Joseph',
 'Temple',
 'Israelite',
 'Sabbath',
 'Poland',
 'Medieval',
 'British',
 'Scripture',
 'Hebrew University',
 'Israel Israel',
 'R',
 'Zionism',
 'Eastern',
 'Rabbinic',
 'London',
 'Middle Ages',
 'Italian',
 'Islamic',
 'East',
 'Italy',
 'Christianity',
 'Old',
 'Ea

In [232]:
entities_to_remove=[
  'Jewish',
 'Israel',
 'Hebrew',
 'Jews',
 'BOOK',
 'New York',
 'University',
 
 
 'Judaism',
 
 'Jerusalem',
 
 
 
 'Jewish Jewish',
 'AJS',
 
 'New',
 'Jewish Studies',
 'Moses',
 
 'California',
 'Book',
 'Palestine',
 
 'David',
 'Abraham',
 'United States',
 
 'Modern',
 'Mishnah',
 
 'Lord',
 
 'Jewish Christian',
 'Chicago',
 
 'Jacob',

 'France',
 
 
 'Ibid',
 
 
 'Hebrew University',
 'Israel Israel',
 'R',
 
 
 'London',
 
 'East',
 
 'Christianity',
 

 
 'Society',
 
 
 'Pennsylvania',
 'Cohen',
 'Solomon',
 
 
 'American Jewish',
 'Hebrew Hebrew',
 
 'Isaac',

 
 'Late',

 'Tel Aviv',
 
 'Tel',
 'Second',
 'Religion',
 
 
 'Sinai',
 'Congress',
 
 
 'Chris',
 
 'How',
 
 'Life',

 'Levi',
 
 
 'Jewish Research',
 'Ancient',
 'American Academy',

 
 'Studies',
 
 'III',
 'House',

 
 'Christian Christian',
 
 
 'Ha',
 
 'Paul',
 
 'German Jewish',
 
 
 'Jewry',
 'American American',
 
 
 'Song',
 
 'Torah Torah',

 
 'AJS Review',
 'Judaica',
 
 'First',
 
 'Berkeley University',
 
 'Leiden E',
 
 'Origins',
 'Princeton University',
 
 'New York Oxford University',
 'Songs',
 'Land Israel',
 
 'Jews Jews',
 
 

 
 'Yiddish Yiddish',
 'Jewish Jewish Jewish',
 
 'Emergence',
 'Jewish Israel',

 'Jewish Hebrew',
 
 'Association',
 'Century',
 
 'Part',
 'Social',
 
 'Leiden Brill',
 'Jewish Judaism',
 'Yisra',
 'Development',
 'REVIEW',
 'Atlanta Scholars',
 
 'Institute',
 
 'Epstein',
 'Benjamin',
 
 'Jewish New York',
 'Hebrew Jewish',
 
 'Israel Jewish',
 
 'Venice',
 'North',
 'Him',
 'Community',
 
 'German German',
 'Culture',
 'Bialik',
 
 
 'John',
 'Oriental',
 'Children',
 
 'Jewish European',
 
 'Religious History',
 'Jewish History',
 'Journal',

 'Com',
 'Yohai',
 'Reviews',
 
 'Israel God',
 
 'Simon',
 'Jewish Europe',
 'New York New York',
 'University New York',
 'Al',
 'Ibid Ibid',
 'Austrian',
 'Meir',
 
 'Brandeis University',
 
 'Pales',
 
 'God God',
 'South',
 
 'Jewish Community',
 'Maimonides Maimonides',
 'Hebrew Yiddish',
 'Text',

 'Concept',
 'Numbers',
 
 'Sciences',
 
 'Alabama',
 'Mishnah Mishnah',
 'Neusner',

 'Christian Jews',
 'Rise',
 
 'Rab',
 'Texts',
 'Alexander',
 
 'Albany State University',
 'Brooklyn',
 
 'Light',
 
 'Harvard',
 'Perplexed',
 'Ger',
 
 'Britain',
 'Mai',
 
 'Jerusa',
 'Hebrew English',
 'Maimonides Guide',
 'School',
 'Central',
 'Idea',
 'Sha',
 
 
 'Iran',
 'Dutch',
 'Israeli Israeli',
 'God Israel',

 'Bar Ilan',

 'Hebrew Arabic',
 
 'Oxford',
 
 'Wayne State University',
 
 'Cambridge',
 'V',
 'Historical',
 'Writings',
 'Menahem',
 'Yosef',
 'Jewish America',
 'R Yohanan',

 'Amer',
 'European Jewish',
 
 'Essays',
 'European Jewry',
 'Albeck',

 'Asher',
 'Jordan',
 'Who',
 
 'New York City',
 'Jewish Society',
 'Katz',
 'De',
 

 'Bloomington Indiana University',
 
 'San',
 'Compare',
 
 'Epistle',
 'Mishnah Tosefta',
 'Lieberman',
 'Kingdom',
 'Turkey',
 'Damascus',
 'Gershom',
 'Alexander Altmann',
 'City',
 'Baron',
 'Garden',
 
 'Toronto',
 'Northern',
 'Religious',
 'Sephardi',
 'Judaic',
 'Jewish BOOK',
 
 'R Judah',
 'Death',
 'Beginnings',
 'Torah Israel',
 'Center',
 'Ramat Gan Bar Ilan University',
 'Nahman',
 'Yiddish Hebrew',
 
 'Columbia',
 
 'Jews Christian',

 'Science',
 'Gordon',
 
 'Jacob Katz',
 'Jewish Culture',

 'Jewish Israeli',
 'Boyarin',
 'Sabbath Sabbath',
 'Rachel',
 'Frank',
 'Eliezer',
 
 'Arabic Hebrew',
 'Don',
 'Goitein',
 'Davidic',
 
 'Context',
 
 'Israel Israeli',
 'Nature',
 'Tal',
 'Michigan',
 

 'Boston',
 'Poetry',
 'Six',

 'Dan',


 'Baby',
 'Works',
 
 'Republic',
 'Israel Torah',
 'North African',
 'Indian',
 
 'Chicago University',
 'Note',
 'Books',
 
 'Wisconsin',
 'Identity',
 'Origin',
 'THE',
 'Ma',
 
 'Introduction',
 'Babylonian Palestinian',
 
 'No',
 
 'Le',
 'Jewish Yiddish',
 'Council',
 
 'Pale',
 'High Middle Ages',

 'Quest',
 'Authority',
 
 'New York Cambridge University',
 
 
 'Holo',
 'Israel Hebrew',
 'Central Europe',
 'Eli',
 'Further',
 
 
 'Weiss',
 'Rav Kahana',
 
 'Change',
 'German Jewry',

 'Jeru',
 
 'Jerusalem Mosad',
 'Nebraska',
 'Poetics',
 
 'Florence',
 'BOOK Jewish',
 'Moshe',
 'India',
 
 
 'Hebrew German',

 'Bar Ilan University',
 'Israel Lord',

 'Rubenstein',
 
 'Princeton NJ Princeton University',
 'Rabbah',
 'America New York',
 
 'Polish Jewish',
 'Kaplan',
 'Impact',
 
 
 'Stern',

 'Yehudit',
 'Anglo',
 
 'Mordecai',
 
 'Search',

 'Arnold',

 'French French',
 

 'Lord Lord',
 'New York New York University',

 'David David',
 'Limits',
 
 'PhD',
 'Mohr Siebeck',
 'Fiction',
 'Cornell University',
 
 'Post',
 
 'Con',
 'Breslau',
 
 
 'Fall',
 
 'Letters',

 'University Library',
 
 'Return',
 
 
 
 'Les',
 'Code',
 'Evolution',

 'European Jews',
 'London University',
 
 'Cracow',
 
 'Use',
 'BCE',
 'Dan Miron',
 'Cincinnati',
 
 
 'Department',
 'Talmud Talmud',
 
 'Revue',

 'Michael',
 'Chronicle',
 'Bible Bible',
 'Encyclopaedia',
 'Vatican',
 'Palestinian Palestinian',
 'Mass Harvard University',
 
 'Magnes',
 'Philadelphia',
 
 'Hence',
 'Rhetoric',
 'Jerusalem Mossad',
 
 'Sixteenth Century',
 

 'Jewish National',
 'Marcus',
 'Saul Lieberman',
 
 'K',

 'Stein',


 'Yehuda',

 'English Hebrew',
 'Judaism Jewish',
 
 'Twersky',

 'Hebrew Israel',
 'New York America',

 'New York University',
 'City University',
 'Work',
 'Tel Aviv Am',
 'Juden',

 
 'Kol',
 'Jews Judaism',
 'Moshe Idel',
 'Cambridge University',
 
 'Jerusalem Bialik',

 'Role',
 
 'Good',
 'Euro',
 'Gershon Shaked',
 'Geschichte',
 'Family',
 'Yehudim',
 'Jewish Muslim',
 
 'Human',
 
 'Classical',

 'Minnesota',
 
 'Russian Russian',
 'Jonathan',
 'God Himself',
 'Yitzhak Baer',
 'State',
 
 'Rosh',
 'Method',
 
 'Me',
 'Thou',
 'Almighty',
 'Union',
 'Pines',
 
 
 'Judaism Judaism',
 
 'North America',
 
 'Steven Fraade',
 'Johns Hopkins University',

 'AJSAJS',
 'DAVID',
 
 'Shir',

 'Kegan Paul',
 'Narrative',
 
 'Greek Greek',
 'Rabbi Shimon',

 'Board',
 'Active',

 'Columbia University',
 'American American Jewish',
 'Days',
 'Social History',
 'Berkeley',
 'Los',
 'Yisrael',
 
 'Leon',
 'Baal',
 
 'Oxford University',
 'Nineteenth Century',
 'MSS',
 'Cohen Cohen',
 'Altmann',
 
 'Anglo American',
 'Jewish United States',
 'Louis',
 'Song Songs',
 'Baron A Social',
 'Poor',
 
 'Inn',
 'Ben Gurion University',
 
 'Israeli Israel',
 
 'Simon Dubnow',
 'Lord Israel',

 'Wolfson',

 'Tin',
 'Encyclopedia',
 'Jewish Eastern Europe',
 
 'Public',
 'Negev',
 'Men',
 'Which',
 
 'Princeton Princeton University',
 
 'Schwartz',
 'Friedman',
 'Jerusalem Jewish',

 'Der',
 'Peretz',
 'Childhood',
 'Scrip',
 
 'Child',
 'Structure',
 'Philip',
 'Due',
 'Toledo',
 'Ivrit',
 'Ark',
 'Urbach',
 'Niedergang',
 
 'Ginzberg',
 'Son',
 
 'Ben',
 
 'Jerusalem Hebrew',
 'Beit',
 'Wars',
 'Praise',
 'American America',
 'Europe Jewish',
 
 'Main',
 'Narbonne',
 'Conflict'
]

In [233]:
len(entities_to_remove)

458

In [234]:
entities_to_remove=sorted(entities_to_remove)
entities_to_remove

['AJS',
 'AJS Review',
 'AJSAJS',
 'Abraham',
 'Active',
 'Al',
 'Alabama',
 'Albany State University',
 'Albeck',
 'Alexander',
 'Alexander Altmann',
 'Almighty',
 'Altmann',
 'Amer',
 'America New York',
 'American Academy',
 'American America',
 'American American',
 'American American Jewish',
 'American Jewish',
 'Ancient',
 'Anglo',
 'Anglo American',
 'Arabic Hebrew',
 'Ark',
 'Arnold',
 'Asher',
 'Association',
 'Atlanta Scholars',
 'Austrian',
 'Authority',
 'BCE',
 'BOOK',
 'BOOK Jewish',
 'Baal',
 'Baby',
 'Babylonian Palestinian',
 'Bar Ilan',
 'Bar Ilan University',
 'Baron',
 'Baron A Social',
 'Beginnings',
 'Beit',
 'Ben',
 'Ben Gurion University',
 'Benjamin',
 'Berkeley',
 'Berkeley University',
 'Bialik',
 'Bible Bible',
 'Bloomington Indiana University',
 'Board',
 'Book',
 'Books',
 'Boston',
 'Boyarin',
 'Brandeis University',
 'Breslau',
 'Britain',
 'Brooklyn',
 'California',
 'Cambridge',
 'Cambridge University',
 'Center',
 'Central',
 'Central Europe',
 'Cent

In [235]:
with open('entities_to_remove.pickle', 'wb') as file:
    pickle.dump(entities_to_remove, file)

In [236]:
def remove_entities(article):
    for entity in entities_to_remove:
        if ' '+entity+' ' in article:
            article = article.replace(entity+' ','') 
        elif ' '+entity+'.' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+',' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+':' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+'-' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+';' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+'"' in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+"'" in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+"]" in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+")" in article: # added later
            article = article.replace(' '+entity,'')
        elif ' '+entity+"?" in article:
            article = article.replace(' '+entity,'')
        elif ' '+entity+"!" in article: # added later
            article = article.replace(' '+entity,'')
        elif '"'+entity+' ' in article:
            article = article.replace(entity+' ','')
        elif "'"+entity+' ' in article:
            article = article.replace(entity+' ','')
        elif "["+entity+' ' in article:
            article = article.replace(entity+' ','')
        elif "("+entity+' ' in article: # added later
            article = article.replace(entity+' ','')
        elif "["+entity+']' in article:
            article = article.replace(entity,'')
        elif "("+entity+')' in article: # added later
            article = article.replace(entity,'')
        elif "'"+entity+"'" in article:
            article = article.replace(entity,'')
        elif '"'+entity+'"' in article:
            article = article.replace(entity,'')
    return(article)

In [237]:
reffinal_df['text_noent'] = [remove_entities(x) for x in reffinal_df['text']]

In [238]:
with open('raw_data_cleaned_named_ent_removed.pickle', 'wb') as file:
    pickle.dump(reffinal_df, file)

In [239]:
reffinal_df['tokenized_text'] = [word_tokenize(x) for x in reffinal_df['text_noent']]
# Remove punctuation
reffinal_df['tokenized_nopunc'] = [[word for word in x if word.isalpha()] for x in reffinal_df['tokenized_text']]
# Remove capitalization
reffinal_df['tokenized_nopunc_lower'] = [[word.lower() for word in x] for x in reffinal_df['tokenized_nopunc']]

# Alternative method, if we'd be interested in keeping numbers as well:
# import string
# exclude = set(string.punctuation) 
# punc_free = ''.join(ch for ch in stop_free if ch not in exclude)

In [240]:
# Check
reffinal_df.iloc[200]['tokenized_nopunc_lower']

['page',
 'sequence',
 'thomas',
 'l',
 'thompson',
 'early',
 'history',
 'of',
 'the',
 'israelite',
 'people',
 'from',
 'the',
 'written',
 'amp',
 'archaeological',
 'sources',
 'in',
 'the',
 'history',
 'of',
 'the',
 'near',
 'j',
 'brill',
 'xv',
 'pp',
 'this',
 'book',
 'represents',
 'dilettantism',
 'gone',
 'wild',
 'at',
 'best',
 'it',
 'could',
 'pass',
 'for',
 'a',
 'seminary',
 'middler',
 's',
 'term',
 'paper',
 'the',
 'text',
 'reveals',
 'a',
 'complete',
 'lack',
 'of',
 'skill',
 'in',
 'the',
 'proper',
 'handling',
 'of',
 'ancient',
 'sources',
 'both',
 'written',
 'and',
 'archaeological',
 'in',
 'the',
 'much',
 'of',
 'biblical',
 'scholarship',
 'wasted',
 'its',
 'time',
 'chasing',
 'its',
 'tail',
 'around',
 'the',
 'theory',
 'that',
 'the',
 'israelites',
 'were',
 'nothing',
 'but',
 'former',
 'canaanite',
 'peasants',
 'now',
 'in',
 'the',
 'the',
 'new',
 'fad',
 'is',
 'to',
 'explain',
 'away',
 'the',
 'monarchies',
 'of',
 'the',
 'tem

In [241]:
custom_stop_words = ['ab', 'al', 'alten', 'america', 'atlanta', 'au', 'av', 'avrov', 'b', 'ba', 'bauer', 'berlin', 'BOOK',
                    'boston', 'brill', 'brown', 'c', 'cad', 'cambridge', 'cf', 'ch', 'chap', 'chapter', 'charles',
                    'chicago', 'chs', 'cit', 'cite', 'claremont', 'college', 'craig', 'cum', 'd', 'dans', 'de', 'dennis',
                    'diese', 'dissertation', 'dm', 'dtr', 'ed', 'eds', 'eerdmans', 'ek', 'elisabeth', 'en', 'et',
                    'ev', 'ez', 'f', 'far', 'ff', 'fiir', 'g', 'gar', 'george', 'geschichte', 'gott', 'gottes',
                    'grand', 'h', 'ha', 'hall', 'hartford', 'hat', 'haven', 'henry', 'I', 'ia', 'ibid', 'io',
                    'isbn', 'iv', 'ivye', 'ix', 'jeremias', 'jesu', 'k', 'ka', 'kai', 'kal', 'kat', 'kee', 'ki', 'kim',
                    'kirche', 'klein', 'knox', 'l', 'la', 'le', 'leiden', 'leipzig', 'les', 'life', 'line', 'loc', 'louisville', 'm',
                    'ma', 'madison', 'marie', 'marshall', 'mohr', 'n', 'na', 'neuen', 'ni', 'nu', 'nur', 'o', 'ol',
                    'om', 'op', 'ov', 'ovadd', 'ovk', 'oxford', 'paper', 'pp', 'paulus', 'ph', 'philadelphia', 'point', 'post',
                    'pres', 'president', 'press', 'pro', 'prof', 'professor','quod', 'r', 'ra', 'rab', 'rapids', 'refer', 'review','REVIEWS'
                    'reviews', 'ro', 'robert', 'robinson', 'rov', 's', 'sa', 'schmidt', 'schriften', 'scott', 'sec',
                    'section', 'seiner', 'sheffield', 'siebeck', 'stanely', 'studien', 't', 'text', 'thee', 'theologie',
                    'they', 'thing', 'thou', 'thy', 'tiibingen','tion', 'tov', 'tr', 'tv', 'u', 'um', 'univ', 'University', 'unto', 'v',
                    'van', 'verse','view', 'vol', 'volume', 'vs', 'vss', 'vv', 'w', 'william', 'world' 'wunt',
                    'y', 'yap', 'ye', 'york', 'zeit','-PRON-', 'jews','jewish', 'judaism', 'page_sequence','page','book','text','doe', 
                    'books','publish','include','say','die','der','des','das','und','ha','ha-','new','ica','ceede', 'sequence', 
                     'ibn', 'ben','say','br','ts','aj','thing','iii','nx','va','pr','give','way','nn','im','ny','mn','rn','nm',
                    'ri','nl','gt']
    

In [242]:
nlp = spacy.load('en_core_web_sm')
nltk_stop = nlp.Defaults.stop_words
nltk_list=list(nltk_stop)
en_stop = sorted(list(nltk_list + custom_stop_words))
# en_stop=en_stop.extend (custom_stop_words)

    

In [243]:
reffinal_df['tokenized_nopunc_lower_nostop'] = [[word for word in x if not word in en_stop] for x in reffinal_df['tokenized_nopunc_lower']]

In [244]:
# Check
len(reffinal_df.iloc[500]['tokenized_nopunc_lower_nostop'])

524

In [245]:
extra_stop_words = [
    'big','small','low','high',
    'none',
    'may',
    'among',
    'within',
    'don','t',
    'day',
    'etc',
    'around',
    'frequent',
    'including',
    'even',
    'can',
    'likely',
    'will',
    'like',
    'today',
    'bit',
    'put',
    'aim',
    's',
    'got',
    'really',
    'huge',
    'see',
    'almost',
    'already',
    'much',
    'recent',   #
    'many',
    'change',    #
    'changes',       #
    'someone',
    'said',
    'says',
    'gives',
    'give',
#     'people',
    'new',
    'say',
    'least','first','last','second',
    'one','two',
    'go',
    'goes',
    'take',
    'going',
    'taking',
    'just',
    'can'
    'cannot',
    'keep',
    'keeps',
    'also',
    'done',
    'good',
    'get',
    'without',
    'told',
    'might',
    'time',
    'unable',  #
    'able',  #
    'know',
    'end',
    'now',
    'want',
    'didn',
    'back',
    'doesn',
    'couldn',
    'since',
    'shouldn',
    'seen',
    'works',
    'zero',
    'every',
    'each',
    'other',
    'ever',
    'neither',
    'll',
    'mr',
    'ms',
    'mrs',
    'think',
    'tomorrow',
    'way',
    'still',
    'know',
    'later',
    'fine',    #
    'let',
    'went',
    'night',
    've',
    'must',
    'act',  #
    're',
    'c','b', 'a',
    'done',
    'began',
    'ones',
    'm',
    'soon',
    'word',
    'along',
    'main',
    'q',
    'lot',
    'e', 'd',
    'entire',
    'year',
    'mean',
    'means',
    'important',
    'always',
    'something',
    'rather',
    'either',
    'makes',
    'make',
    'uses',
    'use',
    'enough',
    'w','d',
    'never',
    'giving',
    'o',
    'involve',
    'involes',
    'involving',
    'little',
    'inside',
    'sat',
    'third','fourth','fifth','sixth',
    'next',
    'given',
    'million','billion','millions','billions',
    'option',
    'options',
    'full',
    'complete',
    'need',
    'needs',
    'set',
    'manage',
    'sets',
    'manages',
    'bring','brings','brought',
    'try','tries','tried'
    'week',
    'former',
    'monday','tuesday','wednesday','thursday','friday','saturday','sunday',
    'spent','spend', 'spends',
    'month','months',
    'send','sends','sent',
    'went',
    'january','february','march','april','may','june','july','august','september','october','november','december',
    'allow',
    'process',
#     'old',
    'times',
    'nearly',
    'looking','looks','look',
    'thinly',
    'becoming',
    'stay','stays',
    'took','takes','take',
    'types', 'type',
    'thought', 'though',
    'idea',
    'clear','clearly',
    'behind',
    'half',
    'us',
    'less',
    'claim','claims',
    'long', 'short',
    'smaller','larger','bigger','largest','biggest','smallest','longer','shorter','short','long',
    'extreme','severe',
    'largely',
    'anymore',
    'years',
    'spoke',
    'give','gave','given','gives',
    'reportedly','supposedly','alledgedly',
    'please',
    'received','receive','receives',
    'longtime',
    'best',
    'existing',
    'putting','put','puts',
    
    'whose',
    'yesterday',
    
    
    
    'thing',   #added later
    'week',
    'another',
    'month',
    'day',
    'come']

In [246]:
reffinal_df['tokenized_nopunc_lower_nostop_extra'] = [[word for word in x if not word in extra_stop_words] for x in reffinal_df['tokenized_nopunc_lower_nostop']]

In [247]:
reffinal_df['tokenized_nopunc_lower_nostop_extra_lemmatized'] = [[lemma.lemmatize(word) for word in x] for x in reffinal_df['tokenized_nopunc_lower_nostop_extra']]

In [248]:
with open('clean_data_full.pickle', 'wb') as file:
    pickle.dump(reffinal_df, file)

    # Drop interim colums
raw_small = pd.DataFrame(reffinal_df['tokenized_nopunc_lower_nostop_extra_lemmatized'])
raw_small.rename(columns={'tokenized_nopunc_lower_nostop_extra_lemmatized':'article_text'},inplace=True)

with open('clean_data_small.pickle', 'wb') as file:
    pickle.dump(raw_small, file)


In [252]:
research_article_interim=pd.DataFrame(reffinal_df[reffinal_df.type.eq('research-article')])
book_review_interim=pd.DataFrame(reffinal_df[reffinal_df.type.eq('book-review')])
research_article_small = pd.DataFrame(research_article_interim['tokenized_nopunc_lower_nostop_extra_lemmatized'])
research_article_small.rename(columns={'tokenized_nopunc_lower_nostop_extra_lemmatized':'article_text'},inplace=True)
book_review_small = pd.DataFrame(book_review_interim['tokenized_nopunc_lower_nostop_extra_lemmatized'])
book_review_small.rename(columns={'tokenized_nopunc_lower_nostop_extra_lemmatized':'article_text'},inplace=True)
citations_small=pd.DataFrame(reffinal_df.filter(['id','citations'],axis=1))

with open('clean_data_research_small.pickle', 'wb') as file:
    pickle.dump(research_article_small, file)

with open('clean_data_book_small.pickle', 'wb') as file:
    pickle.dump(book_review_small, file)

with open('clean_data_citations_small.pickle', 'wb') as file:
    pickle.dump(citations_small, file)

In [257]:
citations_small.to_csv('citations.csv')