# DfR Topic Modeling Notebook

This notebook builds topic models from JSTOR's "Data for Research" (DfR) data. It takes two inputs. The first is a spreadsheet (CSV) that lists, for each article, its type (research-article or book-review, referred to as the "atype" below), the id used to locate its OCR file, its year, and its language. The second is a plain text file of custom stop words, one per line.

In [None]:
import os
import re
import collections
import spacy
import json
from pathlib import Path
import pandas as pd
from gensim import corpora, models, similarities
from pprint import pprint

In [None]:
# Directory layout.  Every path is derived from the current working directory, so start the notebook from
# (or chdir into) the folder it lives in.  Note that base_dir is the PARENT of the working directory.
file = os.getcwd()

file_path = Path(file).resolve()      # absolute path of the working directory
base_dir = file_path.parents[0]       # its parent directory
data_dir = base_dir                   # root under which all data is read and written

# Input: the JSTOR download and the OCR text files inside it.
jstor_dir = data_dir / 'jstor_data'
ocr_files = jstor_dir / 'ocr'

# Output: corpus files and model files both go directly into data_dir.
corpus_dir = model_dir = data_dir

# String versions of the output paths, used where file names are built by string concatenation.
corpus_dir_str = str(corpus_dir)
model_dir_str = str(model_dir)

In [None]:
# Load the CSV index of the articles and show its first rows as a sanity check.  Replace the file name below
# with the name of your own reference file.  Later cells read its 'type', 'lang', 'year' and 'id' columns.
reference_csv = data_dir / "REFERENCE FILE NAME.csv"
reference_df = pd.read_csv(reference_csv)

reference_df.head()

In [None]:
# Many journals contain abbreviations that are significant for the topics.  The abbreviations should be custom set,
# as a list of (abbreviation, expansion) tuples.  This particular one expands the abbreviations for biblical books
# found in the Journal of Biblical Literature.  Entries are applied in order, each as a whole-word replacement.

books_abbrs = [('gen', 'genesis'),('exod', 'exodus'),('ex', 'exodus'),('lev', 'leviticus'),('num', 'numbers'),
               ('deut', 'deuteronomy'),('josh', 'joshua'),('judg', 'judges'), ('jud', 'judges'),('sam', 'samuel'),('kgs', 'kings'),
               ('chr', 'chronicles'),('neh', 'nehemiah'),('esth', 'esther'),('ps', 'psalms'),('pss', 'psalms'),
               ('prov', 'proverbs'),('eccl', 'ecclesiastes'),('qoh', 'qoheleth'), ('isa', 'isaiah'),
               ('jer', 'jeremiah'),('lam', 'lamentations'),('ezek', 'ezekiel'),('hos', 'hosea'),('obad', 'obadiah'),
               ('mic', 'micah'),('nah', 'nahum'),('hab', 'habakkuk'),('zeph', 'zephaniah'),('hag', 'haggai'),
               ('zech', 'zechariah'),('mal', 'malachi'),('matt', 'matthew'),('mk', 'mark'),('lk', 'luke'),
               ('jn', 'john'),('rom', 'romans'),('cor', 'corinthians'),('gal', 'galatians'),('eph', 'ephesians'),
               ('phil', 'philippians'),('col', 'colossians'),('thess', 'thessalonians'),('tim','timothy'),
               ('phlm', 'philemon'),('heb', 'hebrews'),('jas', 'james'),('pet', 'peter'),('rev', 'revelation'),
               ('tob', 'tobit'),('jdt', 'judith'), ('wis', 'wisdom of solomon'),('sir', 'sirach'), ('bar', 'baruch'),
               ('macc', 'maccabees'), ('esd', 'esdras'), ('tg', 'targum')]

In [None]:
# Stop words will be filtered from the results.  These stopwords come from three places: The NLP package; any custom file
# that you might have; and anything that you want to put in this list.
# Every entry must be followed by a comma: adjacent string literals without one are silently joined into a single
# word (e.g. 'world' 'wunt' becomes 'worldwunt'), so neither word would be filtered.

custom_stop_words = ['ab', 'al', 'alten', 'america', 'atlanta', 'au', 'av', 'avrov', 'b', 'ba', 'bauer', 'berlin', 'BOOK',
                    'boston', 'brill', 'brown', 'c', 'cad', 'cambridge', 'cf', 'ch', 'chap', 'chapter', 'charles',
                    'chicago', 'chs', 'cit', 'cite', 'claremont', 'college', 'craig', 'cum', 'd', 'dans', 'de', 'dennis',
                    'diese', 'dissertation', 'dm', 'dtr', 'ed', 'eds', 'eerdmans', 'ek', 'elisabeth', 'en', 'et',
                    'ev', 'ez', 'f', 'far', 'ff', 'fiir', 'g', 'gar', 'george', 'geschichte', 'gott', 'gottes',
                    'grand', 'h', 'ha', 'hall', 'hartford', 'hat', 'haven', 'henry', 'I', 'ia', 'ibid', 'io',
                    'isbn', 'iv', 'ivye', 'ix', 'jeremias', 'jesu', 'k', 'ka', 'kai', 'kal', 'kat', 'kee', 'ki', 'kim',
                    'kirche', 'klein', 'knox', 'l', 'la', 'le', 'leiden', 'leipzig', 'les', 'life', 'line', 'loc', 'louisville', 'm',
                    'ma', 'madison', 'marie', 'marshall', 'mohr', 'n', 'na', 'neuen', 'ni', 'nu', 'nur', 'o', 'ol',
                    'om', 'op', 'ov', 'ovadd', 'ovk', 'oxford', 'paper', 'pp', 'paulus', 'ph', 'philadelphia', 'point', 'post',
                    'pres', 'president', 'press', 'pro', 'prof', 'professor','quod', 'r', 'ra', 'rab', 'rapids', 'refer', 'review','REVIEWS',
                    'reviews', 'ro', 'robert', 'robinson', 'rov', 's', 'sa', 'schmidt', 'schriften', 'scott', 'sec',
                    'section', 'seiner', 'sheffield', 'siebeck', 'stanely', 'studien', 't', 'text', 'thee', 'theologie',
                    'they', 'thing', 'thou', 'thy', 'tiibingen','tion', 'tov', 'tr', 'tv', 'u', 'um', 'univ', 'University', 'unto', 'v',
                    'van', 'verse','view', 'vol', 'volume', 'vs', 'vss', 'vv', 'w', 'william', 'world', 'wunt',
                    'y', 'yap', 'ye', 'york', 'zeit','-PRON-']

In [None]:
# Loads all the stop words together (spaCy defaults, the custom list, and the optional custom stop-word file),
# and prints out their number just to check.

nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words  # same set object as spaCy's defaults, so the updates below extend it in place
stop_words.update(custom_stop_words)

# Read the custom stop-word file: one word per line, blank lines ignored.  Passing the file NAME to update()
# would only add the individual characters of that name, so the file has to be opened and read.
try:
    with open('custom_stopwords.txt', encoding='utf8') as stop_file:
        file_stop_words = [line.strip() for line in stop_file if line.strip()]
except FileNotFoundError:
    # The custom file is optional; carry on with the other two sources.
    file_stop_words = []
stop_words.update(file_stop_words)

stop_words=set(stop_words)  # independent copy used by the filtering functions
len(stop_words)

In [None]:
# Creates a list of ids from the reference dataframe that meet the criteria for type (book-review
# or research-article), language, and the year range. These ids are then used to access the ocr files

def id_list(atype, years, ltype, reference=None):
    """Return the OCR file names matching a type, a language and each year range.

    Args:
        atype: value that the 'type' column must equal (e.g. 'research-article').
        years: iterable of year ranges; each range is itself an iterable of years.
        ltype: value that the 'lang' column must equal (e.g. 'eng').
        reference: dataframe with 'type', 'lang', 'year' and 'id' columns.
            Defaults to the module-level ``reference_df``.

    Returns:
        A list with one entry per year range, in the same order as ``years``.
        Each entry is the list of '<id>.txt' file names for that range only,
        ordered by year and then by row order.  An empty range yields an
        empty list.
    """
    if reference is None:
        reference = reference_df

    # Both criteria must hold at once; filtering twice from the full frame
    # would keep only the second filter.
    matches = reference[(reference['type'] == atype) & (reference['lang'] == ltype)]

    file_id_lst_final = []
    for year_lst in years:
        # Start afresh for every range so ids from earlier ranges do not leak in.
        range_ids = []
        for year in year_lst:
            range_ids.extend(matches.loc[matches['year'] == year, 'id'].tolist())
        file_id_lst_final.append([f'{item}.txt' for item in range_ids])

    return file_id_lst_final
    

In [None]:
# These functions process the ocr texts with the NLP tools

def substitute(list_tuples, string):
    """Apply every (pattern, replacement) pair in ``list_tuples`` to ``string``.

    Each pattern is matched as a whole word (wrapped in ``\\b`` anchors) and the
    pairs are applied one after another, in list order, to the running result.
    Returns the resulting string.
    """
    expanded = string
    for pair in list_tuples:
        whole_word = re.compile(r'\b' + pair[0] + r'\b')
        expanded = whole_word.sub(pair[1], expanded)
    return expanded

def get_lemmas(doc):
    """Return the lemmas of the alphabetic tokens in ``doc``, minus stop words.

    Tokens are kept when ``is_alpha`` is true and their lemma is not in
    ``stop_words``; each surviving lemma is then passed through ``substitute``
    with ``books_abbrs`` so that abbreviations are expanded.
    """
    kept = []
    for token in doc:
        if not token.is_alpha:
            continue
        lemma = token.lemma_
        if lemma in stop_words:
            continue
        # Abbreviations are expanded only after the stop-word check.
        kept.append(substitute(books_abbrs, lemma))
    return kept

def get_noun_lemmas(doc):
    """Return the lemmas of the noun tokens in ``doc``, minus stop words.

    A token counts as a noun when its ``tag_`` is 'NN', 'NNP' or 'NNS'.  As in
    ``get_lemmas``, only alphabetic tokens whose lemma is not in ``stop_words``
    are kept, and each kept lemma is passed through ``substitute`` with
    ``books_abbrs``.
    """
    noun_tags = ('NN', 'NNP', 'NNS')
    kept = []
    for token in doc:
        if token.tag_ not in noun_tags or not token.is_alpha:
            continue
        lemma = token.lemma_
        if lemma not in stop_words:
            kept.append(substitute(books_abbrs, lemma))
    return kept
    
def process_text(text):
    """Run ``nlp`` on ``text`` and return the pair ``(lemmas, noun_lemmas)``."""
    parsed = nlp(text)
    return get_lemmas(parsed), get_noun_lemmas(parsed)

In [None]:
# Builds the label that makes the output file names of each year range unique
def string_id(dates):
    """Return '<first>-<last>' built from the first and last items of ``dates``.

    ``dates`` may be any iterable (e.g. a ``range``); it is materialised as a
    tuple first, so for ``range(1984, 1985)`` the result is '1984-1984'.
    """
    span = tuple(dates)
    endpoints = (str(span[0]), str(span[-1]))
    return '-'.join(endpoints)

In [None]:
# This is the primary function that creates the dictionary and corpus files and writes out the model files.

def _dump_docs(docs, name):
    """Write the tokenized documents to ``corpus_dir / name`` as JSON."""
    with open(corpus_dir / name, mode='w', encoding='utf8') as outfile:
        json.dump(docs, outfile)


def _build_corpus(docs, prefix, years):
    """Create, save and return the gensim dictionary and bag-of-words corpus for ``docs``.

    The dictionary is saved as '<prefix>_<years>.dict' and the corpus as
    '<prefix>_<years>.mm', both inside ``corpus_dir_str``.
    """
    dictionary = corpora.Dictionary(docs)
    # Optional pruning of very rare / very common terms, e.g.:
    # dictionary.filter_extremes(no_below=2, no_above=0.7)
    dictionary.save(corpus_dir_str + '/' + prefix + '_' + years + '.dict')

    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize(corpus_dir_str + '/' + prefix + '_' + years + '.mm', bow_corpus)
    return dictionary, bow_corpus


def prep_models(year_range_lst, years, num_topics=25, passes=100):
    """Process the OCR files of one year range and train the noun and general LDA models.

    Args:
        year_range_lst: file names (relative to ``ocr_files``) of the OCR texts to process.
        years: string label of the year range; it is embedded in every output file name.
        num_topics: number of topics for both LDA models (default 25).
        passes: number of training passes for both LDA models (default 100).

    Returns:
        The tuple ``(noun_model, noun_corpus, noun_dictionary, noun_docs,
        general_model, general_corpus, general_dictionary, general_docs)``.

    Raises:
        ValueError: if ``year_range_lst`` is empty, since there is nothing to model.

    Side effects:
        Writes 'general_docs' and 'noun_docs' (JSON) plus the '.dict' and '.mm'
        files into the corpus directory, and the two '.model' files into the
        model directory.
    """
    if not year_range_lst:
        raise ValueError('no OCR files to process for year range ' + years)

    # One token list per document.  The lists are created once, before the loop,
    # so that every document is kept rather than only the last one.
    general_docs = []
    noun_docs = []
    for item in year_range_lst:
        with open(ocr_files / item, mode='r', encoding='utf8') as f:
            text = f.read()
        lemmas, nouns = process_text(text)
        general_docs.append(lemmas)
        noun_docs.append(nouns)

    # Keep a JSON copy of the tokenized documents on disk, written once when all are processed.
    _dump_docs(general_docs, 'general_docs')
    _dump_docs(noun_docs, 'noun_docs')

    # The in-memory token lists are already in the form gensim needs (a list of
    # token lists), so they are used directly rather than re-reading the JSON text.
    general_dictionary, corpus = _build_corpus(general_docs, 'general_corpus', years)
    noun_dictionary, ncorpus = _build_corpus(noun_docs, 'noun_corpus', years)

    # Run the topic models and save them.  random_state is fixed so that runs are repeatable.
    model_suffix = '_' + str(num_topics) + '_' + years + '.model'

    nlda = models.LdaModel(ncorpus, id2word=noun_dictionary, num_topics=num_topics, passes=passes, random_state=42)
    nlda.save(model_dir_str + '/noun' + model_suffix)

    glda = models.LdaModel(corpus, id2word=general_dictionary, num_topics=num_topics, passes=passes, random_state=42)
    glda.save(model_dir_str + '/general' + model_suffix)

    return nlda, ncorpus, noun_dictionary, noun_docs, glda, corpus, general_dictionary, general_docs
    

In [None]:
# The output files are written to corpus_dir and model_dir (set in the file-structure cell).  It is a good idea to
# make sure that those directories exist before you run this step, and to clear out old output files (not strictly
# necessary, but better for the next step).



# These next three statements should be modified for the query.  atype can be 'book-review' or 'research-article'.  years is a list of range
# objects, with the beginning and ending years (N.B.: results will not include the last year) separated by a comma, in
# parentheses. Language will usually be "eng" for English
atype = 'research-article'
years = [range(1984, 1985),
        # range(1992,1995),
        # range(1991,1996),
        # range(1996,2001),
        # range(2001,2006),
        # range(2006,2011),
        # range(2011,2015),
        ]
ltype = 'eng'


# Calls the function to generate the lists of ids.  This comes back as a list of lists (one for each date range), and within
# each one are the file ids needed to access the relevant ocr files
final_lst_id = id_list(atype, years, ltype)

# Passes each list in final_lst_id to a function that (1) creates and cleans the files needed for the topic modeling;
# and (2) runs a topic model; and (3) saves the results of the model in a unique file.
# final_lst_id and years are parallel lists, so they are walked together.  The variables assigned in the loop are
# overwritten on every pass: after the loop they hold the results of the LAST year range only.
for year_range_lst, year_range in zip(final_lst_id, years):
    yr_range = string_id(year_range)
    (noun_model, ncorpus, noun_dictionary, noun_docs,
     general_model, corpus, general_dictionary, general_docs) = prep_models(year_range_lst, yr_range)

In [None]:
# Pretty-prints the topics of the general model

topic_summaries = general_model.print_topics()
pprint(topic_summaries)

In [None]:
# Maps each document's position in the corpus to the topics the general model assigns to it.
doc_compositions = {
    doc_index: general_model.get_document_topics(bow)
    for doc_index, bow in enumerate(corpus)
}
# doc_compositions

In [None]:
# NOTE(review): this looks like an unfinished expression -- other cells call general_dictionary.doc2bow and
# general_dictionary.doc2idx; confirm which one was intended here.
general_dictionary.doc

In [None]:
# Reads the raw OCR text of the first file in the current id list (the loop stops after one file).
for item in year_range_lst:
    text = (ocr_files / item).read_text(encoding='utf8')
    break

# general_dictionary.doc2bow(text.split(' ')[0])
# Looks up every space-separated token of the raw text in the general dictionary.
raw_tokens = text.split(' ')
general_dictionary.doc2idx(raw_tokens)