Start with some demo cells for showing how to do markup and then change code (for those only registered in one workshop). Use the python example to explain python libraries. Then we will get our import statements out of the way. 

In [None]:
# some standard libraries
import os,re,sys
from datetime import datetime

# libraries for data mining and text analysis
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
import plotly.express as px

# whoosh libraries - for creating and searching indexes
from whoosh import scoring
from whoosh.fields import Schema, DATETIME, ID, TEXT 
import whoosh.highlight as highlight
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser

print("=> libraries loaded and ready to go...")

NLTK will download some building blocks; this could take a few minutes.

In [None]:
# Fetch the NLTK resources used later on: the stopword lists (several
# languages) and the VADER sentiment lexicon.
#
# Some jupyter environments need a custom data path; in that case use:
#   nltk_path = os.sep + "util" + os.sep + "odw" + os.sep + "nltk_data"
#   nltk.data.path.append(nltk_path)
#   nltk.download(<resource>, download_dir=nltk_path)
for nltk_resource in ('stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)
print("=> downloading complete...")

now some config options

In [None]:
"""
These two values align the newspaper title and OCR directory (leave these alone for workshop since we are using one title)
"""
# news_title is prepended to every page title when pages are added to the index
news_title = "The Amherstburg Echo" # newspaper title for indexing
# news_code is the root directory that is listed to find the folders of OCR text
news_code = "echo" # used for location of OCR text
print("=> newspaper title and OCR directory set...")

Values needed for relative frequencies.

In [None]:
"""
These values (on the other hand) are meant to be tinkered with.
"""

# words whose relative frequencies are added together for each page;
# keep them lower-case because the page text is lower-cased before counting
news_topics = ["influenza","flu"]
#news_topics = ["pepsi"]
# only the digit runs are read from this value: the first is the start year
# and the second is the end year, both compared against the folder names
news_range = "[1917 to 1919]" # we use whoosh layout for date range
print("=> relative frequency configuration set...")

With kudos to [jcoliver's](https://github.com/jcoliver) [Collections as Data](https://github.com/OurDigitalWorld/data_samples/blob/5651ca21a5a3b5186a2cbabdaed844dae1486818//repository) site. This is a slight variation on one of the exercises in the [Introduction to text mining](https://github.com/jcoliver/dig-coll-borderlands/blob/main/Text-Mining-Short.ipynb) notebook.

In [None]:
# Chart how often the news_topics words appear, relative to all words on a
# page, for every page whose year folder falls inside news_range.
dates = []

# news_range follows the whoosh layout, e.g. "[1917 to 1919]"; only the digit
# runs matter here. (Named year_bounds so the builtin range() stays usable.)
year_bounds = [int(year) for year in re.findall(r'\d+', news_range)]
if not year_bounds:
    raise ValueError("news_range must contain at least one year: " + repr(news_range))
# a single year such as "1918" is treated as a one-year range
first_year, last_year = year_bounds[0], year_bounds[-1]

# the tokenizer is identical for every page, so build it once
tokenizer = RegexpTokenizer(r'\w+')

for folder in sorted(os.listdir(news_code)):
    # folders are named by year; skip anything else (e.g. hidden files)
    if not folder.isdecimal():
        continue
    if not first_year <= int(folder) <= last_year:
        continue
    folder_path = os.path.join(news_code, folder)
    for file_name in sorted(os.listdir(folder_path)):
        # the context manager closes the file even if reading fails
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf8') as fp:
            text = fp.read()
        # \w+ tokenizing already ignores whitespace, so no clean-up is needed first
        word_list = tokenizer.tokenize(text.lower())
        word_table = pd.Series(word_list, dtype='string')
        # Calculate relative frequencies of all words on the page
        word_freqs = word_table.value_counts(normalize=True)
        # Pull out only values that match words of interest
        my_freqs = word_freqs.filter(items=news_topics)
        # Get the total frequency for words of interest
        total_my_freq = my_freqs.sum()
        # file names start with the date (YYYY-MM-DD), so take it from the
        # name itself rather than from an offset into the full path
        dates.append([file_name[:10], total_my_freq])

# add those dates to a data frame
results_table = pd.DataFrame(dates, columns=['Date', 'Frequency'])
# Analyses are all done, plot the figure
my_figure = px.line(results_table, x='Date', y='Frequency').update_layout(yaxis_title="Relative Freq.")
print("=> pages examined:", len(dates))
my_figure.show()

In [None]:
""" Configuration options for Sentiment Analysis """

# index_dir is created by createSearchableData() if missing and opened for searching
index_dir = "echo_whoosh" # directory for index
#news_query = "wom?n OR female*" # follow whoosh conventions, e.g "wom?n OR female*"
# news_query is parsed against the "content" field of the index
news_query = "road*"
# index_range is appended to "pubdate:" and used as a filter on the search
index_range = "[19170601 to 19190601]" # follow whoosh conventions, e.g "1975", "[1970 to 1980]", "[19000101 to 19000430]"
# snippet_limit is passed to the search as its result limit
snippet_limit = 200 # limit for number of snippets to work with
print("=> configuration set...")

In [None]:
"""
Classes and functions are here in one place.
"""

class MinimalFormatter(highlight.Formatter):
    """Highlight formatter that returns matched terms without any markup."""

    def format_token(self, text, token, replace=False):
        """Return the text for ``token`` exactly as ``highlight.get_text`` gives it.

        The arguments are passed straight through to ``highlight.get_text``.
        """
        # a fancier formatter could decorate the match instead, e.g.
        #     return "[%s]" % highlight.get_text(text, token, replace)
        return highlight.get_text(text, token, replace)

def createSearchableData(root,indexdir):
    """Build a whoosh index of the OCR text under ``root``, unless one exists.

    Nothing is indexed when ``indexdir`` already exists. Otherwise the
    directory is created and every file in every sub-folder of ``root`` is
    added as one document. File names are expected to look like
    ``YYYY-MM-DD-<page number>.txt``.

    Args:
        root: directory holding one sub-folder of text files per group of pages.
        indexdir: directory in which the whoosh index is (or will be) stored.

    Raises:
        ValueError: if a file name does not start with a ``YYYY-MM-DD`` date
            followed by a separator and an integer page number.
    """

    # Note that we need content to be stored for highlighting to work
    schema = Schema(title=TEXT(stored=True),
              path=ID(stored=True),
              content=TEXT(stored=True),
              pubdate=DATETIME(stored=True))

    # this is how a whoosh index can be created
    # ideally, this would be done outside of the notebook
    # for a large set
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)

        # Creating an index writer to add documents
        ix = create_in(indexdir,schema)
        writer = ix.writer()

        try:
            # Assume file text is local
            for folder_name in sorted(os.listdir(root)):
                folder_path = os.path.join(root,folder_name)
                # stray files next to the folders cannot be listed, skip them
                if not os.path.isdir(folder_path):
                    continue
                print(folder_path)
                for file_name in sorted(os.listdir(folder_path)):
                    file_path = os.path.join(folder_path,file_name)
                    # the page id is the file name without its .txt extension
                    page_id = os.path.basename(file_path)
                    if page_id.endswith(".txt"):
                        page_id = page_id[:-len(".txt")]
                    # first 10 characters are the date, the rest (after one
                    # separator character) is the page number
                    date_object = datetime.strptime(page_id[:10],"%Y-%m-%d")
                    page_num = int(page_id[11:])
                    ntitle = date_object.strftime("%B %d, %Y") + "- pg. " + str(page_num)
                    # the context manager closes the file even if reading fails
                    with open(file_path,'r', encoding='utf8') as fp:
                        text = fp.read()
                    # collapse all runs of whitespace into single spaces
                    text = " ".join(text.split())
                    writer.add_document(title = news_title + ". " + ntitle,
                        path=page_id, content=text, pubdate = date_object)
        except BaseException:
            # release the index lock (also when the notebook is interrupted)
            # instead of leaving a half-written, locked index behind
            writer.cancel()
            raise

        print("commiting...") # this can be the slowest step
        writer.commit()
    if os.path.exists(indexdir):
        print("=> index directory exists...")
        
print("=> classes & functions in place...")

An index only has to be created once (if the data has not changed). This next cell can be skipped if the index is already there.

In [None]:
createSearchableData(news_code,index_dir) # this will index the OCR text

Now use the index for getting highlights. The highlights will be our snippets. Whoosh has the plumbing for something far more elaborate but keep it simple for now.

In [None]:
# Search the index and collect one cleaned-up highlight ("snippet") per hit.
# the index directory contains the index
ix = open_dir(index_dir)

qp = QueryParser("content", schema=ix.schema)
q = qp.parse(news_query)
# the date range is applied as a filter on the main query; other examples:
#   qp.parse(u"pubdate:19510104")
#   qp.parse(u"pubdate:[19700101 to 19801231]")
allow_q = qp.parse("pubdate:" + index_range)

snippets = []
with ix.searcher() as s:
    results = s.search(q,filter=allow_q,limit=snippet_limit)
    # keep each fragment short: at most 50 characters (whoosh defaults to 200)
    results.fragmenter.maxchars = 50

    # and only 5 characters of context around a match (whoosh defaults to 20)
    results.fragmenter.surround = 5

    # using the class above, so the matched terms carry no markup
    results.formatter = MinimalFormatter()

    for hit in results:
        # clean up the spaces in the result
        snippet = " ".join(hit.highlights("content").split())
        # the first 4 characters of the page id are the year
        snippets.append([snippet,hit["path"][:4],hit["path"]])
# count the list itself: a loop index cannot tell "no hits" from "one hit"
print("=> # of snippets gathered: ", len(snippets))

At this point, the snippets/highlights are collected. Now we hand over the results to the powerful [pandas](https://pandas.pydata.org/) library.

In [None]:
# load the snippets into a data frame; row_id is a 1-based key for each snippet
df = pd.DataFrame(snippets, columns=['snippet', 'year', 'page'])
df["row_id"] = df.index + 1

# replace every character that is not a letter or '#' with a space,
# then convert what is left to lower-case
df['snippet'] = (
    df['snippet']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .str.casefold()
)

print("=> the handover from whoosh to pandas is complete...")

The following is based on the redgate tutorial at [Sentiment Analysis with Python](https://www.red-gate.com/simple-talk/development/data-science-development/sentiment-analysis-python/). This is for illustrative purposes, [VADER](https://github.com/cjhutto/vaderSentiment) may not be the right tool for historical text.

In [None]:
# Score every snippet with VADER and attach the compound score to its row.
sid = SentimentIntensityAnalyzer()

# one [row_id, sentiment_type, sentiment_score] entry for each score VADER
# returns per snippet; columns are read by label rather than by position,
# and no placeholder row is needed because the frame is built in one go
tmp = [
    [row_id, key, value]
    for row_id, snippet in zip(df['row_id'], df['snippet'])
    for key, value in sid.polarity_scores(snippet).items()
]
t_df = pd.DataFrame(tmp, columns=['row_id','sentiment_type','sentiment_score'])

# only keep rows where sentiment_type = compound, without duplicates
t_df_cleaned = t_df[t_df.sentiment_type == 'compound'].drop_duplicates()
# merge dataframes
df_output = pd.merge(df, t_df_cleaned, on='row_id', how='inner')

# take a look at first few entries for negative scores
df_belowzero = df_output[df_output.sentiment_score < 0.0]
print("len negative",len(df_belowzero))
print(df_belowzero.head())

# then the positive scores
df_abovezero = df_output[df_output.sentiment_score > 0.0]
print("len positive",len(df_abovezero))
print("total",len(df_output))
print(df_abovezero.head())

# and the snippets VADER scored as neutral
df_zero = df_output[df_output.sentiment_score == 0.0]
print("len zero",len(df_zero))

See the link above for an explanation.

Get some details on the scoring.

In [None]:
df_output[["sentiment_score"]].describe()

The graphing is a little misdirected in my sample, it would make more sense to compare time periods around an event, for example.

In [None]:
# generate mean of sentiment_score by snippet
dfg = df_output.groupby(['year'])['sentiment_score'].mean()
# create a bar plot

dfg.plot(kind='bar', title='Sentiment Score', ylabel='Mean Sentiment Score',
         xlabel='Year', figsize=(6, 5))

And that's it!