## Setup:

To set up the lookup page, run the cell below.

In [99]:
# Signal that the setup cell has started; 'done' is printed at the end of the
# cell once the metadata index has been loaded.
print('Loading...')

# Imports
import os, zipfile, pathlib, shutil
import pandas as pd
import regex as re
from IPython.display import clear_output
from IPython.display import IFrame
import zipp

# Definitions

def empty(folder):
    """Delete everything inside *folder* while keeping the folder itself.

    Regular files and symbolic links are unlinked; real sub-directories are
    removed recursively. A failure on one entry is reported on stdout and
    does not stop the remaining entries from being processed.
    """
    for entry_name in os.listdir(folder):
        target = os.path.join(folder, entry_name)
        try:
            # A symlink (even one pointing at a directory) is unlinked rather
            # than followed, so only genuine directories are removed as trees.
            if os.path.isdir(target) and not os.path.islink(target):
                shutil.rmtree(target)
            elif os.path.isfile(target) or os.path.islink(target):
                os.unlink(target)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (target, e))

def sanitize(request_list, known_articles=None):
    """Return the requested article IDs that are recognised, in request order.

    Each entry is converted to ``str`` and stripped of surrounding
    whitespace. Entries that are empty after stripping are skipped silently;
    any other unrecognised entry is reported on stdout.

    Parameters
    ----------
    request_list : iterable
        Requested article IDs (any type; each is passed through ``str``).
    known_articles : container, optional
        Collection of valid article IDs to test membership against. When
        omitted, the module-level ``all_articles`` set is used, which keeps
        existing one-argument calls working unchanged.

    Returns
    -------
    list of str
        The cleaned IDs that were found in ``known_articles``.
    """
    if known_articles is None:
        # Fall back to the index built by the setup cell.
        known_articles = all_articles

    wanted_articles = []
    unrecognized_articles = []

    for article in request_list:
        clean_art = str(article).strip()
        if clean_art in known_articles:
            wanted_articles.append(clean_art)
        elif clean_art:
            unrecognized_articles.append(clean_art)
        # Blank entries (e.g. produced by splitting "a, b") are ignored.

    if unrecognized_articles:
        print("The following entries were not recognized as articles:")
        for entry in unrecognized_articles:
            print(repr(entry))
    return wanted_articles

def is_list(request_list):
    """Report whether *request_list* is a list.

    Returns ``True`` for a ``list`` (or a subclass of it). For anything else
    a hint is printed on stdout and ``False`` is returned.

    The check uses ``isinstance`` rather than ``assert`` so that it still
    works when Python runs with assertions disabled, and the hint is
    actually printed rather than being a bare string expression.
    """
    if isinstance(request_list, list):
        return True
    print("Please make sure your input is a list and try again.")
    return False
    
def request_input():
    """Prompt for article IDs and return the raw entries as a list of strings.

    The reply is split on commas, semicolons and single spaces, so adjacent
    separators (such as ", ") yield empty strings in the result. The cell
    output is cleared once the reply has been read.
    """
    raw_reply = input("What article IDs shall I look up for you?")
    clear_output()
    return re.split(",|;| ", raw_reply)

def unpack_pdfs(request_list, metadata):
    """Extract the PDFs of the requested articles into the ``temp`` folder.

    Parameters
    ----------
    request_list : list
        Index labels of ``metadata`` identifying the wanted articles.
    metadata : pandas.DataFrame
        Table with a ``pdf_zip`` column (archive name inside ``PDF/``) and a
        ``pdf_file`` column (member name inside that archive).

    Each archive is opened once and all of its requested members are
    extracted before moving on to the next archive.
    """
    print("Unpacking requested pdfs...")

    requested = metadata.loc[request_list]

    for archive_name in requested.pdf_zip.unique().tolist():
        # Members of this archive that belong to the requested articles.
        in_this_archive = requested['pdf_zip'] == archive_name
        members = requested[in_this_archive].pdf_file.unique().tolist()

        archive_path = os.path.join('PDF', archive_name)
        with zipfile.ZipFile(archive_path) as archive:
            for member in members:
                archive.extract(member, path="temp")

    print("Done!")

def display_article(metadata, article):
    """Show one article's PDF and metadata, then collect the reviewer's input.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table indexed by article ID with the columns ``pdf_file``,
        ``pub_date``, ``objecttypes``, ``txt_zip`` and ``txt_file``.
    article : hashable
        Index label of the article to display.

    The PDF is expected to have been extracted into ``temp/`` already. After
    the article is shown, the optional module-level ``linked_function`` is
    called with the article ID; if no such function has been defined, a
    "Press enter to continue" prompt is shown instead. Errors raised inside
    ``linked_function`` (and ``KeyboardInterrupt``) are allowed to propagate
    rather than being silently turned into the fallback prompt. The cell
    output is cleared before returning.

    NOTE(review): ``display`` is not imported in this file; it is presumably
    supplied by the notebook environment -- confirm.
    """
    pdf_file = os.path.join("temp", metadata.at[article, 'pdf_file'])

    display(IFrame(src=pdf_file, width='100%', height='700px'))

    pub_date = metadata.at[article, 'pub_date']
    objecttypes = metadata.at[article, 'objecttypes']
    # A missing value is not a string and has no .split(); show nothing then.
    objecttypes = objecttypes.split(';') if isinstance(objecttypes, str) else []

    txt_zip = os.path.join('TXT', metadata.at[article, 'txt_zip'])
    txt_file = metadata.at[article, 'txt_file']

    # The plain text is loaded for the optional print further down.
    with zipfile.ZipFile(txt_zip) as zf:
        with zf.open(txt_file) as f:
            text = f.read()

    print(f'Article ID: {article} \t Published: {pub_date}')
    print('Object Types:\t', ', '.join(objecttypes))

    # Uncomment to show the plain text as well:
    #     print('Article text:', '\n\n', text, '\n')

    # Only the *lookup* of the hook is guarded: an undefined hook selects the
    # fallback prompt, while failures inside a defined hook stay visible.
    try:
        callback = linked_function
    except NameError:
        callback = None

    if callback is None:
        input('Press enter to continue...')
    else:
        callback(article)

    clear_output()
    
def display_requested_articles(wanted_articles, metadata):
    """Display each requested article in turn, then clear the ``temp`` folder.

    Parameters
    ----------
    wanted_articles : list
        Article IDs to show, in display order.
    metadata : pandas.DataFrame
        Metadata table passed through to ``display_article``.

    A running "article i of N" header is printed before each article. The
    ``temp`` folder is emptied only after the whole list has been shown.
    """
    for position, article in enumerate(wanted_articles, start=1):
        print(f"Here's article {position} of {len(wanted_articles)}:")
        display_article(metadata, article)

    empty("temp")

def process(request_list):
    """Turn a raw request into a list of recognised article IDs.

    If *request_list* is not a list it is returned unchanged. An empty list
    triggers an interactive prompt for IDs; otherwise the given list is used.
    In both of those cases the entries are passed through ``sanitize``.
    """
    if not is_list(request_list):
        return request_list

    # An empty list means "ask the user"; anything else is used as given.
    raw_entries = request_input() if request_list == [] else request_list
    return sanitize(raw_entries)

# Import Data Index
    
# Only these columns of the metadata CSV are needed by the lookup page.
metadata_columns = [
    'record_id',
    'pub_date',
    'txt_zip',
    'txt_file',
    'pdf_zip',
    'pdf_file',
    'objecttypes',
]

# Read the CSV straight out of the zip archive; every column is kept as
# 'object' dtype so that IDs and dates are not converted to numbers.
with zipfile.ZipFile('TOI_metadata.zip') as zf:
    with zf.open('TOI_metadata.csv') as file:
        metadata = pd.read_csv(file, usecols=metadata_columns, dtype='object')

metadata = metadata.set_index('record_id')

# Set of all known article IDs, used for membership checks in sanitize().
all_articles = set(metadata.index)

print('done \n')

Loading...
done 



## Define Articles to display

If you have a list or custom script to output a list of articles, you can add it in the cell below. Otherwise the next cell will prompt for manual input.


In [104]:
# Add list-defining function here:

# Load the training data, indexed by article record_id.
td = pd.read_csv('../pogrep/training_data.csv', dtype='object', index_col='record_id')

# Load the second opinions recorded so far and keep them as a dict keyed by
# the CSV's first column.
second_opinion_df = pd.read_csv('../pogrep/temp/second_opinion.csv', dtype='object', index_col=0)
second_opinion = second_opinion_df['second_opinion'].to_dict()

# Queue every training article that does not have a second opinion yet.
already_reviewed = set(second_opinion_df.index.tolist())
request_list = list(set(td.index.tolist()) - already_reviewed)

In [105]:
# Validate the request and keep only recognised article IDs; an empty list
# triggers a prompt for manual input.
request_list = process(request_list)

## Unpacking the pdfs

In [106]:
# Extract the PDFs of the requested articles from their archives into temp/.
unpack_pdfs(request_list, metadata)

Unpacking requested pdfs...
Done!


## Define an Input Function (Optional)
If you would like to have a custom input function run after each article is displayed (say, a coding assignment), define it in the cell below under the name `linked_function`. If `linked_function` remains undefined, the page will default to a "Press enter to continue" dialog box.

In [107]:
def linked_function(article):
    """Ask whether *article* is a pogrom narrative and record the answer.

    The prompt is repeated until the reply is one of ``y``/``1`` (yes) or
    ``n``/``0`` (no); surrounding whitespace and letter case are ignored.
    The judgment is stored in the module-level ``second_opinion`` dict under
    ``int(article)`` as the string ``'True'`` or ``'False'``.

    Each reply is compared against the accepted answers with ``in``. A test
    of the form ``reply == 'y' or '1'`` is always truthy, because the
    non-empty string ``'1'`` is evaluated on its own, and would record every
    reply as True.
    """
    yes_answers = ('y', '1')
    no_answers = ('n', '0')

    while True:
        answer = input("Is this a pogrom narrative? (y or 1 = Yes, n or 0 = No)")
        answer = answer.strip().lower()
        if answer in yes_answers:
            new_judgment = True
            break
        if answer in no_answers:
            new_judgment = False
            break
        print("I'm not sure how to interpret that.")

    second_opinion[int(article)] = str(new_judgment)

## Display Articles

In [108]:
# Step through the requested articles one at a time, then empty temp/.
display_requested_articles(request_list, metadata)

Here's article 2 of 1688:


Article ID: 613498968 	 Published: 1986-07-27
Object Types:	 feature, article


KeyboardInterrupt: 

In [97]:
# Wrap the collected judgments in a one-column table and write it to a
# separate '_new' file rather than to second_opinion.csv itself.
opinion = {'second_opinion': second_opinion}
pd.DataFrame(opinion).to_csv('../pogrep/temp/second_opinion_new.csv')

Note: to load the newly completed segment, you'll need to drop '\_new' from the new file's name and use it to replace the old file. For safety, this should be done manually.