# Times of India Article Display Interface

In [1]:
# Imports
import os, zipfile, pathlib, shutil
import pandas as pd
import regex as re
from IPython.display import clear_output
from IPython.display import IFrame
import zipp

# Definitions

def empty(folder):
    """Delete everything inside *folder* while keeping the folder itself.

    Regular files and symlinks are unlinked; sub-directories are removed
    recursively.  A failure on one entry is reported on stdout and does not
    stop the remaining entries from being processed.

    Args:
        folder: Path of the directory whose contents should be removed.
    """
    for entry in os.listdir(folder):
        target = os.path.join(folder, entry)
        is_file_or_link = os.path.isfile(target) or os.path.islink(target)
        try:
            if is_file_or_link:
                os.unlink(target)
            elif os.path.isdir(target):
                shutil.rmtree(target)
        except Exception as err:
            # Best effort: report and move on to the next entry.
            print('Failed to delete %s. Reason: %s' % (target, err))

def sanitize(request_list, known_ids=None):
    """Normalise requested article IDs and keep only the ones that exist.

    Every entry is converted to ``str`` and stripped of surrounding
    whitespace.  Entries that are empty after stripping are ignored, unknown
    entries are listed on stdout, and recognised IDs are returned in the
    order they were requested (duplicates are kept).

    Args:
        request_list: Iterable of requested article IDs (any type; each is
            passed through ``str``).
        known_ids: Optional iterable of valid article IDs.  When omitted, the
            index of the module-level ``metadata`` frame is used.

    Returns:
        A list of cleaned article-ID strings that are valid.

    Raises:
        ValueError: If none of the requested entries is a valid article ID.
    """
    all_articles = set(metadata.index if known_ids is None else known_ids)

    wanted_articles = []
    unrecognized_articles = []
    for article in request_list:
        clean_art = str(article).strip()
        if clean_art in all_articles:
            wanted_articles.append(clean_art)
        elif clean_art:
            # Blank tokens (e.g. from "a, b" being split on both "," and " ")
            # are dropped silently; anything else is reported below.
            unrecognized_articles.append(clean_art)

    if unrecognized_articles:
        print("The following entries were not recognized as articles:")
        for entry in unrecognized_articles:
            print(repr(entry))
    if not wanted_articles:
        raise ValueError('No valid article IDs Recognized!')
    return wanted_articles

def request_input():
    """Prompt for article IDs and return the raw tokens the user typed.

    The reply is split on commas, semicolons and single spaces, so the
    returned list can contain empty strings (for example after ", ").  The
    notebook output is cleared once the reply has been read.

    Returns:
        A list of unvalidated strings.
    """
    reply = input("What article IDs shall I look up for you?")
    tokens = re.split(",|;| ", reply)
    clear_output()
    return tokens

def unpack_pdfs(request_list, metadata):
    """Extract the PDFs of the requested articles into the ``temp`` folder.

    Each archive named in the ``pdf_zip`` column is opened once (from the
    ``PDF`` directory) and every requested ``pdf_file`` it holds is
    extracted.

    Args:
        request_list: List of article IDs present in ``metadata``'s index.
        metadata: DataFrame indexed by article ID with ``pdf_zip`` and
            ``pdf_file`` columns.
    """
    print("Unpacking requested pdfs...")

    requested = metadata.loc[request_list]

    for archive_name in requested['pdf_zip'].unique().tolist():
        in_this_archive = requested['pdf_zip'] == archive_name
        members = requested.loc[in_this_archive, 'pdf_file'].unique().tolist()

        archive_path = os.path.join('PDF', archive_name)
        with zipfile.ZipFile(archive_path) as archive:
            for member in members:
                archive.extract(member, path="temp")

    print("Done!")

def display_article(metadata, article):
    """Show one article's PDF and metadata, then run the follow-up step.

    The PDF is expected to have been extracted into ``temp`` already.  After
    the PDF, publication date and object types are shown, the optional
    ``linked_function(article)`` hook is called; if no such callable is
    defined, a plain "Press enter to continue" prompt is shown instead.  The
    notebook output is cleared before returning.

    Args:
        metadata: DataFrame indexed by article ID with ``pdf_file``,
            ``pub_date``, ``objecttypes``, ``txt_zip`` and ``txt_file``
            columns.
        article: ID of the article to display.
    """
    pdf_file = os.path.join("temp", metadata.at[article, 'pdf_file'])

    # `display` is not imported in this file; it is expected to be supplied
    # by the notebook environment.
    display(IFrame(src=pdf_file, width='100%', height='700px'))

    pub_date = metadata.at[article, 'pub_date']
    objecttypes = metadata.at[article, 'objecttypes']
    # A missing value is not a string, so guard the split instead of
    # crashing with AttributeError.
    objecttypes = objecttypes.split(';') if isinstance(objecttypes, str) else []

    txt_zip = os.path.join('TXT', metadata.at[article, 'txt_zip'])
    txt_file = metadata.at[article, 'txt_file']

    # The article text is still loaded (which also surfaces a missing TXT
    # archive early), but printing it is currently disabled.
    with zipfile.ZipFile(txt_zip) as zf:
        with zf.open(txt_file) as f:
            text = f.read()

    print(f'Article ID: {article} \t Published: {pub_date}')
    print('Object Types:\t', ', '.join(objecttypes))

    # Look the hook up explicitly instead of calling it blindly: an undefined
    # hook must fall back to a simple pause, while a NameError raised *inside*
    # a user-supplied hook must still propagate.
    follow_up = globals().get('linked_function')
    if callable(follow_up):
        follow_up(article)
    else:
        input('Press enter to continue')

    clear_output()
    
# Next step is to introduce the choice to save and end or save and continue.

def save_results():
    """Run the optional ``save_function`` hook and report the outcome.

    Three outcomes are reported on stdout:

    * no callable named ``save_function`` is defined -> nothing is saved;
    * the hook ran without raising -> ``changes saved!``;
    * the hook raised -> the failure and its reason are printed.

    The function never raises because of a failing hook, so an interactive
    session keeps running and the in-memory results are not lost.
    """
    saver = globals().get('save_function')
    if not callable(saver):
        print('no save function detected; changes not saved.')
        return

    try:
        saver()
    except Exception as err:
        # Previously every error was reported as "no save function detected",
        # which hid real failures such as a missing directory.
        print(f'save function failed; changes not saved. Reason: {err!r}')
    else:
        print('changes saved!')
    


def display_article_chunk(request_list, chunk_number, metadata, continue_indicator, chunk_size):
    """Display one chunk of articles, save, and ask whether to go on.

    Args:
        request_list: Full list of article IDs for the session.
        chunk_number: Zero-based index of the chunk to display.
        metadata: DataFrame passed through to ``display_article``.
        continue_indicator: ``1`` (or ``'1'``) to display the chunk; any
            other integer-like value makes the call return ``0`` at once.
        chunk_size: Number of articles per chunk.

    Returns:
        ``1`` if the user asked to continue with the next chunk, otherwise
        ``0`` (user chose to stop, the call was skipped, or this was the
        last chunk).
    """
    if int(continue_indicator) != 1:
        return 0

    number_of_chunks = len(request_list)//chunk_size + (len(request_list) % chunk_size > 0)

    # Slicing past the end of a list is safe, so the final short chunk needs
    # no special case.
    start = chunk_size * chunk_number
    this_chunk = request_list[start:start + chunk_size]

    for n, article in enumerate(this_chunk, start=1):
        print(f"Here's article {n} of {len(this_chunk)}, in set {chunk_number+1} of {number_of_chunks}:")
        display_article(metadata, article)

    save_results()

    # chunk_number is zero-based, so the last chunk is number_of_chunks - 1.
    # Comparing against number_of_chunks itself never matched, which left
    # the user with a "continue?" prompt even when nothing was left to show.
    if chunk_number + 1 >= number_of_chunks:
        return 0

    # Compare the raw text rather than int(...) so a stray keypress simply
    # re-prompts instead of raising ValueError.
    answer = ''
    while answer not in ('0', '1'):
        answer = input('enter 1 to continue, or 0 to exit.').strip()
    return int(answer)
    
def display_requested_articles(metadata, chunk_size=15):
    """Run a full review session over the requested articles.

    The list of article IDs comes from ``get_display_list()`` when such a
    callable is defined, otherwise from an interactive prompt.  The IDs are
    validated, their PDFs are unpacked into ``temp``, and the articles are
    shown chunk by chunk until the user stops or the list is exhausted.
    ``temp`` is emptied afterwards, even if the session ends with an error.

    Args:
        metadata: DataFrame indexed by article ID, as returned by
            ``load_metadata``.
        chunk_size: Number of articles shown between save/continue prompts.

    Raises:
        TypeError: If ``get_display_list()`` does not return a list.
        ValueError: If ``chunk_size`` is smaller than 1, or (from
            ``sanitize``) if no valid article ID was requested.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1.')

    # Look the hook up explicitly: wrapping the call in `except NameError`
    # would also hide NameErrors raised inside the user's own function.
    list_source = globals().get('get_display_list')
    if callable(list_source):
        request_list = list_source()
        if not isinstance(request_list, list):
            raise TypeError('Please make sure get_display_list() is returning a list of article IDs.')
    else:
        request_list = request_input()

    request_list = sanitize(request_list)

    number_of_chunks = len(request_list)//chunk_size + (len(request_list) % chunk_size > 0)

    try:
        unpack_pdfs(request_list, metadata)

        # A single pass over the chunks: answering "continue" after the
        # last chunk must not start the whole list over again.
        for chunk_number in range(number_of_chunks):
            continue_indicator = display_article_chunk(request_list, chunk_number, metadata, 1, chunk_size)
            if int(continue_indicator) != 1:
                break
    finally:
        # Remove the extracted PDFs even when the session was interrupted.
        if os.path.isdir("temp"):
            empty("temp")
    

# Import Data Index

def load_metadata():
    """Read the article index from ``TOI_metadata.zip``.

    Only the columns needed by this notebook are loaded, all as ``object``
    dtype, and ``record_id`` becomes the index.

    Returns:
        DataFrame indexed by ``record_id`` with the columns ``pub_date``,
        ``txt_zip``, ``txt_file``, ``pdf_zip``, ``pdf_file`` and
        ``objecttypes``.
    """
    print('Loading metadata...')

    wanted_columns = [
        'record_id',
        'pub_date',
        'txt_zip',
        'txt_file',
        'pdf_zip',
        'pdf_file',
        'objecttypes',
    ]
    with zipfile.ZipFile('TOI_metadata.zip') as archive:
        with archive.open('TOI_metadata.csv') as csv_handle:
            table = pd.read_csv(csv_handle, usecols=wanted_columns, dtype='object')
    metadata = table.set_index('record_id')

    print('done \n')
    return metadata

# Load the article index once when this cell runs.  sanitize() reads this
# module-level `metadata` frame directly, so it must exist before any
# articles are requested.
metadata = load_metadata()

Loading metadata...
done 



## Define Articles to display

If you have a list of article IDs, or a script that produces one, add it in the cell below as a function named `get_display_list()` that returns a Python list. If no such function is defined, the "Display Articles" cell will prompt for manual input instead.

In [2]:
# Add list-defining function here:

def get_display_list():
    """Return the training-data article IDs that have no second opinion yet.

    Reads ``../pogrep/training_data.csv`` (indexed by ``record_id``) and
    ``../pogrep/temp/second_opinion.csv`` (indexed by its first column) and
    returns every ID found in the former but not in the latter.

    Returns:
        A list of article IDs, without duplicates, in the order they appear
        in the training data.  A stable order keeps the chunks the same from
        one run to the next, unlike the order of a set difference.
    """
    # read in the training data
    td = pd.read_csv('../pogrep/training_data.csv', dtype='object', index_col='record_id')

    # read in the second_opinion work so far
    second_opinion_df = pd.read_csv('../pogrep/temp/second_opinion.csv', dtype='object', index_col=0)
    already_judged = set(second_opinion_df.index.tolist())

    # dict.fromkeys drops repeated IDs while preserving first-seen order.
    request_list = [
        record_id
        for record_id in dict.fromkeys(td.index.tolist())
        if record_id not in already_judged
    ]

    return request_list

## Define Input and Save functions (Optional)
If you would like to have a custom input function run after display (say, a coding assignment), and if you would like to save the results of the input function, add them in the cell below. Define them as linked_function() and save_function() respectively. If linked_function remains undefined, the page will default to a "Press enter to continue" dialog box with no save function.

In [3]:
# Load the judgments recorded so far.  `second_opinion` is the in-memory dict
# that linked_function() updates and save_function() writes back to this CSV.
second_opinion_df = pd.read_csv('../pogrep/temp/second_opinion.csv', dtype='object', index_col=0)
second_opinion = second_opinion_df.second_opinion.to_dict()

def linked_function(article):
    """Ask whether the displayed article is a pogrom narrative and record it.

    The prompt repeats until the reply is one of ``y``/``1`` (yes) or
    ``n``/``0`` (no); replies are compared case-insensitively after
    stripping whitespace.  The judgment is stored in the module-level
    ``second_opinion`` dict as the string ``'True'`` or ``'False'`` under
    the key ``int(article)``.

    Args:
        article: ID of the article just displayed; must be convertible with
            ``int``.
    """
    while True:
        answer = input("Is this a pogrom narrative? (y or 1 = Yes, n or 0 = No)").strip().lower()
        # Membership tests, not `answer == 'y' or '1'`: the latter is always
        # truthy and recorded every reply as True.
        if answer in ('y', '1'):
            new_judgment = True
            break
        if answer in ('n', '0'):
            new_judgment = False
            break
        print("I'm not sure how to interpret that.")

    second_opinion.update({int(article):str(new_judgment)})
    
def save_function():
    """Write the in-memory ``second_opinion`` judgments back to the CSV.

    The results replace ``../pogrep/temp/second_opinion.csv``.  They are
    first written to a scratch file next to it and then moved into place, so
    an interrupted write cannot truncate the judgments saved earlier.

    Raises:
        Whatever ``DataFrame.to_csv`` or ``os.replace`` raises (for example
        ``OSError`` when the target directory does not exist).
    """
    target = '../pogrep/temp/second_opinion.csv'
    scratch = target + '.tmp'

    opinion = {'second_opinion': second_opinion}
    try:
        pd.DataFrame(opinion).to_csv(scratch)
        # Same-directory rename: the old file stays intact until the new
        # one is complete.
        os.replace(scratch, target)
    except BaseException:
        # Do not leave a half-written scratch file behind.
        if os.path.exists(scratch):
            os.remove(scratch)
        raise

## Display Articles

In [4]:
# Start the review session; results are saved and the continue/exit prompt
# is shown after every 3 articles.
display_requested_articles(metadata, chunk_size=3)

changes saved!


enter 1 to continue, or 0 to exit. 0
