In [1]:
from IPython.display import clear_output
import pandas as pd
from pandas import Series
import numpy as np
import re
import csv
from gsheets import Sheets
from datetime import datetime

In [2]:
# Google Sheets workbook URL; passed to sheets.get() when the spreadsheet is loaded
url = 'https://docs.google.com/spreadsheets/d/1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I/edit#gid=1718343431'

## Create reusable lists for reference in data display or collection

In [3]:
# Column names and dtype overrides for the AAT metadata files, plus the
# reduced set of columns kept when reporting rows of the combined frame.

# column order of the TERM file
terms_header_list = [
    "AACR2_FLAG",
    "DISPLAY_DATE",
    "DISPLAY_NAME",
    "DISPLAY_ORDER",
    "END_DATE",
    "HISTORIC_FLAG",
    "OTHER_FLAGS",
    "PREFERRED",
    "START_DATE",
    "SUBJECT_ID",
    "TERM",
    "TERM_ID",
    "VERNACULAR",
]
# identifiers are read as strings rather than numbers
terms_dtype = {
    'TERM_ID': str,
    'SUBJECT_ID': str,
}

# column order of the LANGUAGE_RELS file
lang_rels_header_list = [
    "LANGUAGE_CODE",
    "PREFERRED",
    "SUBJECT_ID",
    "TERM_ID",
    "QUALIFIER",
    "TERM_TYPE",
    "PART_OF_SPEECH",
    "LANG_STAT",
]
lang_rel_dtype = {
    'TERM_ID': str,
    'SUBJECT_ID': str,
    'LANGUAGE_CODE': str,
}

# columns selected when an unmatched subject is reported
narrow_list = [
    'identifier',
    'media',
    'subject',
    'title',
    'subjects',
    'SUBJECT_ID_x',
    'TERM_ID',
    'join_concept',
    'BoxId',
    'color',
]

## Functions

In [4]:
# Execute the companion notebook in this kernel's namespace.
# NOTE(review): helpers called later but not defined in this notebook (load_metadata, return_box,
# split_df, combine_boxes, add_to_not_found) presumably come from it — confirm.
%run functions.ipynb

## Load the spreadsheet information

In [5]:
# Build the gsheets client from the client-secrets and token-storage files in the home directory.
# If this raises HttpAccessTokenRefreshError, close everything and start over.
sheets = Sheets.from_files('~/client_secrets.json', '~/storage.json')
# fetch the workbook identified by `url`
s = sheets.get(url)

In [6]:
# Collect the box numbers of every worksheet that holds box metadata.
# A sheet qualifies when its title contains 'Box'; the number is whatever
# follows the first four characters of the title.
box_list_sheets = s.sheets.titles()

box_list = [int(title[4:]) for title in box_list_sheets if 'Box' in title]

### Consider comma cases

Given that we'll parse the created Arquin metadata by comma, we need to handle cases where the subject term, aka concept, contains an internal comma (e.g. "knives, gauge"). These will need to be parsed carefully for two reasons:

1.) The comma could falsely parse the concept

2.) The individual components of the concept could be valid AAT terms, too, e.g. "knives, gauge": both "knives" and "gauge" are valid AAT terms, and so is "knives, gauge". Knowing what the metadata analyst intended in this case is very difficult to determine.

In [7]:
# load the concept table via the load_metadata() helper
# (not defined in this notebook; presumably provided by functions.ipynb — confirm)
concepts = load_metadata()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
# Build the list of concepts whose 'use_concept' text contains a comma.
# Rows without a 'use_concept' value are discarded first so the string test
# only ever sees real text.
has_concept = concepts['use_concept'].notna()
concepts_drop = concepts[has_concept]

has_comma = concepts_drop['use_concept'].str.contains(',')
concepts_search = concepts_drop[has_comma]

concepts_comma = concepts_search['use_concept'].to_list()

## Load the Metadata from Getty AAT

Individual AAT Terms can be seen here:
https://www.getty.edu/research/tools/vocabularies/aat/

We use the .Out files available below for consuming the data - these are batch released, so terms in the .Out files may not be available on the AAT search (link above) or vice versa:
http://aatdownloads.getty.edu/

Note that when we load the files, we join the LANGUAGE_RELS.out and TERM.out files and combine the term and the qualifier to create the "subjects" and "join_concept" columns

The join_concept ensures the case for the term is all lower case

We also create a "full_concept", but it doesn't handle the NULL case well and shouldn't be used for analysis

In [9]:
# preview the concept rows that have a join_concept value
concepts[concepts['join_concept'].notna()]

Unnamed: 0,AACR2_FLAG,DISPLAY_DATE,DISPLAY_NAME,DISPLAY_ORDER,END_DATE,HISTORIC_FLAG,OTHER_FLAGS,PREFERRED_x,START_DATE,SUBJECT_ID_x,...,LANGUAGE_CODE,PREFERRED_y,SUBJECT_ID_y,QUALIFIER,TERM_TYPE,PART_OF_SPEECH,LANG_STAT,full_concept,use_concept,join_concept
0,,,,3.0,,C,,V,,300022903,...,70051,N,300022903,,UF,U,U,,"knives, gauge","knives, gauge"
1,,,,1.0,,C,,P,,300022904,...,70051,P,300022904,,D,PN,U,,hand knives,hand knives
2,,,,2.0,,C,,V,,300022904,...,70051,N,300022904,,AD,SN,U,,hand knife,hand knife
3,,,,3.0,,C,,V,,300022904,...,70051,N,300022904,,UF,U,U,,"knives, hand","knives, hand"
4,,,,1.0,,C,,P,,300022905,...,70051,P,300022905,,D,PN,U,,hawkbill knives,hawkbill knives
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483188,,,,1.0,,C,,P,,300430882,...,70051,P,300430882,,D,U,U,,prophylactics,prophylactics
483189,,,,1.0,,C,,P,,300434048,...,70051,P,300434048,,D,PN,U,,styptic pencils,styptic pencils
483190,,,,1.0,,C,,P,,300434776,...,70051,P,300434776,,D,PN,U,,tissues,tissues
483191,,,,1.0,,C,,P,,300431243,...,70051,P,300431243,,D,PN,U,,towelettes,towelettes


In [10]:
# Load every box sheet, split it, and stack the results into one frame.
#
# For each box number: fetch the sheet with return_box(), try to split it with
# split_df(), then append it to the running frame with combine_boxes(). A box
# whose split fails is still appended in its unsplit form, and the failure is
# recorded so it can be reported once the loop is done.
if not box_list:
    raise ValueError('box_list is empty: no "Box" sheets were found to import')

# (box_number, error) pairs; needed because the message printed inside the
# loop is erased by the next clear_output() call
failed_boxes = []

for i, box_number in enumerate(box_list):
    df = return_box(box_number)
    clear_output(wait=True)
    print(f"working on Box {box_number}")

    try:
        df = split_df(df)
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C / kernel interrupt still stops the run
        print(f'failed to split df for box number {box_number}')
        failed_boxes.append((box_number, repr(err)))

    if i == 0:
        df_combined = df
    else:
        df_combined = combine_boxes(df_combined, df)

clear_output(wait=True)

df = df_combined
# treat empty / whitespace-only subjects as missing, then drop those rows
df['subjects'] = df['subjects'].replace(r'^\s*$', np.nan, regex=True)
df = df.dropna(subset=['subjects'])
print(f'complete, imported {i+1} boxes')

# surface the split failures that were hidden while the loop was running
for failed_box, failed_reason in failed_boxes:
    print(f'failed to split df for box number {failed_box}: {failed_reason}')

complete, imported 69 boxes


In [11]:
# Attach the concept columns to every subject row. The left join keeps rows
# whose join_concept has no counterpart in `concepts`.
df = df.merge(concepts, how="left", on='join_concept')

In [12]:
# Derive the 'BoxId' label ("Box <n>") from the box number embedded in each
# media name: the run of digits starting with 0 between two underscores,
# e.g. "Arquin_069_0217.jpg" -> "Box 69".
#
# The number is built as a standalone Series and the frame is filtered with an
# explicit .copy(), so no column is ever assigned on a slice of another frame
# and no temporary column has to be created and dropped in place.
box_number = pd.to_numeric(
    df['media'].str.extract(r"(_0\d+_)", expand=False).str.replace("_", ""),
    errors='coerce',
)

# keep only rows that have a media value and a parseable box number
has_box = df['media'].notna() & box_number.notna()
df = df[has_box].copy()

# int first so "069" becomes 69, then str for the label
df['BoxId'] = 'Box ' + box_number[has_box].astype('int').astype('str')

## Add in Tab Color

The colors of each tab are identified in 'get_spreadsheets_color.ipynb'. That script creates a CSV file and that is read in and added to the df_combined object

Read in the color match file created here:
http://localhost:8888/notebooks/get_spreadsheets.ipynb

use that to create the list of titles based on color of the tab

In [13]:
# read the tab-separated color match file; it is merged into df on 'BoxId' in the next step
color_match_df = pd.read_csv('csv_files/color_match', sep='\t')

In [14]:
# Add the tab color columns. This is pandas' default inner join, so rows whose
# BoxId is absent from color_match_df are dropped.
df = df.merge(color_match_df, on='BoxId')

In [15]:
# distinct media IDs present in df, ignoring rows without a media value
media_present = df['media'].dropna()
df_medias = media_present.unique()

Run the full flow through in the following cell. Be sure to DELETE the unmatched_subjects_v2.csv file before starting.

In [16]:
# Re-join subjects that were split on an internal comma.
#
# For every media, each subject without an AAT match (SUBJECT_ID_x is NaN) is
# glued to its previous and to its next neighbour in the media's subject list
# ("prev, current" / "current, next"). If the glued text is a known
# join_concept, the neighbour's row is dropped and the unmatched row is
# rewritten to the combined concept. Otherwise the row is handed to
# add_to_not_found().
#
# All per-media and per-concept state is reset at the top of its loop so that
# nothing carries over from an earlier media or an earlier unmatched concept.
df_medias_len = len(df_medias)

for i, search_media in enumerate(df_medias):
    clear_output(wait=True)
    print(f'{i} of {df_medias_len}: search_media = {search_media}')

    # select this media's rows once; both lists below are read from the same slice
    media_rows = df[df['media'] == search_media]
    df_subjects = media_rows.join_concept.to_list()
    df_media_no_match = media_rows.loc[media_rows['SUBJECT_ID_x'].isna()]

    if len(df_media_no_match) == 0:
        continue

    no_match = df_media_no_match.join_concept.to_list()

    # last neighbour removed for THIS media; starting from None means a value
    # left by an earlier media can neither trigger the check below nor raise
    # NameError when nothing has been removed yet
    remove_this_con = None

    # `j`, not `i`: the outer loop counter must not be overwritten
    for j, unmatched in enumerate(no_match):
        # this concept is the neighbour that was just folded into the previous one
        # NOTE(review): `break` also skips every remaining unmatched concept of
        # this media; kept as in the original — confirm `continue` is not intended
        if j > 0 and unmatched == remove_this_con:
            break

        # per-concept state: stale matches from the previous concept must not leak
        pos = False   # "previous, current" is a known concept
        neg = False   # "current, next" is a known concept
        new_concept = None
        try_match_neg = None
        try_match_pos = None
        concepts_found_pos = []
        concepts_found_neg = []

        # position of the first occurrence of this concept in the media's subject list
        no_match_index = df_subjects.index(unmatched)

        if no_match_index > 0:
            try_match_neg = df_subjects[no_match_index-1]+', '+df_subjects[no_match_index]
            concepts_found_pos = concepts.loc[concepts['join_concept'] == try_match_neg]
            pos = len(concepts_found_pos) > 0
        if no_match_index < len(df_subjects)-1:
            try_match_pos = df_subjects[no_match_index]+', '+df_subjects[no_match_index+1]
            concepts_found_neg = concepts.loc[concepts['join_concept'] == try_match_pos]
            neg = len(concepts_found_neg) > 0

        # Every matched row has join_concept equal to the glued text, so the
        # text itself is the new concept however many rows matched (more than
        # one row used to leave new_concept as None and crash the lookup).
        # When both directions match, the "next" match takes precedence.
        if pos:
            new_concept = try_match_neg
        if neg:
            new_concept = try_match_pos

        if pos and neg:
            print('too many matches!')

        if not pos and not neg:
            # nothing to merge with: record the row as not found
            unmatched_df = df[(df['media'] == search_media)
                              & (df['join_concept'] == unmatched)][narrow_list]
            add_to_not_found(unmatched_df)

        remove_con = df_subjects[no_match_index]
        remove_this_con_index = 0

        if pos:
            remove_this_con_index = no_match_index-1
        if neg:
            remove_this_con_index = no_match_index+1

        if pos or neg:
            remove_this_con = df_subjects[remove_this_con_index]

            # drop the neighbour that is now part of the combined concept
            df = df.drop(
                df[(df['media'] == search_media)
                   & (df['join_concept'] == remove_this_con)].index
            )

            # rewrite the unmatched row in place with the combined concept and
            # the identifiers of the first matching concept row
            pop_index = df.loc[(df['media'] == search_media) & (df['join_concept'] == remove_con)].index
            concept_row = concepts.loc[concepts['join_concept'] == new_concept].index[0]
            df.at[pop_index[0], 'join_concept'] = new_concept
            df.at[pop_index[0], 'subjects'] = new_concept
            df.at[pop_index[0], 'SUBJECT_ID_x'] = concepts.at[concept_row, 'SUBJECT_ID_x']
            df.at[pop_index[0], 'TERM_ID'] = concepts.at[concept_row, 'TERM_ID']

        print(f'new_concept = {new_concept}')
    

16221 of 16222: search_media = Arquin_069_0217.jpg
