# Functions

In [3]:
from IPython.display import clear_output
import pandas as pd
from pandas import Series
import numpy as np
import re
import csv
from gsheets import Sheets
from datetime import datetime

In [4]:
# Column names supplied to read_aat_terms for './AAT_Files/TERM.out'.
terms_header_list = [
    "AACR2_FLAG",
    "DISPLAY_DATE",
    "DISPLAY_NAME",
    "DISPLAY_ORDER",
    "END_DATE",
    "HISTORIC_FLAG",
    "OTHER_FLAGS",
    "PREFERRED",
    "START_DATE",
    "SUBJECT_ID",
    "TERM",
    "TERM_ID",
    "VERNACULAR",
]
# Identifier columns are read as strings rather than inferred as numbers.
terms_dtype = dict(TERM_ID=str, SUBJECT_ID=str)

# Column names supplied to read_aat_terms for './AAT_Files/LANGUAGE_RELS.out'.
lang_rels_header_list = [
    "LANGUAGE_CODE",
    "PREFERRED",
    "SUBJECT_ID",
    "TERM_ID",
    "QUALIFIER",
    "TERM_TYPE",
    "PART_OF_SPEECH",
    "LANG_STAT",
]
lang_rel_dtype = dict(TERM_ID=str, SUBJECT_ID=str, LANGUAGE_CODE=str)

# LANGUAGE_CODE values that load_metadata keeps when filtering concepts.
english_language_codes = ['70051', '70052', '70053']

In [5]:
# load metadata
# Language Codes found here: https://www.getty.edu/research/tools/vocabularies/guidelines/tgn_4_5_appendix_e_languages.html
# 4.5.2 List of Languages

def load_metadata() -> pd.DataFrame:
    """Build the concept lookup table from the downloaded AAT export files.

    Reads './AAT_Files/TERM.out' and './AAT_Files/LANGUAGE_RELS.out'
    (downloaded from http://aatdownloads.getty.edu/), inner-joins them on
    TERM_ID and adds three derived columns:

    * full_concept -- "TERM (QUALIFIER)"; missing when either part is missing
    * use_concept  -- full_concept, falling back to TERM where it is missing
    * join_concept -- lower-cased use_concept

    Returns:
        The joined rows whose LANGUAGE_CODE is in ``english_language_codes``.
    """
    term_table = read_aat_terms('./AAT_Files/TERM.out', terms_header_list, terms_dtype)
    language_table = read_aat_terms('./AAT_Files/LANGUAGE_RELS.out', lang_rels_header_list, lang_rel_dtype)

    merged = term_table.merge(language_table, how='inner', on='TERM_ID')

    # String concatenation propagates NaN, so this is NaN whenever TERM or
    # QUALIFIER is missing; use_concept then falls back to the bare TERM.
    qualified = merged['TERM'] + ' (' + merged['QUALIFIER'] + ')'
    merged['full_concept'] = qualified
    merged['use_concept'] = qualified.fillna(merged['TERM'])
    merged['join_concept'] = merged['use_concept'].str.lower()

    keep_row = merged['LANGUAGE_CODE'].isin(english_language_codes)
    return merged[keep_row]

In [7]:
def read_aat_terms(file_path: str, names: list, data_types: dict) -> pd.DataFrame:
    """Read a tab-separated file into a DataFrame using the given column names.

    Malformed lines are skipped with a warning instead of aborting the read.

    Args:
        file_path: Path of the tab-separated file to read.
        names: Column names to assign, in file order.
        data_types: Mapping of column name to dtype, passed to ``read_csv``.

    Returns:
        The parsed file as a DataFrame.
    """
    import inspect  # local import: only needed for the pandas version probe

    # pandas 1.3 introduced `on_bad_lines` and deprecated the
    # `warn_bad_lines` / `error_bad_lines` pair, which pandas 2.0 removed.
    # Probe the signature so the same "warn and skip" behaviour works on both.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        bad_line_options = {'on_bad_lines': 'warn'}
    else:
        bad_line_options = {'warn_bad_lines': True, 'error_bad_lines': False}

    return pd.read_csv(
        file_path,
        sep='\t',
        names=names,
        dtype=data_types,
        **bad_line_options,
    )

In [5]:
# get box data by box number (as an integer)

def return_box(box_num: int) -> pd.DataFrame:
    """Fetch the sheet titled 'Box <box_num>' and return it as a DataFrame.

    Relies on the module-level object ``s`` (not defined in this section;
    presumably the opened gsheets spreadsheet -- confirm against the cell that
    creates it). Column names are normalised with ``format_column_names``.
    """
    sheet_title = 'Box ' + str(box_num)
    box_frame = s.find(sheet_title).to_frame()
    return format_column_names(box_frame)

In [6]:
# get column names in the correct format during the retrieval of box data

def format_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalised column names.

    Each column name is stripped of surrounding whitespace, lower-cased, and
    has every occurrence of "dcterms:" removed. The input frame is left
    unchanged.

    Args:
        df: Frame whose column names are strings.

    Returns:
        A new DataFrame with the same data and the cleaned column names.
    """
    cleaned_names = [
        name.strip().lower().replace("dcterms:", "") for name in df.columns
    ]
    # Copy-and-assign instead of set_axis(..., inplace=False): the `inplace`
    # keyword was removed from set_axis in pandas 2.0, while this form works
    # on every pandas version.
    renamed = df.copy()
    renamed.columns = cleaned_names
    return renamed

In [7]:
def split_df(df: pd.DataFrame) -> pd.DataFrame:
    """Expand a frame so that each comma-separated subject gets its own row.

    The 'subject' column is split on ',' and every piece becomes a separate
    row that repeats the other columns of its source row (the original index
    label is repeated as well). Rows whose 'subject' is missing are kept, with
    missing values in the new columns.

    Args:
        df: Frame with a string column named 'subject'.

    Returns:
        A new frame with two extra columns: 'subjects' (the individual
        subject, whitespace-stripped) and 'join_concept' (its lower-cased
        form).
    """
    # split + explode yields one entry per subject while keeping the original
    # index label, which is what the join below aligns on. This avoids the
    # per-row Series construction and does not depend on stack() dropping
    # missing values.
    pieces = df['subject'].str.split(',').explode()
    pieces.name = 'subjects'

    expanded = df.join(pieces)
    expanded['subjects'] = expanded['subjects'].str.strip()
    expanded['join_concept'] = expanded['subjects'].str.lower()

    return expanded

In [8]:
def terminate() -> str:
    """Prompt the user and return what they typed.

    Prints 'stopping' when the reply is exactly 'X'. The raw reply is
    returned either way, so the caller decides whether to leave its loop.
    """
    answer = input("Press 'X' key to terminate the program")
    wants_stop = answer == 'X'
    if wants_stop:
        print('stopping')
    return answer

In [9]:
def combine_boxes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Stack the rows of *df2* underneath the rows of *df1*.

    Both frames are expected to share the same schema. Index labels are kept
    as they are, so labels present in both inputs appear twice in the result.
    """
    frames = [df1, df2]
    return pd.concat(frames)

In [10]:
def add_to_not_found(df: pd.DataFrame) -> None:
    """Append the records that could not be matched to the unmatched-subjects CSV.

    Rows already stored in 'csv_files/unmatched_subjects_v2.csv' are kept and
    the rows of *df* are added after them; the file is then rewritten with
    every field quoted. When the file does not exist yet, or exists but is
    empty, only *df* is written.

    Args:
        df: Records to add to the file.
    """
    import os  # local import: only needed to create the output directory

    file_name = 'csv_files/unmatched_subjects_v2.csv'

    try:
        old_df = pd.read_csv(file_name, header=0, index_col=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run, or a zero-byte file left behind: nothing to carry over.
        combined_df = df
    else:
        combined_df = pd.concat([old_df, df])

    # A missing file is tolerated above, so a missing directory must not make
    # the write below fail on the very first run.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    combined_df.to_csv(file_name, index=False, quoting=csv.QUOTE_ALL)