In [None]:
import pandas as pd
from pandas import Series
import numpy as np
from gsheets import Sheets

In [None]:
# Let pandas print up to 300 rows before truncating a displayed frame.
pd.options.display.max_rows = 300

In [None]:
# Google Sheets URL of the spreadsheet analysed in this notebook; it is opened
# further down with sheets.get(url) and its worksheets are looked up by title ('Box N').
url = 'https://docs.google.com/spreadsheets/d/1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I/edit#gid=1718343431'

In [None]:
# Box numbers to load; each one is looked up as the worksheet titled 'Box <n>'.
# The numbering has gaps, so the list is assembled from its contiguous runs.
# For a quick trial run, swap in a short list such as [3, 4].
box_list = [
    *range(1, 27),   # 1-26
    *range(40, 44),  # 40-43
    *range(51, 57),  # 51-56
    *range(58, 66),  # 58-65
    69,
]

In [None]:
# Column names / dtypes passed to read_aat_terms() for ./AAT_Files/TERM.out.
# The file is read without a header row, so the order here is assumed to match
# the file's column order -- TODO confirm against the AAT export.
terms_header_list = [
    "AACR2_FLAG",
    "DISPLAY_DATE",
    "DISPLAY_NAME",
    "DISPLAY_ORDER",
    "END_DATE",
    "HISTORIC_FLAG",
    "OTHER_FLAGS",
    "PREFERRED",
    "START_DATE",
    "SUBJECT_ID",
    "TERM",
    "TERM_ID",
    "VERNACULAR",
]
# IDs are read as strings rather than numbers.
terms_dtype = {'TERM_ID': str, 'SUBJECT_ID' : str}

# Column names / dtypes passed to read_aat_terms() for ./AAT_Files/LANGUAGE_RELS.out.
lang_rels_header_list = [
    "LANGUAGE_CODE",
    "PREFERRED",
    "SUBJECT_ID",
    "TERM_ID",
    "QUALIFIER",
    "TERM_TYPE",
    "PART_OF_SPEECH",
    "LANG_STAT",
]
lang_rel_dtype = {'TERM_ID': str, 'SUBJECT_ID' : str, 'LANGUAGE_CODE': str}

# Column subsets used to narrow frames for display. Names ending in _x / _y are
# the suffixes pandas adds to overlapping columns when frames are merged.
spreadsheet = [
    'identifier',
    'media',
    'subject',
    'title',
    'subjects',
    'join_concept',
]
narrow_list = [
    'identifier',
    'media',
    'subject',
    'title',
    'subjects',
    'SUBJECT_ID_x',
    'TERM_ID',
    'join_concept',
]
concept_list = [
    'PREFERRED_x',
    'SUBJECT_ID_x',
    'TERM',
    'TERM_ID',
    'LANGUAGE_CODE',
    'QUALIFIER',
    'LANG_STAT',
    'use_concept',
    'join_concept',
]
alt_lang_list = [
    'identifier',
    'media',
    'subjects',
    'SUBJECT_ID_x',
    'TERM_ID_x',
    'use_concept_x',
    'LANGUAGE_CODE_y',
    'PREFERRED_y_y',
    'SUBJECT_ID_y_y',
    'TERM_ID_y',
    'use_concept_y',
]

In [None]:
def read_aat_terms(file_path: str, names: list, data_types: dict) -> pd.DataFrame:
    """Read a header-less, tab-separated file into a DataFrame.

    Malformed lines are skipped with a warning instead of aborting the read.

    Args:
        file_path: path of the tab-separated file.
        names: column names to assign, in file order.
        data_types: mapping of column name to dtype, forwarded to ``pd.read_csv``.

    Returns:
        The parsed DataFrame with ``names`` as its columns.
    """
    shared_options = dict(sep='\t', names=names, dtype=data_types)
    try:
        # `on_bad_lines` replaces warn_bad_lines/error_bad_lines, which were
        # deprecated in pandas 1.3 and removed in 2.0.
        return pd.read_csv(file_path, on_bad_lines='warn', **shared_options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy spelling.
        return pd.read_csv(file_path
                           , warn_bad_lines=True
                           , error_bad_lines=False
                           , **shared_options)

In [None]:
def format_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column names.

    Each name is stripped of surrounding whitespace, lower-cased, and has any
    ``dcterms:`` prefix removed. The input frame is left untouched.

    Args:
        df: frame whose column labels should be normalised.

    Returns:
        A new DataFrame with the same data and the cleaned column names.
    """
    # str() lets non-string labels (e.g. numeric headers) through instead of raising.
    column_names = [str(name).strip().lower().replace("dcterms:", "") for name in df.columns]
    # Copy-and-assign instead of set_axis(..., inplace=False): the `inplace`
    # argument was removed from set_axis in pandas 2.0.
    renamed = df.copy()
    renamed.columns = column_names
    return renamed

In [None]:
def get_unique_subjects(box_number: int=1) -> list:
    """Return the distinct subject terms used in one box worksheet.

    The worksheet titled ``'Box <box_number>'`` is read from the global
    spreadsheet ``s``; every ``subject`` cell is split on commas and each piece
    is stripped of surrounding whitespace.

    Args:
        box_number: number of the box worksheet to read.

    Returns:
        The distinct terms in first-seen order. Empty subject cells are
        ignored. If the worksheet cannot be read, a message is printed and an
        empty list is returned.
    """
    box_id = f'Box {box_number}'
    try:
        df = format_column_names(s.find(box_id).to_frame())
        # dropna(): otherwise str(nan) would add the literal term 'nan'.
        raw_subjects = df['subject'].dropna().unique().tolist()
    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts;
        # the underlying error is reported instead of being discarded.
        print(f'ParserError: could not access {box_id} ({exc!r})')
        return []

    # A term can recur across cells once the comma lists are split, so
    # de-duplicate after splitting; dict.fromkeys keeps first-seen order.
    return list(dict.fromkeys(
        piece.strip()
        for raw in raw_subjects
        for piece in str(raw).split(',')
    ))

In [None]:
def combine_boxes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Stack ``df2`` underneath ``df1`` and return the result.

    Both frames are expected to share a schema. Row labels are carried over
    unchanged, so the result may contain repeated index values.
    """
    frames = (df1, df2)
    return pd.concat(frames)

In [None]:
def split_df(df: pd.DataFrame) -> pd.DataFrame:
    """Expand comma-separated subjects into one row per subject.

    Args:
        df: frame with a ``subject`` column holding comma-separated terms.

    Returns:
        A new frame in which every input row is repeated once per term, with
        two added columns: ``subjects`` (the individual term, whitespace
        stripped) and ``join_concept`` (the same term lower-cased). Rows whose
        ``subject`` is missing are kept once with both new columns null. The
        original index labels are repeated; the input frame is not modified.
    """
    # astype(object): a worksheet with no subjects at all yields a float64
    # column of NaN, on which the .str accessor would raise.
    subject_lists = df['subject'].astype(object).str.split(',')
    # explode() replaces split(expand=True)/stack()/join: it does not depend on
    # stack() silently dropping the NaN padding and it works positionally, so
    # repeated index labels cannot multiply rows.
    exploded = df.assign(subjects=subject_lists).explode('subjects')
    exploded['subjects'] = exploded['subjects'].str.strip()
    exploded['join_concept'] = exploded['subjects'].str.lower()
    return exploded

In [None]:
def load_metadata() -> pd.DataFrame:
    """Build the concept lookup table from the two AAT export files.

    ``TERM.out`` and ``LANGUAGE_RELS.out`` are read from ``./AAT_Files`` and
    inner-joined on ``TERM_ID``. Three columns are then added:

    * ``full_concept``: ``"TERM (QUALIFIER)"``; null when either part is null.
    * ``use_concept``: ``full_concept``, falling back to ``TERM`` when null.
    * ``join_concept``: ``use_concept`` lower-cased.
    """
    term_table = read_aat_terms('./AAT_Files/TERM.out', terms_header_list, terms_dtype)
    language_table = read_aat_terms('./AAT_Files/LANGUAGE_RELS.out', lang_rels_header_list, lang_rel_dtype)

    merged = term_table.merge(language_table, how='inner', on='TERM_ID')

    qualified = merged['TERM'] + ' (' + merged['QUALIFIER'] + ')'
    merged['full_concept'] = qualified
    merged['use_concept'] = qualified.fillna(merged['TERM'])
    merged['join_concept'] = merged['use_concept'].str.lower()
    return merged

In [None]:
def return_box(box_num: int) -> pd.DataFrame:
    """Load the worksheet titled ``'Box <box_num>'`` as a DataFrame.

    The worksheet is looked up in the global spreadsheet ``s`` and its column
    names are normalised with ``format_column_names``.
    """
    worksheet = s.find(f'Box {box_num}')
    return format_column_names(worksheet.to_frame())

In [None]:
# Create the gsheets client from two files in the home directory (presumably the
# OAuth client secrets and a cached token store -- confirm against the gsheets docs),
# then open the spreadsheet at `url`.
# `s` is read as a global by get_unique_subjects() and return_box().
sheets = Sheets.from_files('~/client_secrets.json', '~/storage.json')
s = sheets.get(url)

In [None]:
# Load every box worksheet, expand it to one row per subject, and stack the
# results into df_combined.
box_frames = []
for box_number in box_list:
    df = return_box(box_number)
    try:
        df = split_df(df)
    except Exception as exc:
        # Keep the unsplit frame so the box's rows are not lost, and report why
        # the split failed (a bare `except:` hid the cause and also caught Ctrl-C).
        print(f'failed to split df for box number {box_number}: {exc!r}')
    box_frames.append(df)

# One concat over the collected frames instead of re-concatenating the growing
# result on every iteration.
df_combined = pd.concat(box_frames)

In [None]:
# Summary counts for the combined frame. After split_df each row is one
# (item, subject) pair, so the "items" figures below are row counts.
items_with_subjects = int(df_combined['subject'].notnull().sum())
items_without_subjects = int(df_combined['subject'].isnull().sum())
total_items = len(df_combined)
print(f'total items: {total_items}\nitems with subjects {items_with_subjects}\nitems without subjects {items_without_subjects}')
# nunique() skips NaN; len(unique()) counted the missing value as one extra
# "subject" whenever any row had no subject.
unique_subjects = df_combined['subjects'].nunique()
print(f'there are {unique_subjects} unique subjects found')

In [None]:
# Spot check: all rows recorded for one media file.
df_combined.loc[df_combined['media'] == 'Arquin_069_0026.jpg']

In [None]:
# Spot check: number of rows (one per subject) recorded for one media file.
len(df_combined.loc[df_combined['media'] == 'A-2_Arquin_003_0349.jpg'])

In [None]:
# Build the concept lookup table (TERM.out joined to LANGUAGE_RELS.out on TERM_ID)
# that the merges below match subjects against.
concepts = load_metadata()

In [None]:
# Show the merged column names; columns present in both source files carry
# the _x (TERM.out) / _y (LANGUAGE_RELS.out) merge suffixes.
concepts.columns

In [None]:
# Narrow view of the concepts that have a TERM, for manual inspection.
c = concepts.loc[
    concepts['TERM'].notnull(),
    ['TERM', 'SUBJECT_ID_x', 'TERM_ID', 'QUALIFIER', 'use_concept'],
]
# NOTE: only the last expression of a cell is displayed, so the result of this
# lookup is computed and then discarded.
c.loc[c['TERM_ID'].isin(['1000007466', '1000290603'])]

# Concept row(s) whose display label is exactly 'churches (building)'.
d = c.loc[c['use_concept'] == 'churches (building)']
d

In [None]:
# Attach concept data to every (item, subject) row via the lower-cased label.
# pandas matches null join keys to each other, so concept rows with no
# join_concept are excluded first; otherwise rows without a subject would be
# paired with them and stop appearing as unmatched (null TERM_ID).
subjects_concat_merge = pd.merge(
    df_combined,
    concepts[concepts['join_concept'].notnull()],
    how="left",
    on='join_concept',
)

In [None]:
# Rows for one sample media file, and the subject string of its
# 'churches (building)' row. .item() raises unless exactly one row matches.
a = df_combined.loc[df_combined['media'] == 'A-2_Arquin_003_0349.jpg']
a_item = a.loc[a['subjects'] == 'churches (building)', 'subjects'].item()
a_item

In [None]:
# Sanity check: the concept label and the spreadsheet subject are the same string.
d['use_concept'].item() == a.loc[a['subjects'] == 'churches (building)', 'subjects'].item()

In [None]:
# Trial join of the sample rows against the single concept row, matching the
# original-case subject to the concept's display label.
d_test = a.merge(d, how='left', left_on='subjects', right_on='use_concept')
d_test

In [None]:
# Merged result restricted to the identifying columns plus the matched concept IDs.
subjects_concat_merge.loc[:, narrow_list]

In [None]:
# Same narrow view, limited to one sample media file.
subjects_concat_merge.loc[
    subjects_concat_merge['media'] == 'A-2_Arquin_003_0349.jpg',
    narrow_list,
]

In [None]:
# Subjects that found no concept: the left merge left their TERM_ID null.
unmatched_concepts = subjects_concat_merge.loc[subjects_concat_merge['TERM_ID'].isnull(), 'subjects']

In [None]:
# For each unmatched subject, count the rows with a media value, most frequent first.
empty_term = subjects_concat_merge.loc[subjects_concat_merge['TERM_ID'].isnull()]
empty_term_n = (
    empty_term.groupby(['subjects'])['media']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)
empty_term_n

In [None]:
# Join every matched row back to all concept rows sharing its SUBJECT_ID_x,
# i.e. the other terms recorded for the same subject. Columns present on both
# sides pick up a second _x/_y suffix (e.g. PREFERRED_y_y).
alt_lang = subjects_concat_merge.merge(concepts, how='left', on='SUBJECT_ID_x')
# Keep rows with no joined term, or whose joined term has PREFERRED_y_y == 'P'.
alt_lang = alt_lang.loc[
    alt_lang['PREFERRED_y_y'].isnull() | (alt_lang['PREFERRED_y_y'] == 'P')
]

In [None]:
# LANGUAGE_CODE value to keep -- presumably the code for Spanish in
# LANGUAGE_RELS.out; confirm against the AAT documentation.
espanol_code = '70641'

# Keep rows whose joined term is in that language, plus rows with no joined term.
alt_lang_espanol = alt_lang.loc[
    (alt_lang['LANGUAGE_CODE_y'] == espanol_code)
    | alt_lang['LANGUAGE_CODE_y'].isnull()
]
alt_lang_espanol[alt_lang_list]

In [None]:
# Title column of the combined frame (one entry per item/subject row).
df_combined.loc[:, 'title']