# Generate Output
This notebook reads the metadata from the [Arquin Spreadsheet](https://docs.google.com/spreadsheets/d/1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I/edit#gid=1718343431) and produces an output file for ingestion into Omeka.

In [1]:
# Address of the Google spreadsheet opened further down with sheets.get(url).
url = (
    'https://docs.google.com/spreadsheets/d/'
    '1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I'
    '/edit#gid=1718343431'
)

In [None]:
# Subject/concept columns appended to the column selection in the example cells.
SUBJECT_TERMS = [
    'SUBJECT_ID',
    'TERM_ID',
    'LANGUAGE_CODE',
    'use_concept',
    'BoxId',
    'color',
]

In [None]:
# Base metadata columns shown in the example cells.
# NOTE(review): unlike `output`, this list has no 'creator' entry -- confirm that is intended.
FIELDS_TO_RETURN = [
    'identifier',
    'media',
    'title',
    'subject',
    'description',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
]

## Define the Output Columns and labels

In [None]:
# DataFrame columns written to the CSV, in output order.
output = [
    'identifier',
    'media',
    'title',
    'title_ES',
    'title_PT',
    'subject',
    'description',
    'creator',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
]

In [None]:
# Header labels written to the CSV in place of the `output` column names
# (same length, same order). 'dcterms:title' appears three times, once per
# title language.
dc_output = ['dcterms:' + term for term in (
    'identifier',
    'media',
    'title',  # English
    'title',  # Spanish
    'title',  # Portuguese
    'subject',
    'description',
    'creator',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
)]

## import libraries

In [None]:
from IPython.display import clear_output
import pandas as pd
from pandas import Series
import numpy as np
import re
import csv
from gsheets import Sheets
from datetime import datetime

In [None]:
# Let pandas display up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

In [None]:
# Execute functions.ipynb inside this kernel so the names it defines become
# available here. `return_box`, called below, is not defined in this notebook;
# presumably it comes from there -- confirm.
%run functions.ipynb

## Load the spreadsheet information

In [None]:
# Build the gsheets client from the credential files in the home directory and
# open the spreadsheet at `url`; `s` is used below to list and look up worksheets.
# If this raises HttpAccessTokenRefreshError, close everything and start over.
sheets = Sheets.from_files('~/client_secrets.json', '~/storage.json')
s = sheets.get(url)

## This is testing to figure out how to get all the data where it needs to go!

### Get the list of Identifiers by Box Number

In [None]:
# Collect the box numbers from every worksheet whose title contains 'Box'.
# The number is parsed from whatever follows the first four characters of the
# title (presumably titles look like 'Box 2' -- confirm against the spreadsheet).
box_list_sheets = s.sheets.titles()
box_list = [
    int(sheet_title[4:])
    for sheet_title in box_list_sheets
    if 'Box' in sheet_title
]

In [None]:
# Load the metadata for one box. `return_box` is not defined in this notebook
# (presumably provided by functions.ipynb).
# NOTE(review): box_list[1] is the second entry in worksheet order, while the CSV
# written at the end is named output_file_box2.csv -- confirm this index is Box 2.
x = return_box(box_list[1])

In [None]:
# Strip leading and trailing whitespace from the titles; the title text is the
# key used to join in the translations below.
x['title'] = x['title'].str.strip()

### Get the title translations
from the worksheet 'unique_titles_06142022'

In [None]:
# Look up the 'unique_titles_06142022' worksheet and convert it to a DataFrame.
# Later cells join on its 'Title' column; the 'Translation ES' / 'Translation PT'
# columns renamed further down presumably come from this sheet as well.
title_translations = s.find('unique_titles_06142022').to_frame()

In [None]:
# Example: the base fields of the record(s) with identifier 'A-2 091'.
x.loc[x['identifier'] == 'A-2 091', FIELDS_TO_RETURN]

In [None]:
# Left-join the translations onto the box rows by title text, so every box row
# is kept. indicator=True adds a '_merge' column recording whether a
# translation row matched.
box_with_trans = pd.merge(
    x,
    title_translations,
    how='left',
    left_on='title',
    right_on='Title',
    indicator=True,
)

In [None]:
# Display the column labels of the merged frame (box fields plus translation
# sheet columns).
box_with_trans.columns

In [None]:
# Extra columns shown next to FIELDS_TO_RETURN in the examples: the two
# translated titles and the merge indicator.
FIELDS_PLUS_TITLES = [
    'title_ES',
    'title_PT',
    '_merge',
]

In [None]:
# Give the translation columns the names used in `output`.
box_with_trans = box_with_trans.rename(
    {'Translation ES': 'title_ES', 'Translation PT': 'title_PT'},
    axis='columns',
)

In [None]:
# Example: base fields plus translated titles for identifier 'A-1 269'.
box_with_trans.loc[
    box_with_trans['identifier'] == 'A-1 269',
    FIELDS_TO_RETURN + FIELDS_PLUS_TITLES,
]

## Load Arquin Metadata with validated Subjects for parsing subjects

In [None]:
# Execute the subjects-review notebook inside this kernel. `df` and `concepts`,
# used below, are not defined in this notebook; presumably this run creates
# them -- confirm.
%run arquin_metadata_subjects_review_final.ipynb

In [None]:
# Inner-join the translated box rows with the subjects frame on 'media'.
# Columns present on both sides get pandas' default '_x' (left) / '_y' (right)
# suffixes, which the next cell cleans up.
box_with_trans_and_subjects = pd.merge(box_with_trans, df, on='media')

## Rename the columns post join 

In [None]:
# Drop the '_x' suffix from the columns listed below and give the translation
# columns their output names. Every other column (including all '_y' columns)
# keeps its current name, so it needs no entry in the mapping.
box_with_trans_and_subjects = box_with_trans_and_subjects.rename(
    columns={
        **{
            name + '_x': name
            for name in (
                'identifier', 'title', 'subject', 'description', 'creator',
                'publisher', 'date', 'rights', 'language', 'relation',
                'format', 'type', 'coverage', 'spatial',
                'PREFERRED', 'SUBJECT_ID',
            )
        },
        'Translation ES': 'title_ES',
        'Translation PT': 'title_PT',
    }
)

In [None]:
# Example: base fields, translated titles and subject terms for 'A-1 269'.
box_with_trans_and_subjects.loc[
    box_with_trans_and_subjects['identifier'] == 'A-1 269',
    FIELDS_TO_RETURN + FIELDS_PLUS_TITLES + SUBJECT_TERMS,
]

## Generate a distinct list of subjects for the box

In [None]:
# Distinct (SUBJECT_ID, TERM_ID, use_concept) combinations found in this box.
# The variable name is spelled 'subjets' because later cells reference it as such.
subjets_dist = box_with_trans_and_subjects.loc[
    :, ['SUBJECT_ID', 'TERM_ID', 'use_concept']
].drop_duplicates()

In [None]:
# Display the distinct subject combinations for this box.
subjets_dist

In [None]:
# Distinct concept rows whose PREFERRED_y flag is 'P', limited to the id,
# term, language and flag columns.
concepts.loc[
    concepts['PREFERRED_y'] == 'P',
    ['SUBJECT_ID_y', 'TERM_ID', 'use_concept', 'LANGUAGE_CODE', 'PREFERRED_y'],
].drop_duplicates()

In [None]:
# Distinct language codes among the concept rows whose PREFERRED_y flag is 'P'.
concepts.loc[concepts['PREFERRED_y'] == 'P', ['LANGUAGE_CODE']].drop_duplicates()

In [None]:
# Display the distinct subject combinations for this box (same output as the
# earlier `subjets_dist` cell).
subjets_dist

In [None]:
# Distinct concept rows whose PREFERRED_y flag is 'P', limited to the id,
# term, language and flag columns.
concepts.loc[
    concepts['PREFERRED_y'] == 'P',
    ['SUBJECT_ID_y', 'TERM_ID', 'use_concept', 'LANGUAGE_CODE', 'PREFERRED_y'],
].drop_duplicates()

In [None]:
# Distinct language codes among the concept rows whose PREFERRED_y flag is 'P'.
concepts.loc[concepts['PREFERRED_y'] == 'P', ['LANGUAGE_CODE']].drop_duplicates()

## Create the Combination Columns for Title and Translations

This step is no longer needed now that the title columns are kept separate.

In [None]:
# title_cols = ['title', 'title_ES', 'title_PT']

In [None]:
# box_with_trans_and_subjects['title_trans'] = box_with_trans_and_subjects[title_cols].apply(lambda title_cols: ', '.join(title_cols.dropna()), axis=1)

### Rename the columns post join 

In [None]:
# box_with_trans_and_subjects = box_with_trans_and_subjects.rename(columns={'identifier_x': 'identifier', 
# 'media': 'media', 
# 'title_x': 'title', 
# 'subject_x': 'subject', 
# 'description_x': 'description', 
# 'creator_x': 'creator', 
# 'publisher_x': 'publisher', 
# 'date_x': 'date', 
# 'rights_x': 'rights', 
# 'language_x': 'language', 
# 'relation_x': 'relation', 
# 'format_x': 'format', 
# 'type_x': 'type', 
# 'coverage_x': 'coverage', 
# 'spatial_x': 'spatial', 
# 'Index': 'Index', 
# 'Title': 'Title', 
# 'Translation ES': 'title_ES', 
# 'Translation PT': 'title_PT', 
# '_merge': '_merge', 
# 'identifier_y': 'identifier_y', 
# 'title_y': 'title_y', 
# 'subject_y': 'subject_y', 
# 'description_y': 'description_y', 
# 'creator_y': 'creator_y', 
# 'publisher_y': 'publisher_y', 
# 'date_y': 'date_y', 
# 'rights_y': 'rights_y', 
# 'language_y': 'language_y', 
# 'relation_y': 'relation_y', 
# 'format_y': 'format_y', 
# 'type_y': 'type_y', 
# 'coverage_y': 'coverage_y', 
# 'spatial_y': 'spatial_y', 
# 'subjects': 'subjects', 
# 'join_concept': 'join_concept', 
# 'extra_notes': 'extra_notes', 
# 'unnamed: 2': 'unnamed: 2', 
# 'unnamed: 4': 'unnamed: 4', 
# 'AACR2_FLAG': 'AACR2_FLAG', 
# 'DISPLAY_DATE': 'DISPLAY_DATE', 
# 'DISPLAY_NAME': 'DISPLAY_NAME', 
# 'DISPLAY_ORDER': 'DISPLAY_ORDER', 
# 'END_DATE': 'END_DATE', 
# 'HISTORIC_FLAG': 'HISTORIC_FLAG', 
# 'OTHER_FLAGS': 'OTHER_FLAGS', 
# 'PREFERRED_x': 'PREFERRED', 
# 'START_DATE': 'START_DATE', 
# 'SUBJECT_ID_x': 'SUBJECT_ID', 
# 'TERM': 'TERM', 
# 'TERM_ID': 'TERM_ID', 
# 'VERNACULAR': 'VERNACULAR', 
# 'LANGUAGE_CODE': 'LANGUAGE_CODE', 
# 'PREFERRED_y': 'PREFERRED_y', 
# 'SUBJECT_ID_y': 'SUBJECT_ID_y', 
# 'QUALIFIER': 'QUALIFIER', 
# 'TERM_TYPE': 'TERM_TYPE', 
# 'PART_OF_SPEECH': 'PART_OF_SPEECH', 
# 'LANG_STAT': 'LANG_STAT', 
# 'full_concept': 'full_concept', 
# 'use_concept': 'use_concept', 
# 'BoxId': 'BoxId', 
# 'Unnamed: 0': 'Unnamed: 0', 
# 'color': 'color', 
# 'RGB': 'RGB'})

In [None]:
# Display the base fields, translated titles and subject terms for every row.
box_with_trans_and_subjects.loc[
    :, FIELDS_TO_RETURN + FIELDS_PLUS_TITLES + SUBJECT_TERMS
]

## Generate a distinct list of subjects for the box

In [None]:
# Distinct (SUBJECT_ID, TERM_ID, use_concept) combinations found in this box.
box_with_trans_and_subjects.loc[
    :, ['SUBJECT_ID', 'TERM_ID', 'use_concept']
].drop_duplicates()

## Review the output with this Example

In [None]:
# Example: the distinct output rows for identifier 'A-1 269'.
box_with_trans_and_subjects.loc[
    box_with_trans_and_subjects['identifier'] == 'A-1 269',
    output,
].drop_duplicates()

## Create the Combination Columns for Title and Translations

In [None]:
# The English, Spanish and Portuguese title columns.
# NOTE(review): the only visible use of this list is in commented-out cells.
title_cols = [
    'title',
    'title_ES',
    'title_PT',
]

In [None]:
# # Combined titles
# box_with_trans_and_subjects['title_trans'] = box_with_trans_and_subjects[title_cols].apply(lambda title_cols: ';'.join(title_cols.dropna()), axis=1)

In [None]:
# Example: base fields, translated titles and subject terms for 'A-2 091'.
# 'title_trans' is deliberately not selected: the cells that would build that
# combined-title column are commented out, so asking for it raises a KeyError
# on a fresh Restart & Run All.
box_with_trans_and_subjects.loc[
    box_with_trans_and_subjects['identifier'] == 'A-2 091',
    FIELDS_TO_RETURN + FIELDS_PLUS_TITLES + SUBJECT_TERMS,
]

## Create the Combination Columns for Subjects (eventually also translations)

### example with  a single record

In [None]:
# Single-record sample: the subject-related columns for identifier 'A-2 091'.
bts_ss = box_with_trans_and_subjects.loc[
    box_with_trans_and_subjects['identifier'] == 'A-2 091',
    ['identifier', 'subject', 'use_concept', 'SUBJECT_ID'],
]

In [None]:
# Join the sample's use_concept values into one ';'-separated string per
# (identifier, subject) pair.
# NOTE(review): the following cell reassigns bts_comb, so this result is never shown.
bts_comb = (
    bts_ss
    .groupby(['identifier', 'subject'])['use_concept']
    .apply(';'.join)
    .reset_index()
)

In [None]:
# Join the sample's SUBJECT_ID values into one ';'-separated string per
# (identifier, subject) pair.
# assumes SUBJECT_ID holds strings -- str.join raises TypeError otherwise; TODO confirm
bts_comb = (
    bts_ss
    .groupby(['identifier', 'subject'])['SUBJECT_ID']
    .apply(';'.join)
    .reset_index()
)

In [None]:
# Display the combined values for the single-record sample.
bts_comb

### remove any use_concept records that are NaN

In [None]:
# Show the rows that have no use_concept value.
box_with_trans_and_subjects.loc[box_with_trans_and_subjects['use_concept'].isna()]

In [None]:
# Copy of the joined frame without the rows whose use_concept is NaN.
# NOTE(review): the next cell assigns `bts` again from the full frame.
bts = box_with_trans_and_subjects.dropna(subset=['use_concept'])

In [None]:
# Keep only the rows that have both a use_concept and a SUBJECT_ID. Both
# columns are string-joined per identifier further down, and str.join cannot
# handle NaN. Filtering on SUBJECT_ID alone, starting again from the full
# frame, would bring back the rows whose use_concept is NaN.
bts = box_with_trans_and_subjects.dropna(subset=['use_concept', 'SUBJECT_ID'])

In [None]:
# Example: subject columns for 'A-1 269' in the unfiltered frame.
box_with_trans_and_subjects.loc[
    box_with_trans_and_subjects['identifier'] == 'A-1 269',
    ['identifier', 'subject', 'use_concept', 'SUBJECT_ID'],
]

In [None]:
# Example: the same record in `bts`, i.e. after the NaN rows were dropped.
bts.loc[
    bts['identifier'] == 'A-1 269',
    ['identifier', 'subject', 'use_concept', 'SUBJECT_ID'],
]

### having dropped NaN use_concepts, create the DF with the combined subjects
schema is identifier, use_concepts (combined)

In [None]:
# One row per identifier, with all of its use_concept values joined by '; '.
bts_combined_subjects = (
    bts
    .groupby(['identifier'])['use_concept']
    .apply('; '.join)
    .reset_index()
)

In [None]:
# Attach the per-identifier combined subjects to the rows of the joined frame.
# Both inputs carry a 'use_concept' column, so the result holds them as
# 'use_concept_x' (combined string) and 'use_concept_y' (per-row value).
# This is an inner join: identifiers with no row in bts_combined_subjects drop out.
output_test_subj = pd.merge(
    bts_combined_subjects,
    box_with_trans_and_subjects,
    on='identifier',
)

In [None]:
# One row per identifier, with all of its SUBJECT_ID values joined by '; '.
# assumes SUBJECT_ID holds strings -- str.join raises TypeError otherwise; TODO confirm
bts_combined_subject_ids = (
    bts
    .groupby(['identifier'])['SUBJECT_ID']
    .apply('; '.join)
    .reset_index()
)

In [None]:
# Rename the combined column so that the final join does not end up with two
# SUBJECT_ID columns distinguished only by suffix. 'identifier' keeps its name.
bts_combined_subject_ids = bts_combined_subject_ids.rename(
    {'SUBJECT_ID': 'subject_id_comb_en'},
    axis='columns',
)

In [None]:
# Add the combined subject ids to every row, matching on identifier (inner join).
output_test = pd.merge(output_test_subj, bts_combined_subject_ids, on='identifier')

In [None]:
# Preview the distinct output rows together with the combined subject ids.
output_test.loc[:, output + ['subject_id_comb_en']].drop_duplicates()

In [None]:
# Attach the per-identifier combined subjects to the rows of the joined frame.
# Both inputs carry a 'use_concept' column, so the result holds them as
# 'use_concept_x' (combined string) and 'use_concept_y' (per-row value).
# This is an inner join: identifiers with no row in bts_combined_subjects drop out.
output_test_subj = pd.merge(
    bts_combined_subjects,
    box_with_trans_and_subjects,
    on='identifier',
)

In [None]:
# One row per identifier, with all of its SUBJECT_ID values joined by '; '.
# assumes SUBJECT_ID holds strings -- str.join raises TypeError otherwise; TODO confirm
bts_combined_subject_ids = (
    bts
    .groupby(['identifier'])['SUBJECT_ID']
    .apply('; '.join)
    .reset_index()
)

In [None]:
# Rename the combined column so that the final join does not end up with two
# SUBJECT_ID columns distinguished only by suffix. 'identifier' keeps its name.
bts_combined_subject_ids = bts_combined_subject_ids.rename(
    {'SUBJECT_ID': 'subject_id_comb_en'},
    axis='columns',
)

In [None]:
# Add the combined subject ids to every row, matching on identifier (inner join).
output_test = pd.merge(output_test_subj, bts_combined_subject_ids, on='identifier')

In [None]:
# Final table: the output columns plus the combined subject ids, de-duplicated.
final_output = output_test.loc[:, output + ['subject_id_comb_en']].drop_duplicates()

In [None]:
# Display the final table before it is written to disk.
final_output

## This will be the script for outputting the file to the CSV

In [None]:
# Write the `output` columns to CSV, using the dc_output labels as the header
# row, omitting the index and quoting every non-numeric field.
# 'subject_id_comb_en' is not in `output`, so it is left out of the file.
final_output.loc[:, output].to_csv(
    './csv_files/output_file_box2.csv',
    header=dc_output,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

## This will be the script for outputting the file to the CSV

In [58]:
# Write the `output` columns to CSV, using the dc_output labels as the header
# row, omitting the index and quoting every non-numeric field.
# 'subject_id_comb_en' is not in `output`, so it is left out of the file.
final_output.loc[:, output].to_csv(
    './csv_files/output_file_box2.csv',
    header=dc_output,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)