# Generate Output
This notebook reads the metadata from the [Arquin Spreadsheet](https://docs.google.com/spreadsheets/d/1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I/edit#gid=1718343431) and produces an output file for ingestion into Omeka.

In [1]:
# Address of the Arquin metadata spreadsheet (Google Sheets), split for readability.
url = (
    'https://docs.google.com/spreadsheets/d/'
    '1LRbios7yQRo3aqCh0Es2Wiae_dicg_OtL-_yqP-Tb8I'
    '/edit#gid=1718343431'
)

## import libraries

In [2]:
from IPython.display import clear_output
import pandas as pd
from pandas import Series
import numpy as np
import re
import csv
from gsheets import Sheets
from datetime import datetime

In [3]:
# Execute functions.ipynb in this kernel so the helpers it defines are available here
# (presumably including return_box, which is called later but not defined in this notebook).
%run functions.ipynb

## Load the spreadsheet information

In [5]:
# Build the gsheets client from the credential files in the home directory,
# then open the spreadsheet at `url`.
# if HttpAccessTokenRefreshError, close everything and start over
sheets = Sheets.from_files('~/client_secrets.json', '~/storage.json')
s = sheets.get(url)

## This is testing to figure out how to get all the data where it needs to go!

### Get the list of Identifiers by Box Number

In [6]:
# Collect the box numbers of every worksheet that holds box metadata.
# Only titles of the exact form "Box<separator><number>" (e.g. "Box 2") count;
# any other sheet whose title merely contains "Box" is skipped rather than
# crashing int() on a non-numeric suffix.

box_list = []
box_list_sheets = s.sheets.titles()

for b in box_list_sheets:
    box_title = re.fullmatch(r'Box.(\d+)\s*', b)
    if box_title:
        box_list.append(int(box_title.group(1)))

In [None]:
# Load the metadata for the second entry of box_list (index 1).
# NOTE(review): return_box is not defined in this notebook — presumably it comes from
# functions.ipynb and returns a DataFrame; the index is hard-coded, confirm it is the intended box.
x = return_box(box_list[1])

In [None]:
# Strip leading and trailing whitespace from the titles so they can be matched
# exactly against the translation sheet's Title column.
x['title'] = x['title'].str.strip()

### Get the title translations
from the worksheet 'unique_titles_06142022'

In [None]:
# Read the worksheet of unique titles and their translations into a DataFrame.
title_translations = s.find('unique_titles_06142022').to_frame()

In [None]:
# Metadata columns shown when inspecting individual records.
FIELDS_TO_RETURN = [
    'identifier',
    'media',
    'title',
    'subject',
    'description',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
]

In [None]:
# Spot-check a single record (identifier 'A-2 091') with just the fields of interest.
x[x['identifier'] == 'A-2 091'][FIELDS_TO_RETURN]

In [None]:
# Left-join the translations onto the box rows by title; indicator=True adds a `_merge`
# column recording whether each row found a match in the translation sheet.
box_with_trans = x.merge(title_translations, how='left', left_on='title', right_on='Title', indicator=True)

In [None]:
# List the column names produced by the merge.
box_with_trans.columns

In [None]:
# Translation columns plus the merge indicator, shown next to FIELDS_TO_RETURN when inspecting rows.
FIELDS_PLUS_TITLES = ['title_ES', 'title_PT', '_merge']

In [None]:
# Preview the combined column list.
FIELDS_PLUS_TITLES + FIELDS_TO_RETURN

In [None]:
# Spot-check record 'A-2 091' after the translation merge, including the translated titles.
box_with_trans[box_with_trans['identifier'] == 'A-2 091'][FIELDS_TO_RETURN + FIELDS_PLUS_TITLES]

## Load Arquin Metadata with validated Subjects for parsing subjects

In [None]:
# Execute the subject-review notebook in this kernel.
# NOTE(review): the next cells use a frame named `df` that is not defined in this
# notebook — presumably it is created here; confirm.
%run arquin_metadata_subjects_review_final.ipynb

In [None]:
# Join the subject rows in `df` onto the box rows by the `media` column.
# NOTE(review): merge defaults to an inner join, so box rows with no matching
# `media` value in `df` are dropped — confirm that is intended.
box_with_trans_and_subjects = box_with_trans.merge(df, on=['media'])

In [None]:
# Subject/term columns shown alongside the record fields in the inspection cells.
SUBJECT_TERMS = ['SUBJECT_ID', 'TERM_ID', 'LANGUAGE_CODE', 'use_concept', 'BoxId', 'color']

## Rename the columns post join 

In [None]:
# Restore the plain column names after the joins: the `_x`-suffixed copies get their
# unsuffixed name back and the translation columns become title_ES / title_PT.
# Every other column (including the `_y` copies) keeps its name, so it needs no entry
# in the mapping; labels that are absent from the frame are simply ignored by rename.
box_with_trans_and_subjects = box_with_trans_and_subjects.rename(columns={
    **{name + '_x': name for name in (
        'identifier', 'title', 'subject', 'description', 'creator',
        'publisher', 'date', 'rights', 'language', 'relation',
        'format', 'type', 'coverage', 'spatial',
        'PREFERRED', 'SUBJECT_ID',
    )},
    'Translation ES': 'title_ES',
    'Translation PT': 'title_PT',
})

In [None]:
# Let pandas display up to 500 columns so the wide joined frame is not truncated.
pd.set_option('display.max_columns', 500)

In [None]:
# Spot-check record 'A-2 091' with its translations and subject-term columns.
box_with_trans_and_subjects[box_with_trans_and_subjects['identifier'] == 'A-2 091'][FIELDS_TO_RETURN + FIELDS_PLUS_TITLES+ SUBJECT_TERMS]

## Generate a distinct list of subjects for the box

In [None]:
# Distinct (SUBJECT_ID, TERM_ID, use_concept) combinations present in this box.
box_with_trans_and_subjects[['SUBJECT_ID', 'TERM_ID', 'use_concept']].drop_duplicates()

## Define the Output Columns and labels

In [None]:
# Frame columns written to the export, in output order.
output = [
    'identifier',
    'media',
    'title',
    'subject',
    'description',
    'creator',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
]

In [None]:
# Header labels for the exported CSV: one `dcterms:` label per output column, same order.
dc_output = ['dcterms:' + term for term in (
    'identifier', 'media', 'title', 'subject', 'description',
    'creator', 'publisher', 'date', 'rights', 'language',
    'relation', 'format', 'type', 'coverage', 'spatial',
)]

## Create the Combination Columns for Title and Translations

In [None]:
# The title column and its two translation columns, in the order they are concatenated.
title_cols = ['title', 'title_ES', 'title_PT']

In [None]:
# Build one combined title per row: the title followed by whichever ES/PT translations
# are present, joined with ', ' (missing values are skipped).
box_with_trans_and_subjects['title_trans'] = (
    box_with_trans_and_subjects[title_cols]
    .apply(lambda row: ', '.join(row.dropna()), axis=1)
)

## Rename the columns post join 

In [None]:
# Restore the plain column names after the joins: the `_x`-suffixed copies get their
# unsuffixed name back and the translation columns become title_ES / title_PT.
# Every other column (including the `_y` copies) keeps its name, so it needs no entry
# in the mapping; labels that are absent from the frame are simply ignored by rename.
box_with_trans_and_subjects = box_with_trans_and_subjects.rename(columns={
    **{name + '_x': name for name in (
        'identifier', 'title', 'subject', 'description', 'creator',
        'publisher', 'date', 'rights', 'language', 'relation',
        'format', 'type', 'coverage', 'spatial',
        'PREFERRED', 'SUBJECT_ID',
    )},
    'Translation ES': 'title_ES',
    'Translation PT': 'title_PT',
})

In [None]:
# Let pandas display up to 500 columns so the wide joined frame is not truncated.
pd.set_option('display.max_columns', 500)

In [None]:
# Show the record fields, translations and subject-term columns for every row.
box_with_trans_and_subjects[FIELDS_TO_RETURN + FIELDS_PLUS_TITLES+ SUBJECT_TERMS]

## Generate a distinct list of subjects for the box

In [None]:
# Distinct (SUBJECT_ID, TERM_ID, use_concept) combinations present in this box.
box_with_trans_and_subjects[['SUBJECT_ID', 'TERM_ID', 'use_concept']].drop_duplicates()

## This will be the script for outputting the file to the CSV

In [None]:
# Write the export for this box, labelling the columns with the dcterms headers.
# The joined frame holds one row per (item, subject term), so identical output rows
# are collapsed first; otherwise every item would be exported several times.
box_with_trans_and_subjects[output].drop_duplicates().to_csv(
    path_or_buf='./csv_files/output_file_box2.csv',
    index=False, header=dc_output,
    quoting=csv.QUOTE_NONNUMERIC
)

In [None]:
# Spot-check the output columns for record 'A-2 091'.
box_with_trans_and_subjects[box_with_trans_and_subjects['identifier'] == 'A-2 091'][output]

## Define the Output Columns and labels

In [None]:
# Frame columns written to the export, in output order. This version uses the combined
# title (title_trans) and the combined subject string (use_concept_x).
output = [
    'identifier',
    'media',
    'title_trans',
    'use_concept_x',
    'description',
    'creator',
    'publisher',
    'date',
    'rights',
    'language',
    'relation',
    'format',
    'type',
    'coverage',
    'spatial',
]

In [None]:
# Header labels for the exported CSV: one `dcterms:` label per output column, same order.
dc_output = ['dcterms:' + term for term in (
    'identifier', 'media', 'title', 'subject', 'description',
    'creator', 'publisher', 'date', 'rights', 'language',
    'relation', 'format', 'type', 'coverage', 'spatial',
)]

## Create the Combination Columns for Title and Translations

In [None]:
# The title column and its two translation columns, in the order they are concatenated.
title_cols = ['title', 'title_ES', 'title_PT']

In [None]:
# Build one combined title per row: the title followed by whichever ES/PT translations
# are present, joined with '; ' (missing values are skipped).
box_with_trans_and_subjects['title_trans'] = (
    box_with_trans_and_subjects[title_cols]
    .apply(lambda row: '; '.join(row.dropna()), axis=1)
)

In [None]:
# Copy record 'A-2 091' (fields, translations, subject terms and combined title) to the
# system clipboard for inspection outside the notebook.
box_with_trans_and_subjects[box_with_trans_and_subjects['identifier'] == 'A-2 091'][FIELDS_TO_RETURN + FIELDS_PLUS_TITLES+ SUBJECT_TERMS+['title_trans']].to_clipboard()

## Create the Combination Columns for Subjects (eventually also translations)

### Example with a single record

In [None]:
# Single-record sample: the identifier, raw subject string and use_concept rows for 'A-2 091'.
bts_ss = box_with_trans_and_subjects[box_with_trans_and_subjects['identifier'] == 'A-2 091'][['identifier', 'subject', 'use_concept']]

In [None]:
# Collapse the sample to one row per (identifier, subject), joining its use_concept
# values into a single '; '-separated string.
bts_comb = (
    bts_ss
    .groupby(['identifier', 'subject'])['use_concept']
    .apply(lambda concepts: '; '.join(concepts))
    .reset_index()
)

In [None]:
# Display the combined sample.
bts_comb

### remove any use_concept records that are NaN

In [None]:
# Show the rows whose use_concept is missing.
box_with_trans_and_subjects[box_with_trans_and_subjects['use_concept'].isna()]

In [None]:
# Keep only the rows that have a use_concept value; the original frame is left untouched.
bts = box_with_trans_and_subjects.dropna(subset=['use_concept'])

In [None]:
# Record 'A-2 049' before dropping rows with a missing use_concept.
box_with_trans_and_subjects[box_with_trans_and_subjects['identifier']=='A-2 049'][['identifier','subject', 'use_concept']]

In [None]:
# The same record after the drop, for comparison.
bts[bts['identifier'] == 'A-2 049'][['identifier','subject', 'use_concept']]

### having dropped NaN use_concepts, create the DF with the combined subjects
schema is identifier, use_concepts (combined)

In [None]:
# One row per identifier, with all of its use_concept values joined into a single
# '; '-separated string.
bts_combined_subjects = (
    bts
    .groupby(['identifier'])['use_concept']
    .apply(lambda concepts: '; '.join(concepts))
    .reset_index()
)

In [None]:
# Attach the combined subject string to every row of the full frame. Both frames carry a
# use_concept column, so the combined one (left side) comes out as use_concept_x.
# NOTE(review): merge defaults to an inner join, so identifiers with no non-null
# use_concept are absent from output_test — confirm that is intended.
output_test = bts_combined_subjects.merge(box_with_trans_and_subjects, on='identifier')
# x.merge(title_translations, how='left', left_on='title', right_on='Title', indicator=True)

In [None]:
# Preview the export columns with repeated rows collapsed.
output_test[output].drop_duplicates()

## This will be the script for outputting the file to the CSV

In [None]:
# Write the export file, labelling the columns with the dcterms headers.
# output_test still holds one row per (item, subject term), so identical output rows
# are collapsed first; otherwise every item would be exported several times.
output_test[output].drop_duplicates().to_csv(
    path_or_buf='./csv_files/output_file.csv',
    index=False, header=dc_output,
    quoting=csv.QUOTE_NONNUMERIC
)

In [239]:
# Collapse the sample to one row per (identifier, subject), joining its use_concept
# values into a single '; '-separated string.
bts_comb = (
    bts_ss
    .groupby(['identifier', 'subject'])['use_concept']
    .apply(lambda concepts: '; '.join(concepts))
    .reset_index()
)

In [240]:
# Display the combined sample.
bts_comb

Unnamed: 0,identifier,subject,use_concept
0,A-2 091,"buildings (structures), streets, flags",buildings (structures); streets; flags


### remove any use_concept records that are NaN

In [238]:
# Show the rows whose use_concept is missing.
box_with_trans_and_subjects[box_with_trans_and_subjects['use_concept'].isna()]

Unnamed: 0.1,identifier,media,title,subject,description,creator,publisher,date,rights,language,relation,format,type,coverage,spatial,extra_notes_x,Index,Title,Spanish (from Google),title_ES,Unnamed: 4,Portuguese(from Google),title_PT,_merge,identifier_y,title_y,subject_y,description_y,creator_y,publisher_y,date_y,rights_y,language_y,relation_y,format_y,type_y,coverage_y,spatial_y,subjects,join_concept,extra_notes_y,unnamed: 2,\,unnamed: 4,AACR2_FLAG,DISPLAY_DATE,DISPLAY_NAME,DISPLAY_ORDER,END_DATE,HISTORIC_FLAG,OTHER_FLAGS,PREFERRED,START_DATE,SUBJECT_ID,TERM,TERM_ID,VERNACULAR,LANGUAGE_CODE,PREFERRED_y,SUBJECT_ID_y,QUALIFIER,TERM_TYPE,PART_OF_SPEECH,LANG_STAT,full_concept,use_concept,BoxId,Unnamed: 0,color,RGB,title_trans
419,A-2 049,A-2_Arquin_002_049.jpg,Llamas in La Paz,"llamas, cities",La Paz - Llamas,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),...",La Paz,32.0,Llamas in La Paz,Llamas en la paz,Llamas en la paz,,Lhamas em La Paz,Lhamas em La Paz,both,A-2 049,Llamas in La Paz,"llamas, cities",La Paz - Llamas,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),...",llamas,llamas,La Paz,,,,,,,,,,,,,,,,,,,,,,,,,,Box 2,33,purple,"(60, 0, 100)",Llamas in La Paz; Llamas en la paz; Lhamas em ...


In [241]:
# Keep only the rows that have a use_concept value; the original frame is left untouched.
bts = box_with_trans_and_subjects.dropna(subset=['use_concept'])

In [242]:
# Record 'A-2 049' before dropping rows with a missing use_concept.
box_with_trans_and_subjects[box_with_trans_and_subjects['identifier']=='A-2 049'][['identifier','subject', 'use_concept']]

Unnamed: 0,identifier,subject,use_concept
419,A-2 049,"llamas, cities",
420,A-2 049,"llamas, cities",cities


In [244]:
# The same record after the drop, for comparison.
bts[bts['identifier'] == 'A-2 049'][['identifier','subject', 'use_concept']]

Unnamed: 0,identifier,subject,use_concept
420,A-2 049,"llamas, cities",cities


### having dropped NaN use_concepts, create the DF with the combined subjects
schema is identifier, use_concepts (combined)

In [246]:
# One row per identifier, with all of its use_concept values joined into a single
# '; '-separated string.
bts_combined_subjects = (
    bts
    .groupby(['identifier'])['use_concept']
    .apply(lambda concepts: '; '.join(concepts))
    .reset_index()
)

In [255]:
# Attach the combined subject string to every row of the full frame. Both frames carry a
# use_concept column, so the combined one (left side) comes out as use_concept_x.
# NOTE(review): merge defaults to an inner join, so identifiers with no non-null
# use_concept are absent from output_test — confirm that is intended.
output_test = bts_combined_subjects.merge(box_with_trans_and_subjects, on='identifier')
# x.merge(title_translations, how='left', left_on='title', right_on='Title', indicator=True)

In [259]:
# Preview the export columns with repeated rows collapsed.
output_test[output].drop_duplicates()

Unnamed: 0,identifier,media,title_trans,use_concept_x,description,creator,publisher,date,rights,language,relation,format,type,coverage,spatial
0,A-1 269,A-1_Arquin_002_0269.jpg,View of Ghent; Vista de Ghent; Vista de Ghent,arch bridges; canals (waterways); cathedrals (...,,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,51.0500/ 3.7167,"Ghent (inhabited place), East Flanders (provin..."
3,A-1 270,A-1_Arquin_002_0270.jpg,View of Ghent; Vista de Ghent; Vista de Ghent,canals (waterways); streets; clock towers (tow...,,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,51.0500/ 3.7167,"Ghent (inhabited place), East Flanders (provin..."
6,A-1 271,A-1_Arquin_002_0271.jpg,Quay of Herbs; Muelle de hierbas; Cais das Ervas,quays; streets; clock towers (towers),Quay of Herbs,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,51.0500/ 3.7167,"Ghent (inhabited place), East Flanders (provin..."
9,A-1 272,A-1_Arquin_002_0272.jpg,Quay of Herbs; Muelle de hierbas; Cais das Ervas,quays; streets; clock towers (towers),Quay of Herbs,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,51.0500/ 3.7167,"Ghent (inhabited place), East Flanders (provin..."
12,A-1 273,A-1_Arquin_002_0273.jpg,Quay of Herbs; Muelle de hierbas; Cais das Ervas,quays; streets; clock towers (towers),Quay of Herbs,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,51.0500/ 3.7167,"Ghent (inhabited place), East Flanders (provin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680,A-2 334,A-2_Arquin_002_334.jpg,"Church of San Francisco, Upper Section; Iglesi...",churches (buildings); architectural sculpture,Colonial Churches - San Francisco (1753-1772) ...,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),..."
682,A-2 336,A-2_Arquin_002_336.jpg,"Church of San Francisco, Domes and Roofs; Igle...",churches (buildings); domes (architectural ele...,Colonial Churches - San Francisco (1753-1772) ...,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),..."
685,A-2 337,A-2_Arquin_002_337.jpg,"Church of San Francisco, Cloister; Iglesia de ...",churches (buildings); cloisters; children (peo...,Colonial Churches - San Francisco (1753-1772) ...,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),..."
688,A-2 338,A-2_Arquin_002_338.jpg,Church of San Sebastian; Iglesia de San Sebast...,churches (buildings); bell towers; portals,Colonial Churches - San Sebastian Entirely Reb...,"Florence Arquin, photographer",Florida Atlantic University,,The copyright and related rights status of thi...,,,35 mm kodachrome slide,still image,-16.500/ -68.1667,"La Paz (inhabited place), La Paz (department),..."


## This will be the script for outputting the file to the CSV

In [260]:
# Write the export file, labelling the columns with the dcterms headers.
# output_test still holds one row per (item, subject term), so identical output rows
# are collapsed first; otherwise every item would be exported several times.
output_test[output].drop_duplicates().to_csv(
    path_or_buf='./csv_files/output_file.csv',
    index=False, header=dc_output,
    quoting=csv.QUOTE_NONNUMERIC
)