In [2]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import matplotlib

### Step 1: Importing/Exporting of Data

In [3]:
#new version expects dtype on import however as of now only 1 row is impacted
#latinamerican_art = pd.read_csv('../../../data_samples/art_tables/latin_art.csv', on_bad_lines='skip')

In [103]:
# Load the expanded dataset. This run is a test of updating the latin_art SQL the
# same way it was done for non_la_art, so the source file is non_latin_art.csv
# even though the frame is named latinamerican_art.
# NOTE(review): on_bad_lines='skip' drops malformed rows without warning — confirm that is acceptable.
latinamerican_art = pd.read_csv(
    '../../../data_samples/art_tables/non_latin_art.csv',
    low_memory=False,
    on_bad_lines='skip',
)

 Saving this column to its' own dataframe will allow for the EDA notebook 'Geography and Demography' to be ran and utilized later

In [104]:
#Saving the 'nationality' feature to the constituents nationalities CSV file
#latinamerican_art.nationality.to_csv('../../../data_samples/LaArt/constituents_nationalities.csv', index=False)

In [105]:
# I need to import a copy of the constituents 'nationality' column from the latin_art.csv dataset
#constituent_nationalities = pd.read_csv('../../../data_samples/LaArt/constituents_nationalities.csv')

These columns exist in the la_continentCounts & la_geographicStatistics, except for expanded_url which is added as its own.

In [106]:
# Show the loaded frame's column names alongside their positional index.
pd.Series(latinamerican_art.columns)

0                            uuid
1                         iiifurl
2                    iiifthumburl
3                     accessioned
4                           title
5             displayDate_created
6                        roletype
7                            role
8              forwarddisplayname
9                       birthyear
10                      deathyear
11                         ulanid
12              artistofngaobject
13                    nationality
14                constituenttype
15       beginyear_artistAssigned
16         endyear_artistAssigned
17         country_artistAssigned
18         zipcode_artistAssigned
19                         medium
20                     dimensions
21                    inscription
22                       markings
23                    attribution
24    visualBrowserClassification
25                       parentID
26                      isVirtual
27                      portfolio
28                         series
29            

In [107]:
# (rows, columns) of the frame as loaded, before the geographic merge further down.
latinamerican_art.shape

(463799, 40)

### Step 2: of the La Art Pipeline - Feature Engineering new geographical features

#### List of American Continent Codes for Determining which Latin American countries are present in the Gallery.

In [108]:
# Collect the alpha-2 and alpha-3 code of every entry in pycountry's country list.
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [109]:
# Wrap both code lists in Series so Series methods can be used on them.
cname_alpha_2, cname_alpha_3 = pd.Series(cname_alpha_2), pd.Series(cname_alpha_3)

In [110]:
# Alpha-2 codes that are never sent to the continent lookup
# (presumably because pycountry_convert has no continent for them — verify).
error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']


def _continent_code_or_placeholder(alpha_2):
    """Return the continent code for ``alpha_2`` or the 'non-transformable' placeholder.

    Codes listed in ``error_list`` get the placeholder without a lookup, exactly
    as before. Any other code the lookup rejects with ``KeyError`` (for example
    a country added by a newer pycountry release) also gets the placeholder,
    instead of aborting the whole cell.
    """
    if alpha_2 in error_list:
        return 'non-transformable'
    try:
        return pc.country_alpha2_to_continent_code(alpha_2)
    except KeyError:
        return 'non-transformable'


continent_names = cname_alpha_2.apply(_continent_code_or_placeholder)

In [111]:
# Two-column lookup table: one row per country code with its continent code.
country_and_continent = pd.DataFrame({'Country': cname_alpha_2, 'Continent': continent_names})

In [112]:
# Number of non-null entries per continent code, for every remaining column.
total_by_continent = country_and_continent.groupby(by='Continent').count()

In [113]:
# Replace continent codes in the index with continent names; the placeholder
# label is not a real code, so it is passed through unchanged.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    label if label in remove_from_index else pc.convert_continent_code_to_continent_name(label)
    for label in total_by_continent.index
]

### Estimated Actual Distribution of Countries by Continent, World Wide

In [114]:
# Look up the country name for each alpha-2 code.
country_and_continent['Country Name'] = country_and_continent['Country'].map(pc.country_alpha2_to_country_name)

In [115]:
# Continent name for each continent code; the placeholder label is kept as-is.
country_and_continent['Continent Name'] = country_and_continent['Continent'].map(
    lambda code: code if code == 'non-transformable' else pc.convert_continent_code_to_continent_name(code)
)

In [116]:
# The literal string 'NA' is read as null by many data structures, so the
# North America code is stored as 'NoAm' instead.
country_and_continent['Continent'] = country_and_continent['Continent'].replace('NA', 'NoAm')

In [117]:
# Country-code counts per continent, with the index now holding continent names.
total_by_continent

Unnamed: 0,Country
Africa,57
Antarctica,2
Asia,53
Europe,50
North America,40
Oceania,24
South America,15
non-transformable,8


In [118]:
# Alpha-2 codes of the North American countries this analysis counts as Latin American.
Latin_in_NA = ['BZ', 'CR', 'CU', 'DO', 'SV', 'GT', 'HT', 'HN', 'JM', 'MX', 'NI', 'PA', 'LC']
latin_in_NA_map = country_and_continent['Country'].isin(Latin_in_NA)
latin_in_SA_map = country_and_continent['Continent'].eq('SA')

# Select the matching rows directly with the boolean mask. The earlier
# .where(mask).dropna() form would also have discarded a matching row if any of
# its other columns were null. 'Country Name' is carried over from
# country_and_continent, so it is not recomputed here.
latins = country_and_continent.loc[latin_in_SA_map | latin_in_NA_map].reset_index(drop=True)
# Plain Python attribute on the frame (not a column); kept in case later code reads it.
latins.name = 'Latin Countries'

### Latin American Group (Feature Engineering)

In [119]:
# Preview the full country/continent lookup table.
country_and_continent.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,NoAm,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,NoAm,Anguilla,North America
4,AX,EU,Åland Islands,Europe


In [120]:
# Preview the Latin American subset of the lookup table.
latins.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AR,SA,Argentina,South America
1,BZ,NoAm,Belize,North America
2,BO,SA,"Bolivia, Plurinational State of",South America
3,BR,SA,Brazil,South America
4,CL,SA,Chile,South America


In [121]:
# Relative frequency of each value in the 'nationality' column.
la_artist_origin = latinamerican_art['nationality'].value_counts(normalize=True)

In [122]:
# Demonym for each Latin American country, keyed by alpha-2 code. Mapping by code
# (instead of assigning a positional list) keeps every demonym attached to the
# right country even if pycountry's row order or membership changes; a code
# missing from this mapping yields NaN rather than shifting every later label.
demonym_by_alpha_2 = {
    'AR': 'Argentinean',
    'BZ': 'Belizean',
    'BO': 'Bolivian',
    'BR': 'Brazilian',
    'CL': 'Chilean',
    'CO': 'Colombian',
    'CR': 'Costa Rican',
    'CU': 'Cuban',
    'DO': 'Dominican',
    'EC': 'Ecuadorian',
    'FK': 'Falkland Islander',
    'GT': 'Guatemalan',
    'GF': 'Guianese',
    'GY': 'Guyanese',
    'HN': 'Honduran',
    'HT': 'Haitian',
    'JM': 'Jamaican',
    'LC': 'Saint Lucian',
    'MX': 'Mexican',
    'NI': 'Nicaraguan',
    'PA': 'Panamanian',
    'PE': 'Peruvian',
    'PY': 'Paraguayan',
    'GS': 'South Georgian',
    'SV': 'Salvadoran',
    'SR': 'Surinamese',
    'UY': 'Uruguayan',
    'VE': 'Venezuelan',
}
latins['demonym'] = latins['Country'].map(demonym_by_alpha_2)

In [123]:
# Move the nationality labels out of the index into an ordinary column.
la_artist_origin = la_artist_origin.reset_index(drop=False)

In [124]:
# Name the two columns so the nationality column can be joined on 'demonym'.
la_artist_origin = la_artist_origin.set_axis(['demonym', 'pct_country_NGA'], axis=1)

In [125]:
# Inspect the nationality shares before filtering to Latin American demonyms.
la_artist_origin

Unnamed: 0,demonym,pct_country_NGA
0,American,0.704890
1,French,0.102478
2,Italian,0.045463
3,German,0.041107
4,British,0.029466
...,...,...
208,None.,0.000005
209,3262,0.000005
210,4685,0.000005
211,3483,0.000005


In [126]:
# Inner join on 'demonym': keeps only nationalities that match a row in latins
# and attaches that row's country/continent columns.
la_geographicStatistics = la_artist_origin.merge(latins, on='demonym', how='inner')

#### After using the whole dataset to apply the filter to select for latinamerican artists, there are 10 nationalities present.

In [127]:
# Nationalities that matched a Latin American demonym, with their geographic columns.
la_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,Mexican,0.001012,MX,NoAm,Mexico,North America
1,Brazilian,0.000183,BR,SA,Brazil,South America
2,Argentinean,0.000134,AR,SA,Argentina,South America
3,Guatemalan,0.000129,GT,NoAm,Guatemala,North America
4,Chilean,0.000104,CL,SA,Chile,South America
5,Venezuelan,7.9e-05,VE,SA,"Venezuela, Bolivarian Republic of",South America
6,Cuban,1.5e-05,CU,NoAm,Cuba,North America
7,Peruvian,1.5e-05,PE,SA,Peru,South America
8,Uruguayan,1e-05,UY,SA,Uruguay,South America
9,Colombian,1e-05,CO,SA,Colombia,South America


In [128]:
# Number of matched countries per continent. size() returns the same per-group
# row count as apply(len) without going through groupby.apply, which is slower
# and emits a grouping-column DeprecationWarning in recent pandas releases.
la_continentCounts = la_geographicStatistics.groupby('Continent Name').size()

In [129]:
# Label the count Series.
la_continentCounts = la_continentCounts.rename('Countrys_in_Continents')

In [130]:
# Matched-country count per continent.
la_continentCounts

Continent Name
North America    4
South America    7
Name: Countrys_in_Continents, dtype: int64

In [131]:
# Sum the per-country shares within each continent.
pct_continent_NGA = la_geographicStatistics.groupby('Continent Name')['pct_country_NGA'].agg('sum')

In [132]:
# Summed nationality share per continent.
pct_continent_NGA

Continent Name
North America    0.001160
South America    0.000536
Name: pct_country_NGA, dtype: float64

In [133]:
# Full per-continent country counts, shown before narrowing to the Americas.
total_by_continent

Unnamed: 0,Country
Africa,57
Antarctica,2
Asia,53
Europe,50
North America,40
Oceania,24
South America,15
non-transformable,8


In [134]:
# Keep only the two continents that the Latin American subset spans.
total_by_continent = total_by_continent.loc[['North America', 'South America']]

In [135]:
# Per continent: how many countries matched, and that count as a fraction of
# all country codes assigned to the continent.
la_continentCounts = pd.DataFrame({
    'countries_present': la_continentCounts,
    'proportion_of_continent': la_continentCounts.div(total_by_continent['Country']),
})

In [136]:
# Attach the summed nationality share per continent (aligned on the continent index).
la_continentCounts = la_continentCounts.assign(pct_continent_NGA=pct_continent_NGA)

In [137]:
# Per-continent summary: countries present, proportion of continent, summed share.
la_continentCounts

Unnamed: 0_level_0,countries_present,proportion_of_continent,pct_continent_NGA
Continent Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
North America,4,0.1,0.00116
South America,7,0.466667,0.000536


In [138]:
# Countries on each continent that have no matching nationality in the data.
la_continentCounts_missing = total_by_continent['Country'].sub(la_continentCounts['countries_present'])

In [139]:
# Label the missing-count Series.
la_continentCounts_missing = la_continentCounts_missing.rename('Missing_from_Actual_Distribution')

In [140]:
# Per continent: number of countries missing, and that number as a fraction of
# all country codes assigned to the continent.
la_continentCounts_missing = pd.DataFrame({
    'countries_missing': la_continentCounts_missing,
    'proportion_of_continent': la_continentCounts_missing.div(total_by_continent['Country']),
})

In [141]:
# Turn the continent index into a regular column so it can serve as a merge key.
la_continentCounts = la_continentCounts.reset_index()

In [142]:
# Attach the per-continent summary columns to every country row.
la_geographicStatistics = la_continentCounts.merge(la_geographicStatistics, on='Continent Name')

In [143]:
# Move the continent labels out of the index into a regular column.
la_continentCounts_missing = la_continentCounts_missing.reset_index()

In [144]:
# Give the former index column the name used as the merge key elsewhere.
la_continentCounts_missing = la_continentCounts_missing.set_axis(
    ['Continent Name', 'countries_missing', 'proportion_of_continent'],
    axis=1,
)

In [145]:
# Per-continent count and proportion of countries with no matching nationality.
la_continentCounts_missing

Unnamed: 0,Continent Name,countries_missing,proportion_of_continent
0,North America,36,0.9
1,South America,8,0.533333


In [146]:
# Add only the 'countries_missing' column; 'proportion_of_continent' is left
# out of the right-hand side so it does not collide with the existing column.
la_geographicStatistics = la_geographicStatistics.merge(
    la_continentCounts_missing[['Continent Name', 'countries_missing']],
    on='Continent Name',
    how='inner',
)

In [147]:
# Final geographic statistics table: one row per matched nationality with its continent summary.
la_geographicStatistics

Unnamed: 0,Continent Name,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing
0,North America,4,0.1,0.00116,Mexican,0.001012,MX,NoAm,Mexico,36
1,North America,4,0.1,0.00116,Guatemalan,0.000129,GT,NoAm,Guatemala,36
2,North America,4,0.1,0.00116,Cuban,1.5e-05,CU,NoAm,Cuba,36
3,North America,4,0.1,0.00116,Nicaraguan,5e-06,NI,NoAm,Nicaragua,36
4,South America,7,0.466667,0.000536,Brazilian,0.000183,BR,SA,Brazil,8
5,South America,7,0.466667,0.000536,Argentinean,0.000134,AR,SA,Argentina,8
6,South America,7,0.466667,0.000536,Chilean,0.000104,CL,SA,Chile,8
7,South America,7,0.466667,0.000536,Venezuelan,7.9e-05,VE,SA,"Venezuela, Bolivarian Republic of",8
8,South America,7,0.466667,0.000536,Peruvian,1.5e-05,PE,SA,Peru,8
9,South America,7,0.466667,0.000536,Uruguayan,1e-05,UY,SA,Uruguay,8


In [148]:
# (rows, columns) immediately before the inner join with la_geographicStatistics.
latinamerican_art.shape

(463799, 40)

In [149]:
# Inner join: keep only artwork rows whose 'nationality' equals a 'demonym' in
# la_geographicStatistics, and attach that table's geographic columns.
latinamerican_art = latinamerican_art.merge(
    la_geographicStatistics,
    how='inner',
    left_on='nationality',
    right_on='demonym',
)

In [168]:
# Derive a larger-image URL from the IIIF thumbnail URL (iiifthumburl) by swapping
# the '!200,200' size segment for '!640,640'. The vectorised .str accessor passes
# missing URLs through as NaN; the previous per-element x.replace(...) raised
# AttributeError when it hit a NaN. regex=False keeps this a literal substitution.
latinamerican_art['expanded_url'] = latinamerican_art['iiifthumburl'].str.replace(
    '!200,200', '!640,640', regex=False
)

In [162]:
# (rows, columns) after filtering to Latin American nationalities.
latinamerican_art.shape

(684, 51)

### Output Latin American Art data present within the NGA database (Data Prep)

In [163]:
### Important: This file is used to add geographic data to other tables in DB

In [153]:
# Write the geographic statistics table to disk without the row index.
la_geographicStatistics.to_csv(
    '../../../data_samples/LaArt/la_geographicStatistics.csv',
    index=False,
)

In [169]:
# Write the filtered, geography-enriched artwork table to disk without the row index.
latinamerican_art.to_csv(
    '../../../data_samples/art_tables_test/latinamerican_art.csv',
    index=False,
)