In [95]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import random

### Step 1: Importing/Exporting of Data

The non_latinamerican_art dataset initially has 471542 rows in the database; removing the 'bad lines' dropped 7743 of those rows.

In [96]:
# Load the non-Latin-American artwork export.
# on_bad_lines='skip' drops malformed lines instead of raising; low_memory=False makes
# pandas infer each column's dtype from the whole file rather than chunk by chunk.
# Original note: a newer pandas version expects an explicit dtype on import; so far only
# one row is reported as affected.
non_latinamerican_art = pd.read_csv(
    '../../../data_samples/nonLaArt/non_latinamerican_art.csv',
    on_bad_lines='skip',
    low_memory=False,
)

In [97]:
non_latinamerican_art.shape

(200073, 40)

Saving this column to its own dataframe will allow the EDA notebook 'Geography and Demography' to be run and used later.

### Step 2 of the La Art Pipeline: Feature Engineering new geographical features

#### List of Continent Codes for Determining which non Latin American countries are present in the Gallery

In [98]:
# Collect the ISO 3166-1 alpha-2 and alpha-3 codes of every country pycountry knows,
# in pycountry's own iteration order.
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [99]:
# Wrap both code lists in pandas Series so they can be transformed element-wise later.
cname_alpha_2, cname_alpha_3 = map(pd.Series, (cname_alpha_2, cname_alpha_3))

In [100]:
# ISO alpha-2 codes already known not to map to a continent in pycountry_convert.
error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']


def _continent_code_or_placeholder(alpha2):
    """Return the continent code for an alpha-2 country code, or 'non-transformable'.

    Codes in ``error_list`` are never looked up. Any other code that
    pycountry_convert cannot map is labelled the same way, so a pycountry
    release that adds new territories does not abort the whole ``apply``.
    """
    if alpha2 in error_list:
        return 'non-transformable'
    try:
        return pc.country_alpha2_to_continent_code(alpha2)
    except KeyError:
        # NOTE(review): KeyError is what pycountry_convert is expected to raise for an
        # unmapped code - confirm against the installed version.
        return 'non-transformable'


continent_names = cname_alpha_2.apply(_continent_code_or_placeholder)

In [101]:
# One row per country: its alpha-2 code next to the continent code derived from it.
country_and_continent = pd.DataFrame({'Country': cname_alpha_2, 'Continent': continent_names})

This is an important aspect of the data as it will pair the countries with their continents and the nationalities of the artists in the dataset

In [102]:
country_and_continent.head()

Unnamed: 0,Country,Continent
0,AW,
1,AF,AS
2,AO,AF
3,AI,
4,AX,EU


In [103]:
# Number of countries that pycountry lists under each continent code.
total_by_continent = country_and_continent.groupby('Continent')['Country'].count()

In [104]:
total_by_continent

Continent
AF                   57
AN                    2
AS                   53
EU                   50
NA                   40
OC                   24
SA                   15
non-transformable     8
Name: Country, dtype: int64

### Distribution of Countries by Continent, World Wide

In [105]:
# Resolve each alpha-2 code to the country's name.
country_and_continent['Country Name'] = country_and_continent['Country'].map(pc.country_alpha2_to_country_name)

In [106]:
def _continent_display_name(code):
    """Translate a continent code to its name; the placeholder label passes through unchanged."""
    if code == 'non-transformable':
        return code
    return pc.convert_continent_code_to_continent_name(code)


country_and_continent['Continent Name'] = country_and_continent['Continent'].map(_continent_display_name)

In [107]:
country_and_continent.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,,Anguilla,North America
4,AX,EU,Åland Islands,Europe


In [108]:
# Work on an independent copy of the nationality column so the artwork frame is not touched.
constituent_nationalities = non_latinamerican_art['nationality'].copy()

In [109]:
# Share of artwork rows per nationality label. normalize=True returns proportions rather
# than raw counts; rows with a missing nationality are left out (value_counts default).
nonla_artist_origin = constituent_nationalities.value_counts(normalize=True)

In [110]:
constituent_nationalities.shape

(200073,)

In [111]:
nonla_artist_origin.head()

nationality
American    0.549140
French      0.153443
German      0.072701
Italian     0.060949
British     0.046483
Name: proportion, dtype: float64

#### Adding information to select for non_latinamerican_art (todo- write script to create the change in the DB)

In [112]:
# ISO alpha-2 codes of the North American countries this notebook treats as Latin American.
Latin_in_NA = ['BZ', 'CR', 'CU', 'DO', 'SV', 'GT', 'HT', 'HN', 'JM', 'MX', 'NI', 'PA', 'LC']
latin_in_NA_map = country_and_continent['Country'].isin(Latin_in_NA)
# Every country filed under the South America continent code counts as Latin American here.
latin_in_SA_map = country_and_continent['Continent'].eq('SA')
# Boolean indexing keeps exactly the flagged rows. The former where(...).dropna() would
# also have discarded a flagged row if any of its columns happened to be missing.
latins = country_and_continent[latin_in_SA_map | latin_in_NA_map].reset_index(drop=True)
# Plain Python attribute used as a label for the frame.
latins.name = 'Latin Countries'
latins['Country Name'] = latins['Country'].apply(pc.country_alpha2_to_country_name)

In [113]:
# External demonyms lookup table. Further down it is joined to country_and_continent on
# 'Country Name' so that each country gains a 'demonym' that can be matched against the
# nationality labels of the artists.
demonyms = pd.read_csv('../../../data_samples/results/processed_subset_results/demonyms.csv')

In [114]:
# Demonym for each Latin American ISO alpha-2 code. Keying by country code, rather than
# assigning a positional list, keeps every demonym attached to the right country even if
# the row order or membership of `latins` changes.
latin_demonyms_by_code = {
    'AR': 'Argentinean',
    'BZ': 'Belizean',
    'BO': 'Bolivian',
    'BR': 'Brazilian',
    'CL': 'Chilean',
    'CO': 'Colombian',
    'CR': 'Costa Rican',
    'CU': 'Cuban',
    'DO': 'Dominican',
    'EC': 'Ecuadorian',
    'FK': 'Falkland Islander',
    'GT': 'Guatemalan',
    'GF': 'Guianese',
    'GY': 'Guyanese',
    'HN': 'Honduran',
    'HT': 'Haitian',
    'JM': 'Jamaican',
    'LC': 'Saint Lucian',
    'MX': 'Mexican',
    'NI': 'Nicaraguan',
    'PA': 'Panamanian',
    'PE': 'Peruvian',
    'PY': 'Paraguayan',
    'GS': 'South Georgian',
    'SV': 'Salvadoran',
    'SR': 'Surinamese',
    'UY': 'Uruguayan',
    'VE': 'Venezuelan',
}
latins['demonym'] = latins['Country'].map(latin_demonyms_by_code)
# Fail loudly when a country has no entry instead of silently carrying NaN demonyms forward.
codes_without_demonym = latins.loc[latins['demonym'].isna(), 'Country'].tolist()
if codes_without_demonym:
    raise ValueError(f'No demonym defined for country codes: {codes_without_demonym}')

In [115]:
latins

Unnamed: 0,Country,Continent,Country Name,Continent Name,demonym
0,AR,SA,Argentina,South America,Argentinean
1,BZ,,Belize,North America,Belizean
2,BO,SA,"Bolivia, Plurinational State of",South America,Bolivian
3,BR,SA,Brazil,South America,Brazilian
4,CL,SA,Chile,South America,Chilean
5,CO,SA,Colombia,South America,Colombian
6,CR,,Costa Rica,North America,Costa Rican
7,CU,,Cuba,North America,Cuban
8,DO,,Dominican Republic,North America,Dominican
9,EC,SA,Ecuador,South America,Ecuadorian


In [116]:
# Turn the nationality index into a regular column (reset_index keeps it by default).
nonla_artist_origin = nonla_artist_origin.reset_index()

In [117]:
# Name the two columns: the nationality label becomes 'demonym' (the key used to join
# against the country tables below) and its share of rows becomes 'pct_country_NGA'.
nonla_artist_origin.columns = ['demonym', 'pct_country_NGA']

In [118]:
nonla_artist_origin

Unnamed: 0,demonym,pct_country_NGA
0,American,0.549140
1,French,0.153443
2,German,0.072701
3,Italian,0.060949
4,British,0.046483
...,...,...
130,10402,0.000010
131,8200,0.000010
132,6b7aa14f-e63c-4387-80ba-aea129df9ac9,0.000010
133,3209,0.000010


#### This removes the Latin American subset from non_latines, and therefore from nonla_geographicStatistics

In [119]:
# Attach demonyms to every country; countries without an entry in the demonyms table are
# dropped by the inner join.
non_latines = country_and_continent.merge(demonyms, how='inner', on='Country Name')

In [120]:
# Cast all five descriptive columns to pandas' nullable 'string' dtype.
non_latines = non_latines.astype(
    dict.fromkeys(['Country', 'Continent', 'Country Name', 'Continent Name', 'demonym'], 'string')
)

In [121]:
# Cast all five descriptive columns to pandas' nullable 'string' dtype.
latins = latins.astype(
    dict.fromkeys(['Country', 'Continent', 'Country Name', 'Continent Name', 'demonym'], 'string')
)

In [122]:
# Keep only countries that are not in the Latin American table. A single vectorised isin
# replaces the per-row scan of `latins` and the where/dropna masking round trip; the
# original row labels are preserved.
non_latines = non_latines.loc[~non_latines['Country Name'].isin(latins['Country Name'])]

#### TODO: Some of these countries may be part of Latin America. South America was completely removed after the separation, which is expected.

In [123]:
# TODO: consider removing Jamaica/Belize from the Latin American group

In [124]:
# Count rows per country within each continent of the Latin American table, then show only
# the North American slice, to review which of these countries belong in the group.
latins.groupby('Continent Name')['Country Name'].apply(lambda x: x.value_counts())['North America']

Belize                1
Costa Rica            1
Cuba                  1
Dominican Republic    1
Guatemala             1
Honduras              1
Haiti                 1
Jamaica               1
Saint Lucia           1
Mexico                1
Nicaragua             1
Panama                1
El Salvador           1
Name: Country Name, dtype: Int64

In [125]:
# TODO: include Puerto Rico in the Latin American group

In [126]:
# Same view for the non-Latin table: rows per North American country. A count above 1
# means the join with the demonyms table produced several rows for that country.
non_latines.groupby('Continent Name')['Country Name'].apply(lambda x: x.value_counts())['North America']

Barbados            4
Canada              4
United States       4
Bermuda             2
Guadeloupe          2
Martinique          2
Puerto Rico         2
Aruba               1
Anguilla            1
Saint Barthélemy    1
Curaçao             1
Cayman Islands      1
Dominica            1
Grenada             1
Greenland           1
Montserrat          1
Name: Country Name, dtype: Int64

In [127]:
latins

Unnamed: 0,Country,Continent,Country Name,Continent Name,demonym
0,AR,SA,Argentina,South America,Argentinean
1,BZ,,Belize,North America,Belizean
2,BO,SA,"Bolivia, Plurinational State of",South America,Bolivian
3,BR,SA,Brazil,South America,Brazilian
4,CL,SA,Chile,South America,Chilean
5,CO,SA,Colombia,South America,Colombian
6,CR,,Costa Rica,North America,Costa Rican
7,CU,,Cuba,North America,Cuban
8,DO,,Dominican Republic,North America,Dominican
9,EC,SA,Ecuador,South America,Ecuadorian


In [128]:
# Drop nationality labels that match a Latin American demonym. A single vectorised isin
# replaces the per-row scan of `latins` and the where/dropna masking round trip.
nonla_artist_origin = nonla_artist_origin.loc[~nonla_artist_origin['demonym'].isin(latins['demonym'])]

In [129]:
# Join nationality shares to country/continent details on the shared 'demonym' key
# (default inner join: labels without a matching demonym are dropped).
nonla_geographicStatistics = nonla_artist_origin.merge(non_latines, on='demonym')

In [130]:
nonla_geographicStatistics.shape

(38, 6)

In [131]:
# Order the statistics alphabetically by country name.
nonla_geographicStatistics = nonla_geographicStatistics.sort_values(by='Country Name')

In [132]:
# Renumber rows 0..n-1 after sorting, discarding the old labels.
nonla_geographicStatistics = nonla_geographicStatistics.reset_index(drop=True)

In [133]:
nonla_geographicStatistics.head(15)

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,Albanian,3e-05,AL,EU,Albania,Europe
1,Armenian,1e-05,AM,AS,Armenia,Asia
2,Australian,0.000292,AU,OC,Australia,Oceania
3,Austrian,0.003508,AT,EU,Austria,Europe
4,Belgian,0.001257,BE,EU,Belgium,Europe
5,Bulgarian,7e-05,BG,EU,Bulgaria,Europe
6,Canadian,0.001146,CA,,Canada,North America
7,Chinese,0.003669,CN,AS,China,Asia
8,Croatian,0.000251,HR,EU,Croatia,Europe
9,Danish,0.000774,DK,EU,Denmark,Europe


### Proportion of Countries per Continent
Real vs Non-LA Dataset Distributions

In [134]:
# Replace continent codes in the index with continent names; labels listed in
# remove_from_index are not real codes and are kept verbatim.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    code if code in remove_from_index else pc.convert_continent_code_to_continent_name(code)
    for code in total_by_continent.index
]

In [135]:
# Each continent's share of all countries known to pycountry. The 'non-transformable'
# bucket is still part of the total at this point.
actual_proportion_of_countries = total_by_continent/ total_by_continent.sum()

In [136]:
# Combine the raw country counts and their shares into one frame indexed by continent name.
total_by_continent = pd.DataFrame({'Countries':total_by_continent,'proportion_of_countries':actual_proportion_of_countries})

In [137]:
# Label the frame. NOTE(review): this is a plain Python attribute, not pandas metadata;
# it may not survive operations that return a new frame - confirm nothing relies on it.
total_by_continent.name = 'Actual Distribution'

In [138]:
# Rename the North America continent code from 'NA' to 'NoA'.
# NOTE(review): presumably because pandas' CSV reader treats the literal string 'NA' as a
# missing value by default once this frame is written out and read back - confirm.
nonla_geographicStatistics['Continent'] = nonla_geographicStatistics['Continent'].replace('NA', 'NoA')

In [139]:
# Number of rows (country/demonym entries) per continent. size() is the built-in
# equivalent of apply(len) and avoids running a Python callable on every group frame.
nonla_continentCounts = nonla_geographicStatistics.groupby('Continent Name').size()

After using the demonym dataset to link the geographic naming data with the pct_country_NGA data, some data points were lost (the sum of pct is now ~96%) and the number of countries was reduced to ~52 (one duplicate).

In [140]:
# Name the Series so its display shows what the counts represent.
nonla_continentCounts.name = 'Countrys_in_Continents'

In [141]:
nonla_continentCounts

Continent Name
Africa            4
Asia              5
Europe           25
North America     2
Oceania           2
Name: Countrys_in_Continents, dtype: int64

#### Adding pct_continent_in_NGA to nonla_ContinentCounts using the pct_in_NGA column and Continent Name from nonla_geographicStatistics

In [142]:
# Sum the per-nationality shares within each continent to get the continent's share of rows.
pct_continent_NGA = nonla_geographicStatistics.groupby('Continent Name')['pct_country_NGA'].sum()

In [143]:
pct_continent_NGA

Continent Name
Africa           0.000362
Asia             0.006816
Europe           0.383065
North America    0.550286
Oceania          0.000312
Name: pct_country_NGA, dtype: float64

In [144]:
nonla_continentCounts

Continent Name
Africa            4
Asia              5
Europe           25
North America     2
Oceania           2
Name: Countrys_in_Continents, dtype: int64

The goal is to make nonla_continentCounts a dataframe similar to total_by_continent and use this to measure distribution by geography

In [145]:
total_by_continent

Unnamed: 0,Countries,proportion_of_countries
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


#### No artists in the NGA dataset were credited as having Antarctic/South American (after LatinAmerican data was removed) nationality & the non-transformable index was leftover from try-catching the PyCountry transformation of ISO codes, so they will be removed.

In [146]:
# Remove, in place, the index entries that have no counterpart in the non-Latin-American data.
total_by_continent.drop(index=['Antarctica', 'non-transformable', 'South America'], inplace=True)

In [147]:
# Per continent: how many countries appear in the dataset and what fraction that is of
# the continent's countries in pycountry. The division aligns on the continent-name index.
nonla_continentCounts = pd.DataFrame({'countries_present':nonla_continentCounts, 'proportion_of_continent': (nonla_continentCounts / total_by_continent['Countries'])})

#### This table shows world-wide share of countries and their proportion out of the total number of countries.

In [148]:
total_by_continent

Unnamed: 0,Countries,proportion_of_countries
Africa,57,0.228916
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386


#### This table shows countries and continents present in the NGA dataset and the proportion of countries present from the continent.

In [149]:
# Attach each continent's share of rows; the assignment aligns on the continent-name index.
nonla_continentCounts['pct_continent_NGA'] = pct_continent_NGA

In [150]:
nonla_continentCounts

Unnamed: 0_level_0,countries_present,proportion_of_continent,pct_continent_NGA
Continent Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,4,0.070175,0.000362
Asia,5,0.09434,0.006816
Europe,25,0.5,0.383065
North America,2,0.05,0.550286
Oceania,2,0.083333,0.000312


#### To find out how many countries per continent are not represented in the NGA dataset, I subtract the number of countries present in the dataset from the total number of countries that the PyCountry library lists for that continent. The difference will be called nonla_continentCounts_missing.

In [151]:
# Countries per continent that pycountry lists but that have no row in the dataset
# (index-aligned subtraction by continent name).
nonla_continentCounts_missing = total_by_continent['Countries'] - nonla_continentCounts['countries_present']

In [152]:
# Name the Series so its display shows what the values represent.
nonla_continentCounts_missing.name = 'Missing_from_Actual_Distribution'

In [153]:
# Per continent: the number of missing countries and that number as a fraction of the
# continent's countries in pycountry.
nonla_continentCounts_missing = pd.DataFrame({'countries_missing':nonla_continentCounts_missing, 'proportion_of_continent': (nonla_continentCounts_missing / total_by_continent['Countries'])})

In [154]:
# Move the continent names out of the index into a regular column.
nonla_continentCounts_missing = nonla_continentCounts_missing.reset_index()

In [155]:
# Give the former index column the 'Continent Name' label that the merge below joins on.
nonla_continentCounts_missing.columns = ['Continent Name', 'countries_missing', 'proportion_of_continent']

#### The number of columns before adding information about the specific geographical details of the artists' nationalities as well as additional statistical information about the countries and continents representation within the dataset

In [156]:
# Add each continent's missing-country count to the per-country statistics.
nonla_geographicStatistics = nonla_geographicStatistics.merge(
    nonla_continentCounts_missing[['Continent Name', 'countries_missing']],
    how='inner',
    on='Continent Name',
)

In [159]:
nonla_continentCounts

Unnamed: 0_level_0,countries_present,proportion_of_continent,pct_continent_NGA
Continent Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,4,0.070175,0.000362
Asia,5,0.09434,0.006816
Europe,25,0.5,0.383065
North America,2,0.05,0.550286
Oceania,2,0.083333,0.000312


In [161]:
# Expose 'Continent Name' as a regular column so it can serve as a merge key.
nonla_continentCounts = nonla_continentCounts.reset_index()

In [64]:
# Prefix the continent-level columns onto every country row of the statistics table.
nonla_geographicStatistics = nonla_continentCounts.merge(nonla_geographicStatistics, on='Continent Name')

In [65]:
non_latinamerican_art.shape

(187183, 51)

In [165]:
non_latinamerican_art.columns

Index(['iiifurl', 'iiifthumburl', 'accessioned', 'title',
       'displayDate_created', 'roletype', 'role', 'forwarddisplayname',
       'birthyear', 'deathyear', 'ulanid', 'artistofngaobject', 'nationality',
       'constituenttype', 'beginyear_artistAssigned', 'endyear_artistAssigned',
       'country_artistAssigned', 'zipcode_artistAssigned', 'medium',
       'dimensions', 'inscription', 'markings', 'attribution',
       'visualBrowserClassification', 'parentID', 'isVirtual', 'portfolio',
       'series', 'volume', 'watermarks', 'uuid', 'viewtype', 'sequence',
       'width', 'height', 'maxpixels', 'assistivetext', 'depictstmsobjectid',
       'objectid', 'constituentid'],
      dtype='object')

In [166]:
# Inner join of the artworks with the geographic statistics: an artwork is kept only when
# its 'nationality' equals a 'demonym' in nonla_geographicStatistics.
# NOTE(review): if a demonym occurs more than once in nonla_geographicStatistics, the
# matching artworks are duplicated by this join - verify that 'demonym' is unique there.
non_latinamerican_art = pd.merge(non_latinamerican_art, nonla_geographicStatistics, how='inner', left_on='nationality', right_on='demonym')

In [167]:
non_latinamerican_art.columns

Index(['iiifurl', 'iiifthumburl', 'accessioned', 'title',
       'displayDate_created', 'roletype', 'role', 'forwarddisplayname',
       'birthyear', 'deathyear', 'ulanid', 'artistofngaobject', 'nationality',
       'constituenttype', 'beginyear_artistAssigned', 'endyear_artistAssigned',
       'country_artistAssigned', 'zipcode_artistAssigned', 'medium',
       'dimensions', 'inscription', 'markings', 'attribution',
       'visualBrowserClassification', 'parentID', 'isVirtual', 'portfolio',
       'series', 'volume', 'watermarks', 'uuid', 'viewtype', 'sequence',
       'width', 'height', 'maxpixels', 'assistivetext', 'depictstmsobjectid',
       'objectid', 'constituentid', 'demonym', 'pct_country_NGA', 'Country',
       'Continent', 'Country Name', 'Continent Name', 'countries_missing'],
      dtype='object')

#### Feature Engineering IIIFUrl Links to view the data at the desired resolution

In [68]:
#converts the iiifurl to return the full image size
non_latinamerican_art['expanded_url'] = non_latinamerican_art.iiifthumburl.apply(lambda x: x.replace('!200,200', '!640,640'))

In [81]:
non_latinamerican_art.shape

(187183, 51)

In [82]:
non_latinamerican_art.sample(5)

Unnamed: 0,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,deathyear,...,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing,expanded_url
14566,https://api.nga.gov/iiif/78475106-6aab-4ce5-bd...,https://api.nga.gov/iiif/78475106-6aab-4ce5-bd...,1,Roche Taillet sur la Saosne proche Lyon,"French, 1621 - 1691",artist,artist,Israël Silvestre,1621.0,1691.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/78475106-6aab-4ce5-bd...
40267,https://api.nga.gov/iiif/16fd17a5-946f-4d3e-8d...,https://api.nga.gov/iiif/16fd17a5-946f-4d3e-8d...,1,California Western Railroad,"American, born 1954",artist,artist,Mark Ruwedel,1954.0,,...,2,0.05,0.550286,American,0.54914,US,NoA,United States,38,https://api.nga.gov/iiif/16fd17a5-946f-4d3e-8d...
109501,https://api.nga.gov/iiif/b878c2b2-7af3-4700-bb...,https://api.nga.gov/iiif/b878c2b2-7af3-4700-bb...,1,"Rear View of Reclining Female Nude, Leaning on...","American, born Russia (now Latvia), 1903 - 1970",artist,artist,Mark Rothko,1903.0,1970.0,...,2,0.05,0.550286,American,0.54914,US,NoA,United States,38,https://api.nga.gov/iiif/b878c2b2-7af3-4700-bb...
1629,https://api.nga.gov/iiif/0d7a4f6f-5499-4f4c-9f...,https://api.nga.gov/iiif/0d7a4f6f-5499-4f4c-9f...,1,La Blanchisseuse,"French, 1688 - 1754",artist,artist,Charles-Nicolas Cochin I,1688.0,1754.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/0d7a4f6f-5499-4f4c-9f...
97199,https://api.nga.gov/iiif/9baebc22-ffed-4aff-a6...,https://api.nga.gov/iiif/9baebc22-ffed-4aff-a6...,1,Garden Ornament (Greyhound),"American, active c. 1935",artist,artist,George Constantine,1855.0,1995.0,...,2,0.05,0.550286,American,0.54914,US,NoA,United States,38,https://api.nga.gov/iiif/9baebc22-ffed-4aff-a6...


After merging the main non_latinamerican_art dataset with the external geographical information that I created, I will write the updated dataset back to the original filepath and overwrite it as it contains valuable information

In [83]:
non_latinamerican_art.head()

Unnamed: 0,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,deathyear,...,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing,expanded_url
0,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,1944.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/00004dec-8300-4487-8d...
1,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,1944.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/00004dec-8300-4487-8d...
2,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890.0,1968.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...
3,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890.0,1968.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...
4,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,1,Fourth Book: Daphnis Plays to His Goats (Daphn...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,1944.0,...,25,0.5,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...


### Step 3 - Sampling from the large dataset for batch training

In [84]:
# Draw 700 distinct rows for one sampling iteration (500 for training plus 200 for
# validation/testing); replace=False guarantees that no row is picked twice.
# Each row is weighted by its continent's share (pct_continent_NGA), normalised to sum to 1.
# NOTE(review): rows from large continents are already more numerous, so this weighting
# over-samples them relative to the full dataset; uniform sampling would mirror the
# dataset's own distribution - confirm which is intended.
sampling_weights = (non_latinamerican_art['pct_continent_NGA'] / non_latinamerican_art['pct_continent_NGA'].sum()).to_numpy()
# A fixed seed makes the subsample reproducible across Restart & Run All.
sampling_rng = np.random.default_rng(42)
matching_distribution = sampling_rng.choice(non_latinamerican_art.index.to_numpy(), size=700, replace=False, p=sampling_weights)

In [85]:
# Plain Python list of the sampled index labels.
index_matching = matching_distribution.tolist()

In [86]:
# index_matching holds index *labels* drawn from non_latinamerican_art.index, so select by
# label with .loc. Positional .iloc only returns the same rows while the index is the
# default 0..n-1 range.
subsample_nonla = non_latinamerican_art.loc[index_matching, :]

In [87]:
# Number of sampled rows per country. size() is the built-in equivalent of apply(len)
# and avoids running a Python callable on every group frame.
subsample_geography = subsample_nonla.groupby('Country Name').size()

In [88]:
subsample_geography

Country Name
Austria             25
Belgium              7
Bulgaria             1
Canada               9
Croatia              2
Denmark              3
Finland              1
France            1049
Germany            511
Greece               1
Hungary              3
Ireland              4
Israel               1
Italy              451
Netherlands        200
Norway              10
Poland               2
Romania              1
Spain               53
Sweden               3
Switzerland         49
United Kingdom     360
United States     5456
dtype: int64

In [89]:
# Per country: the number of sampled rows and that number as a share of the whole sample.
subsample_geography = pd.DataFrame({'counts': subsample_geography,'proportion':subsample_geography / subsample_geography.sum()})

### Sampling 10% with the normalized probabilities in pct_continent_NGA made the sample a little less representative of the overall dataset. Since this is only a 10% sample, it is not fully representative, but I expect to train with more samples/batches.

In [90]:
subsample_geography.sort_values(by='proportion', ascending=False)

Unnamed: 0_level_0,counts,proportion
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,5456,0.665204
France,1049,0.127896
Germany,511,0.062302
Italy,451,0.054987
United Kingdom,360,0.043892
Netherlands,200,0.024384
Spain,53,0.006462
Switzerland,49,0.005974
Austria,25,0.003048
Norway,10,0.001219


In [91]:
nonla_geographicStatistics.sort_values(by='pct_country_NGA', ascending=False).head(10)

Unnamed: 0,Continent Name,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing
35,North America,2,0.05,0.584888,American,0.583669,US,NoA,United States,38
16,Europe,25,0.5,0.407152,French,0.163092,FR,EU,France,25
17,Europe,25,0.5,0.407152,German,0.077272,DE,EU,Germany,25
21,Europe,25,0.5,0.407152,Italian,0.064782,IT,EU,Italy,25
33,Europe,25,0.5,0.407152,British,0.049406,GB,EU,United Kingdom,25
24,Europe,25,0.5,0.407152,Dutch,0.02699,NL,EU,Netherlands,25
30,Europe,25,0.5,0.407152,Spanish,0.007511,ES,EU,Spain,25
32,Europe,25,0.5,0.407152,Swiss,0.007169,CH,EU,Switzerland,25
5,Asia,5,0.09434,0.007244,Chinese,0.0039,CN,AS,China,48
10,Europe,25,0.5,0.407152,Austrian,0.003729,AT,EU,Austria,25


In [92]:
non_latinamerican_art.shape

(187183, 51)

### Step 4 - Outputting the final combined NGA/FE data

In [93]:
# Renumber the sampled rows 0..n-1, discarding the labels they had in the full frame.
subsample_nonla = subsample_nonla.reset_index(drop=True)

In [94]:
subsample_nonla

Unnamed: 0,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,deathyear,...,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing,expanded_url
0,https://api.nga.gov/iiif/b23ef555-1102-4abc-9b...,https://api.nga.gov/iiif/b23ef555-1102-4abc-9b...,1,Haystack #4,"American, 1923 - 1997",artist,artist,Roy Lichtenstein,1923.0,1997.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/b23ef555-1102-4abc-9b...
1,https://api.nga.gov/iiif/ef36c5b0-2fe3-4b10-91...,https://api.nga.gov/iiif/ef36c5b0-2fe3-4b10-91...,1,Horse Weather Vane,"American, 1915 - 2001",artist,artist,Dolores Haupt,1915.0,2001.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/ef36c5b0-2fe3-4b10-91...
2,https://api.nga.gov/iiif/641e71ef-5b9e-4373-a5...,https://api.nga.gov/iiif/641e71ef-5b9e-4373-a5...,1,American Tivoli,"American, 1857 - 1926",artist,artist,Joseph Pennell,1857.0,1926.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/641e71ef-5b9e-4373-a5...
3,https://api.nga.gov/iiif/b466775f-8037-402f-b6...,https://api.nga.gov/iiif/b466775f-8037-402f-b6...,1,Franklin's Kite,"American, 1899 - 1964",artist,artist,Alfred Bendiner,1899.0,1964.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/b466775f-8037-402f-b6...
4,https://api.nga.gov/iiif/123b5d53-7093-473d-8e...,https://api.nga.gov/iiif/123b5d53-7093-473d-8e...,1,Toaster,"American, active c. 1935",artist,artist,Roy Weber,1855.0,1995.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/123b5d53-7093-473d-8e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8197,https://api.nga.gov/iiif/bd6f880a-0c5a-4c74-82...,https://api.nga.gov/iiif/bd6f880a-0c5a-4c74-82...,1,Grazing Cow,"Dutch, 1625 - 1654",artist,artist,Paulus Potter,1625.0,1654.0,...,25,0.50,0.383065,Dutch,0.025393,NL,EU,Netherlands,25,https://api.nga.gov/iiif/bd6f880a-0c5a-4c74-82...
8198,https://api.nga.gov/iiif/56d88ffe-e713-4807-ad...,https://api.nga.gov/iiif/56d88ffe-e713-4807-ad...,1,San Francisco,"American, born 1939",artist,artist,Ralph Gibson,1939.0,,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/56d88ffe-e713-4807-ad...
8199,https://api.nga.gov/iiif/c8769981-fe96-4749-a2...,https://api.nga.gov/iiif/c8769981-fe96-4749-a2...,1,"NY from Dyckman St., Ferry Hill, No. 2","American, 1870 - 1953",artist,artist,John Marin,1870.0,1953.0,...,2,0.05,0.550286,American,0.549140,US,NoA,United States,38,https://api.nga.gov/iiif/c8769981-fe96-4749-a2...
8200,https://api.nga.gov/iiif/16c3c8ff-133b-48ee-a6...,https://api.nga.gov/iiif/16c3c8ff-133b-48ee-a6...,1,The Siege of Breda [plate 2 of 6],"French, 1592 - 1635",artist,artist,Jacques Callot,1592.0,1635.0,...,25,0.50,0.383065,French,0.153443,FR,EU,France,25,https://api.nga.gov/iiif/16c3c8ff-133b-48ee-a6...


In [None]:
# Persist the per-country/continent statistics table; index=False omits the row labels.
nonla_geographicStatistics.to_csv('../../../data_samples/nonLaArt/nonla_geographicStatistics.csv', index=False)

In [None]:
# Persist the sampled artworks, including the engineered geography columns, for batch
# training; index=False omits the row labels.
subsample_nonla.to_csv('../../../data_samples/results/processed_subset_results/non_latinamericanart_sample.csv', index=False)