In [3]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import random

### Step 1: Importing/Exporting of Data

In [4]:
# Load the non-Latin-American artwork table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what pandas recommends when it warns about
# columns with mixed types on import. Malformed rows are still skipped.
non_latinamerican_art = pd.read_csv('../../../data_samples/art_tables/non_latin_art.csv', on_bad_lines='skip', low_memory=False)

  non_latinamerican_art = pd.read_csv('../../../data_samples/art_tables/non_latin_art.csv', on_bad_lines='skip')


 Saving this column to its own CSV file allows the EDA notebook 'Geography and Demography' to be run and used later

In [5]:
# Write only the 'nationality' column to the constituents-nationalities CSV (row index omitted).
nationality_column = non_latinamerican_art['nationality']
nationality_column.to_csv('../../../data_samples/nonLaArt/constituents_nationalities.csv', index=False)

### Step 2 of the La Art Pipeline: Feature Engineering New Geographical Features

#### List of Continent Codes for Determining which non Latin American countries are present in the Gallery

In [6]:
# Gather the two-letter and three-letter code of every country pycountry knows about.
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [7]:
# Turn both code lists into pandas Series so they can be transformed element-wise.
cname_alpha_2, cname_alpha_3 = (pd.Series(codes) for codes in (cname_alpha_2, cname_alpha_3))

In [8]:
# Country codes that are never passed to the continent lookup
# (presumably because pycountry_convert has no continent for them — confirm).
error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']


def _continent_code_or_placeholder(alpha_2):
    """Return the continent code for a two-letter country code.

    Codes listed in ``error_list`` are mapped straight to 'non-transformable'.
    Any other code that the converter rejects with a KeyError gets the same
    placeholder, so a country added by a newer pycountry release does not
    abort the whole cell.
    """
    if alpha_2 in error_list:
        return 'non-transformable'
    try:
        return pc.country_alpha2_to_continent_code(alpha_2)
    except KeyError:
        return 'non-transformable'


continent_names = cname_alpha_2.apply(_continent_code_or_placeholder)

In [9]:
# One row per country: its two-letter code and its continent code (or the placeholder).
country_and_continent = pd.DataFrame({'Country': cname_alpha_2, 'Continent': continent_names})

In [10]:
# Number of non-null entries per continent code, for every other column of
# country_and_continent (the rendered result further down has a 'Country' count column).
total_by_continent = country_and_continent.groupby('Continent').count()

### Distribution of Countries by Continent, World Wide

In [11]:
# Resolve each two-letter country code to its country name.
country_and_continent['Country Name'] = [
    pc.country_alpha2_to_country_name(code) for code in country_and_continent['Country']
]

In [12]:
# Resolve each continent code to its continent name; the placeholder value is kept as it is.
country_and_continent['Continent Name'] = [
    code if code == 'non-transformable' else pc.convert_continent_code_to_continent_name(code)
    for code in country_and_continent['Continent']
]

In [15]:
# Independent copy of the 'nationality' column, so later steps work on their own object
# rather than on the column stored in non_latinamerican_art.
constituent_nationalities = non_latinamerican_art.nationality.copy()

In [16]:
# Relative frequency of each nationality value (normalize=True yields fractions, not counts).
nonla_artist_origin = constituent_nationalities.value_counts(normalize=True)

In [17]:
# Move the nationality labels out of the index into a regular column.
nonla_artist_origin = nonla_artist_origin.reset_index()

In [18]:
# Positional rename: first column (the nationality label) -> 'demonym',
# second column (its relative frequency) -> 'pct_country_NGA'.
nonla_artist_origin.columns = ['demonym', 'pct_country_NGA']

In [19]:
# Load an external demonyms table. It is joined on 'Country Name' and matched on
# 'demonym' in the following cells, i.e. it acts as the key linking a country to its demonym.
demonyms = pd.read_csv('../../../data_samples/results/processed_subset_results/demonyms.csv')

In [20]:
# Keep only the countries whose 'Country Name' appears in both tables (inner join).
non_latines = country_and_continent.merge(demonyms, how='inner', on='Country Name')

In [22]:
# Attach geography to each nationality share; rows without a matching 'demonym' are dropped (inner join).
nonla_geographicStatistics = nonla_artist_origin.merge(non_latines, how='inner', on='demonym')

In [23]:
nonla_geographicStatistics.head()

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.378527,US,,United States,North America
1,French,0.347459,FR,EU,France,Europe
2,German,0.119132,DE,EU,Germany,Europe
3,Italian,0.03438,IT,EU,Italy,Europe
4,British,0.028555,GB,EU,United Kingdom,Europe


### Proportion of Countries per Continent
Real vs Non-LA Dataset Distributions

In [25]:
# Replace the continent codes in the index with continent names; labels listed in
# remove_from_index are not continent codes and are kept unchanged.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    label if label in remove_from_index else pc.convert_continent_code_to_continent_name(label)
    for label in total_by_continent.index
]

In [27]:
# Share of all counted countries that falls in each continent.
country_total = total_by_continent['Country'].sum()
total_by_continent['proportion'] = total_by_continent['Country'].div(country_total)

In [156]:
# NOTE(review): a DataFrame has no built-in `name` field, so this only sets a plain
# Python attribute on this object; frames derived from it are not expected to carry
# the label over — confirm whether anything reads it.
total_by_continent.name = 'Actual Distribution'

In [157]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


In [70]:
# Relabel the continent code 'NA' as 'NoA'.
# NOTE(review): presumably so the code cannot be mistaken for a missing-value marker
# ('NA' is read as NaN by pd.read_csv by default) after a CSV round trip — confirm.
nonla_geographicStatistics['Continent'] = nonla_geographicStatistics['Continent'].replace('NA', 'NoA')

In [220]:
# Number of matched countries (rows) per continent code.
# GroupBy.size() is the built-in row count per group; it replaces apply(len), which
# newer pandas versions warn about because apply operates on the grouping column.
nonla_continentCounts = nonla_geographicStatistics.groupby('Continent').size()

 After using the demonym dataset to link the geographic naming data
 with the pct_country_NGA data, some datapoints were lost (the sum of pct is now ~92%).

In [221]:
nonla_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.378527,US,NoA,United States,North America
1,French,0.347459,FR,EU,France,Europe
2,German,0.119132,DE,EU,Germany,Europe
3,Italian,0.03438,IT,EU,Italy,Europe
4,British,0.028555,GB,EU,United Kingdom,Europe
5,Spanish,0.009595,ES,EU,Spain,Europe
6,Dutch,0.006967,NL,EU,Netherlands,Europe
7,Greek,0.000742,GR,EU,Greece,Europe
8,Swiss,0.000457,CH,EU,Switzerland,Europe
9,Austrian,0.000343,AT,EU,Austria,Europe


Of the matched countries, 9 belong to Europe (EU) and 1 to North America (NoA, the US); together they represent about 92% of all the artwork in the NGA

In [222]:
nonla_continentCounts

Continent
EU     9
NoA    1
dtype: int64

In [223]:
# Translate continent codes into continent names by lookup instead of assigning a
# fixed two-element list by position: a positional list mislabels the rows (or fails
# on a length mismatch) as soon as a different set of continents is present.
# Labels without an entry (e.g. names that are already translated) are kept unchanged.
continent_code_to_name = {
    'AF': 'Africa',
    'AN': 'Antarctica',
    'AS': 'Asia',
    'EU': 'Europe',
    'NA': 'North America',
    'NoA': 'North America',  # relabelled form of 'NA' used earlier in this notebook
    'OC': 'Oceania',
    'SA': 'South America',
}
nonla_continentCounts.index = [continent_code_to_name.get(code, code) for code in nonla_continentCounts.index]

In [224]:
# Append a zero count for every continent label of total_by_continent that has no
# matched country yet. Deriving the absent labels (instead of listing them by hand)
# avoids duplicate index entries when one of those continents is already present.
absent_continents = [label for label in total_by_continent.index if label not in nonla_continentCounts.index]
nonla_continentCounts = pd.concat([nonla_continentCounts, pd.Series(0, index=absent_continents, dtype='int64')])

In [225]:
nonla_continentCounts

Europe               9
North America        1
Africa               0
Antarctica           0
Asia                 0
Oceania              0
South America        0
non-transformable    0
dtype: int64

In [226]:
# Put the counts in the same row order as total_by_continent and label the series.
nonla_continentCounts = (
    nonla_continentCounts
    .reindex(total_by_continent.index)
    .rename('Countrys_in_Continents')
)

In [227]:
nonla_continentCounts

Africa               0
Antarctica           0
Asia                 0
Europe               9
North America        1
Oceania              0
South America        0
non-transformable    0
Name: Countrys_in_Continents, dtype: int64

The goal is to create a dataframe similar to this dataset and use this to measure distribution by geography

In [228]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


In [229]:
nonla_continentCounts

Africa               0
Antarctica           0
Asia                 0
Europe               9
North America        1
Oceania              0
South America        0
non-transformable    0
Name: Countrys_in_Continents, dtype: int64

In [230]:
# Per continent: how many countries are present, and what fraction of that
# continent's countries (per total_by_continent) this is.
present_counts = nonla_continentCounts
share_of_continent = present_counts.div(total_by_continent['Country'])
nonla_continentCounts = pd.DataFrame({
    'countries_present': present_counts,
    'proportion_of_continent': share_of_continent,
})

In [231]:
nonla_continentCounts

Unnamed: 0,countries_present,proportion_of_continent
Africa,0,0.0
Antarctica,0,0.0
Asia,0,0.0
Europe,9,0.18
North America,1,0.025
Oceania,0,0.0
South America,0,0.0
non-transformable,0,0.0


To find out how many countries per continent are not represented in the NGA dataset, I take the number of countries per continent in the PyCountry library and subtract the number of countries per continent that are present in the dataset. The difference is stored in nonla_continentCounts_missing.

In [232]:
total_by_continent['Country']

Africa               57
Antarctica            2
Asia                 53
Europe               50
North America        40
Oceania              24
South America        15
non-transformable     8
Name: Country, dtype: int64

In [280]:
# Countries per continent that exist in total_by_continent but are not present in the dataset.
nonla_continentCounts_missing = total_by_continent['Country'].sub(nonla_continentCounts['countries_present'])

In [281]:
# Label the series of missing-country counts.
nonla_continentCounts_missing = nonla_continentCounts_missing.rename('Missing_from_Actual_Distribution')

In [282]:
# Per continent: how many countries are missing, and what fraction of that
# continent's countries (per total_by_continent) this is.
missing_counts = nonla_continentCounts_missing
missing_share = missing_counts.div(total_by_continent['Country'])
nonla_continentCounts_missing = pd.DataFrame({
    'countries_missing': missing_counts,
    'proportion_of_continent': missing_share,
})

In [283]:
nonla_continentCounts_missing

Unnamed: 0,countries_missing,proportion_of_continent
Africa,57,1.0
Antarctica,2,1.0
Asia,53,1.0
Europe,41,0.82
North America,39,0.975
Oceania,24,1.0
South America,15,1.0
non-transformable,8,1.0


In [100]:
#inner join of non_latines and non_latinamerican_art to get the elementwise probs
non_latinamerican_art = pd.merge(non_latinamerican_art, non_latines, how='inner',left_on='nationality', right_on='demonym')

In [238]:
non_latinamerican_art.shape

(32434, 46)

After merging the main non_latinamerican_art dataset with the external geographical information that I created, I will write the updated dataset back to the original filepath and overwrite it as it contains valuable information

In [246]:
non_latinamerican_art.head()

Unnamed: 0,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,deathyear,...,maxpixels,assistivetext,depictstmsobjectid,objectid,constituentid,Country,Continent,Country Name,Continent Name,demonym
0,https://api.nga.gov/iiif/0056d32a-f4a3-44ee-a3...,https://api.nga.gov/iiif/0056d32a-f4a3-44ee-a3...,1.0,Red-necked Grebe,1836,artist,artist after,John James Audubon,1785.0,1851.0,...,,,32439.0,32439.0,122.0,US,NoA,United States,North America,American
1,https://api.nga.gov/iiif/0056d32a-f4a3-44ee-a3...,https://api.nga.gov/iiif/0056d32a-f4a3-44ee-a3...,1.0,Red-necked Grebe,1836,artist,artist after,John James Audubon,1785.0,1851.0,...,,,32439.0,32439.0,122.0,US,NoA,United States,North America,American
2,https://api.nga.gov/iiif/00c13cb9-012b-473a-ba...,https://api.nga.gov/iiif/00c13cb9-012b-473a-ba...,1.0,Bewick's Long-tailed Wren,1827,artist,artist after,John James Audubon,1785.0,1851.0,...,,,32159.0,32159.0,122.0,US,NoA,United States,North America,American
3,https://api.nga.gov/iiif/00c13cb9-012b-473a-ba...,https://api.nga.gov/iiif/00c13cb9-012b-473a-ba...,1.0,Bewick's Long-tailed Wren,1827,artist,artist after,John James Audubon,1785.0,1851.0,...,,,32159.0,32159.0,122.0,US,NoA,United States,North America,American
4,https://api.nga.gov/iiif/00dbc7b2-0300-443b-87...,https://api.nga.gov/iiif/00dbc7b2-0300-443b-87...,1.0,Swallow-tailed Hawk,1829,artist,artist after,John James Audubon,1785.0,1851.0,...,,,32213.0,32213.0,122.0,US,NoA,United States,North America,American


In [243]:
# Overwrites the same CSV path that the notebook reads into non_latinamerican_art at the
# top, now with the merged geography columns included; the row index is not written.
non_latinamerican_art.to_csv('../../../data_samples/art_tables/non_latin_art.csv', index=False)

In [259]:
# Share of artworks per continent name, placed next to the per-continent country counts.
# The series is named 'Continent Name' explicitly: pandas >= 2.0 names the result of
# value_counts(normalize=True) 'proportion', which would break the later lookup of the
# 'Continent Name' column.
continent_share = non_latinamerican_art['Continent Name'].value_counts(normalize=True).rename('Continent Name')
continent_probabilities = pd.concat([nonla_continentCounts, continent_share], axis = 1).dropna(how='all')

In [261]:
# Continents with no artworks have no share value; record them as 0 in a new column.
continent_probabilities = continent_probabilities.assign(
    pct_continent_NGA=continent_probabilities['Continent Name'].fillna(0)
)

In [263]:
# The raw share column is superseded by 'pct_continent_NGA'; remove it.
continent_probabilities = continent_probabilities.drop(columns='Continent Name')

In [267]:
# Positional rename of the three remaining columns
# (previously countries_present, proportion_of_continent, pct_continent_NGA).
continent_probabilities.columns = ['countries_present', 'pct_countries_from_continent_NGA', 'pct_from_continent_NGA']

In [268]:
continent_probabilities

Unnamed: 0,countries_present,pct_countries_from_continent_NGA,pct_from_continent_NGA
Africa,0,0.0,0.0
Antarctica,0,0.0,0.0
Asia,0,0.0,0.0
Europe,9,0.18,0.591293
North America,1,0.025,0.408707
Oceania,0,0.0,0.0
South America,0,0.0,0.0
non-transformable,0,0.0,0.0


In [287]:
# Positional rename of the two columns (previously countries_missing, proportion_of_continent).
nonla_continentCounts_missing.columns = ['countries_missing', 'pct_countries_missing_from_continent_NGA']

In [294]:
# Flag a continent with 1 unless its missing share is below 1 (i.e. unless at least one of its countries is present).
missing_share = nonla_continentCounts_missing['pct_countries_missing_from_continent_NGA']
nonla_continentCounts_missing['is_missing'] = (~missing_share.lt(1)).astype('int64')

I will merge the information about the missing continent/country data with the information about the data that is present, and use the result to check whether a naming issue is causing datapoints to be dropped, so that non-European and non-North-American images and artists are left out of the dataset. This is very likely the case, as I have noticed artists from Asian and African countries who are featured in the NGA but are now missing.

In [295]:
nonla_continentCounts_missing

Unnamed: 0,countries_missing,pct_countries_missing_from_continent_NGA,is_missing
Africa,57,1.0,1
Antarctica,2,1.0,1
Asia,53,1.0,1
Europe,41,0.82,0
North America,39,0.975,0
Oceania,24,1.0,1
South America,15,1.0,1
non-transformable,8,1.0,1


Below is the code that samples data from the dataset, using the missing/not-missing information about countries and continents to weight the selection (not ready yet, as that information is not yet an accurate representation)

In [299]:
nonla_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.378527,US,NoA,United States,North America
1,French,0.347459,FR,EU,France,Europe
2,German,0.119132,DE,EU,Germany,Europe
3,Italian,0.03438,IT,EU,Italy,Europe
4,British,0.028555,GB,EU,United Kingdom,Europe
5,Spanish,0.009595,ES,EU,Spain,Europe
6,Dutch,0.006967,NL,EU,Netherlands,Europe
7,Greek,0.000742,GR,EU,Greece,Europe
8,Swiss,0.000457,CH,EU,Switzerland,Europe
9,Austrian,0.000343,AT,EU,Austria,Europe


In [301]:
# Place the present-country statistics and the missing-country statistics side by side, aligned on continent.
continent_frames = (continent_probabilities, nonla_continentCounts_missing)
nonla_continentStatistics = pd.concat(continent_frames, axis='columns')

In [302]:
# The continent names exist only in the index of nonla_continentStatistics, so the index
# has to be written; with index=False the CSV rows would carry no continent label.
nonla_continentStatistics.to_csv('../../../data_samples/nonLaArt/nonla_continentStatistics.csv', index=True, index_label='Continent Name')

In [121]:
#1580 rows for training (0.8%) & 396 (0.2%) for validation/testing == 1976 for one iteration of sample (1/100 of total dataset) // make sure rows are not selected with replacement, so the duplicate issue resolved in download_nonLa_art does not come back (sampling only for nonLa)
#matching_distribution = np.random.choice(non_latinamerican_art.index, p=percent_from_nonLaContinent, size=1976, replace=False)

In [123]:
#index_matching = list(matching_distribution)

In [125]:
#nonLa_art_subsample = nonLa_art.iloc[index_matching, :]

In [126]:
#subsample_countryCounts = nonLa_art_subsample.groupby('Continent').apply(lambda x: len(x))

In [127]:
#subsample_countryCounts

Continent
Africa           480
Asia             453
Europe           469
North America    384
Oceania          190
dtype: int64

In [128]:
#subsample_countryCounts = pd.DataFrame({'counts': subsample_countryCounts,'proportion':subsample_countryCounts / subsample_countryCounts.sum()})

### Using the probabilities in percent_from_nonLaContinent (with minor correction due to removing the la art causing probabilities to not sum to 1) made the sample about equally representative to overall dataset

In [129]:
#subsample_countryCounts

Unnamed: 0_level_0,counts,proportion
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,480,0.242915
Asia,453,0.229251
Europe,469,0.237348
North America,384,0.194332
Oceania,190,0.096154


In [130]:
#nonla_countryCounts

Unnamed: 0,Countries,proportion
Africa,57,0.247826
Antarctica,2,0.008696
Asia,53,0.230435
Europe,50,0.217391
North America,36,0.156522
Oceania,24,0.104348
South America,0,0.0
non-transformable,8,0.034783


### Step 3 - Feature Engineering IIIFUrl Links to view the data at the desired resolution

### Step 4 - Outputting the final combined NGA/FE data

### Downloading the art and dropping into non_laImages folder TBD in download_nonLA_art notebook!

In [24]:
#nonla_geographicStatistics.to_csv('../../../data_samples/nonLaArt/nonla_geographicStatistics.csv', index=False)

In [132]:
#nonLa_art_subsample.to_csv('../../data_samples/results/processed_subset_results/nonLa_art_sample.csv', index=False)