In [1]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import random

### Step 1: Importing/Exporting of Data

The non_latinamerican_art table has 471542 rows in the database. Skipping the 'bad lines' while importing the CSV removes 7743 of them, leaving 463799 rows.

In [2]:
# Load the raw non-Latin-American art table from CSV.
# on_bad_lines='skip' drops malformed rows instead of raising; low_memory=False parses the file
# in one pass so that column dtypes are inferred from the whole file.
# Original note: the new version expects dtype on import, however as of now only 1 row is impacted.
non_latinamerican_art = pd.read_csv('../../../data_samples/art_tables/non_latinamerican_art.csv', low_memory=False, on_bad_lines='skip')

In [403]:
# Dimensions of the table after the bad lines were skipped on import.
non_latinamerican_art.shape

(463799, 40)

 Saving this column to its own dataframe will allow the EDA notebook 'Geography and Demography' to be run and utilized later

### Step 2 of the LA Art Pipeline - Feature engineering new geographical features

#### List of Continent Codes for Determining which non Latin American countries are present in the Gallery

In [323]:
# Gather the two-letter and three-letter code of every country that pycountry knows about.
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [324]:
# Wrap both code lists in pandas Series so they can be transformed with .apply and used as columns.
cname_alpha_2, cname_alpha_3 = pd.Series(cname_alpha_2), pd.Series(cname_alpha_3)

In [325]:
def _continent_code_or_flag(alpha_2):
    """Return the continent code for a two-letter country code.

    pycountry_convert raises KeyError for country codes it has no continent
    mapping for; those are labelled 'non-transformable' instead of aborting
    the whole conversion.
    """
    try:
        return pc.country_alpha2_to_continent_code(alpha_2)
    except KeyError:
        return 'non-transformable'


# Catching the lookup failure keeps this cell working when pycountry adds or drops
# countries, which a hand-maintained list of failing codes would not.
continent_names = cname_alpha_2.apply(_continent_code_or_flag)

# Codes that could not be converted, derived from the result rather than hard-coded.
error_list = cname_alpha_2[continent_names == 'non-transformable'].tolist()

In [326]:
# Pair every country code with its continent code, one row per country.
country_and_continent = pd.DataFrame({'Country': cname_alpha_2, 'Continent': continent_names})

This is an important aspect of the data as it will pair the countries with their continents and the nationalities of the artists in the dataset

In [327]:
# Preview the first rows of the country/continent pairing.
country_and_continent.head()

Unnamed: 0,Country,Continent
0,AW,
1,AF,AS
2,AO,AF
3,AI,
4,AX,EU


In [328]:
# Number of countries pycountry lists for each continent code.
total_by_continent = country_and_continent.groupby('Continent')['Country'].count()

In [329]:
# Show the per-continent country counts.
total_by_continent

Continent
AF                   57
AN                    2
AS                   53
EU                   50
NA                   40
OC                   24
SA                   15
non-transformable     8
Name: Country, dtype: int64

### Distribution of Countries by Continent, World Wide

In [330]:
# Translate each two-letter country code into its country name.
country_and_continent['Country Name'] = country_and_continent['Country'].map(pc.country_alpha2_to_country_name)

In [331]:
def _continent_label(code):
    """Return the continent name for a continent code, passing the 'non-transformable' marker through unchanged."""
    if code == 'non-transformable':
        return code
    return pc.convert_continent_code_to_continent_name(code)


country_and_continent['Continent Name'] = country_and_continent['Continent'].apply(_continent_label)

In [332]:
# Preview the table again now that the readable country and continent names are attached.
country_and_continent.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,,Anguilla,North America
4,AX,EU,Åland Islands,Europe


In [333]:
# Work on an independent copy of the nationality column so the art table itself is not touched.
constituent_nationalities = non_latinamerican_art['nationality'].copy()

In [334]:
# Share of rows per nationality value (fractions rather than raw counts because normalize=True).
nonla_artist_origin = constituent_nationalities.value_counts(normalize=True)

In [335]:
# Length of the nationality column (one entry per row of the art table).
constituent_nationalities.shape

(463799,)

In [336]:
# Most frequent nationality values and their share of the dataset.
nonla_artist_origin.head()

American    0.704890
French      0.102478
Italian     0.045463
German      0.041107
British     0.029466
Name: nationality, dtype: float64

#### Adding information to select for non_latinamerican_art (todo- write script to create the change in the DB)

In [337]:
# Two-letter codes of the North American countries that this analysis counts as Latin American.
Latin_in_NA = ['BZ', 'CR', 'CU', 'DO', 'SV', 'GT', 'HT', 'HN', 'JM', 'MX', 'NI', 'PA', 'LC']

# Vectorised membership tests: listed North American countries plus everything on continent 'SA'.
latin_in_NA_map = country_and_continent['Country'].isin(Latin_in_NA)
latin_in_SA_map = country_and_continent['Continent'] == 'SA'

# Boolean indexing keeps exactly the flagged rows. Masking through where(...).dropna() would
# additionally discard any flagged row that happens to have a missing value in some column.
latins = country_and_continent.loc[latin_in_SA_map | latin_in_NA_map].reset_index(drop=True)
latins.name = 'Latin Countries'
latins['Country Name'] = latins['Country'].map(pc.country_alpha2_to_country_name)

In [338]:
# Load an external demonyms table; it is joined to the country table on 'Country Name' further
# down to connect each country with its demonym.
demonyms = pd.read_csv('../../../data_samples/results/processed_subset_results/demonyms.csv')

In [339]:
# Demonym of every Latin American country, keyed by two-letter country code.
# Looking the value up by code does not depend on the row order of `latins`, whereas assigning
# a bare list silently mislabels countries if pycountry ever changes its ordering.
LATIN_DEMONYMS = {
    'AR': 'Argentinean',
    'BZ': 'Belizean',
    'BO': 'Bolivian',
    'BR': 'Brazilian',
    'CL': 'Chilean',
    'CO': 'Colombian',
    'CR': 'Costa Rican',
    'CU': 'Cuban',
    'DO': 'Dominican',
    'EC': 'Ecuadorian',
    'FK': 'Falkland Islander',
    'GT': 'Guatemalan',
    'GF': 'Guianese',
    'GY': 'Guyanese',
    'HN': 'Honduran',
    'HT': 'Haitian',
    'JM': 'Jamaican',
    'LC': 'Saint Lucian',
    'MX': 'Mexican',
    'NI': 'Nicaraguan',
    'PA': 'Panamanian',
    'PE': 'Peruvian',
    'PY': 'Paraguayan',
    'GS': 'South Georgian',
    'SV': 'Salvadoran',
    'SR': 'Surinamese',
    'UY': 'Uruguayan',
    'VE': 'Venezuelan',
}
latins['demonym'] = latins['Country'].map(LATIN_DEMONYMS)

# Fail loudly (as the length mismatch of a positional list would) when a country has no entry.
_codes_without_demonym = latins.loc[latins['demonym'].isna(), 'Country'].tolist()
if _codes_without_demonym:
    raise ValueError(f"No demonym defined for Latin American country codes: {_codes_without_demonym}")

In [340]:
# Inspect the Latin American country table with its demonym column.
latins

Unnamed: 0,Country,Continent,Country Name,Continent Name,demonym
0,AR,SA,Argentina,South America,Argentinean
1,BZ,,Belize,North America,Belizean
2,BO,SA,"Bolivia, Plurinational State of",South America,Bolivian
3,BR,SA,Brazil,South America,Brazilian
4,CL,SA,Chile,South America,Chilean
5,CO,SA,Colombia,South America,Colombian
6,CR,,Costa Rica,North America,Costa Rican
7,CU,,Cuba,North America,Cuban
8,DO,,Dominican Republic,North America,Dominican
9,EC,SA,Ecuador,South America,Ecuadorian


In [341]:
# Move the nationality labels out of the index into an ordinary column.
nonla_artist_origin = nonla_artist_origin.reset_index()

In [342]:
# Name the two columns: the nationality label and its share of the dataset.
nonla_artist_origin = nonla_artist_origin.set_axis(['demonym', 'pct_country_NGA'], axis=1)

In [343]:
# Inspect the demonym / share table.
nonla_artist_origin

Unnamed: 0,demonym,pct_country_NGA
0,American,0.704890
1,French,0.102478
2,Italian,0.045463
3,German,0.041107
4,British,0.029466
...,...,...
208,None.,0.000005
209,3262,0.000005
210,4685,0.000005
211,3483,0.000005


#### This will remove the Latin American subset of the data from non_latines, and therefore from nonla_geographicStatistics

In [344]:
# Attach demonyms to the country table; countries without a match in the demonyms table are dropped.
non_latines = country_and_continent.merge(demonyms, how='inner', on='Country Name')

In [345]:
# Store the five text columns with pandas' dedicated 'string' dtype.
non_latines = non_latines.astype(
    dict.fromkeys(['Country', 'Continent', 'Country Name', 'Continent Name', 'demonym'], 'string')
)

In [346]:
# Give the Latin American table the same 'string' dtype on its five text columns.
latins = latins.astype(
    dict.fromkeys(['Country', 'Continent', 'Country Name', 'Continent Name', 'demonym'], 'string')
)

In [347]:
# Keep only the countries that are not in the Latin American table.
# A single vectorised isin replaces one isin(...).any() call per row, and boolean indexing
# removes the rows directly instead of blanking them with where() and dropping them afterwards.
_is_latin_country = non_latines['Country Name'].isin(latins['Country Name'])
non_latines = non_latines.loc[~_is_latin_country]

#### TODO: Some of these countries may be part of Latin America. South America was completely removed after the separation, which is expected.

In [None]:
# Might remove jamaica/belize? from the latinamerican group

In [348]:
# Count rows per country within each continent of the Latin American table, then show only
# the 'North America' group.
latins.groupby('Continent Name')['Country Name'].apply(lambda x: x.value_counts())['North America']

Belize                1
Costa Rica            1
Cuba                  1
Dominican Republic    1
Guatemala             1
Honduras              1
Haiti                 1
Jamaica               1
Saint Lucia           1
Mexico                1
Nicaragua             1
Panama                1
El Salvador           1
Name: Country Name, dtype: Int64

In [None]:
# To include puerto rico in the latinamerican group

In [349]:
# Same breakdown for the non-Latin table: rows per North American country. A count above 1
# means the country appears on several rows of non_latines.
non_latines.groupby('Continent Name')['Country Name'].apply(lambda x: x.value_counts())['North America']

Barbados            4
Canada              4
United States       4
Bermuda             2
Guadeloupe          2
Martinique          2
Puerto Rico         2
Aruba               1
Anguilla            1
Saint Barthélemy    1
Curaçao             1
Cayman Islands      1
Dominica            1
Grenada             1
Greenland           1
Montserrat          1
Name: Country Name, dtype: Int64

In [350]:
# Re-inspect the Latin American table before it is used to filter the demonym shares.
latins

Unnamed: 0,Country,Continent,Country Name,Continent Name,demonym
0,AR,SA,Argentina,South America,Argentinean
1,BZ,,Belize,North America,Belizean
2,BO,SA,"Bolivia, Plurinational State of",South America,Bolivian
3,BR,SA,Brazil,South America,Brazilian
4,CL,SA,Chile,South America,Chilean
5,CO,SA,Colombia,South America,Colombian
6,CR,,Costa Rica,North America,Costa Rican
7,CU,,Cuba,North America,Cuban
8,DO,,Dominican Republic,North America,Dominican
9,EC,SA,Ecuador,South America,Ecuadorian


In [351]:
# Drop every nationality that is a Latin American demonym.
# A single vectorised isin replaces one isin(...).any() call per row, and boolean indexing
# removes the rows directly instead of blanking them with where() and dropping them afterwards.
_is_latin_demonym = nonla_artist_origin['demonym'].isin(latins['demonym'])
nonla_artist_origin = nonla_artist_origin.loc[~_is_latin_demonym]

In [352]:
# Join nationality shares with the country/continent details; only demonyms present in both survive.
nonla_geographicStatistics = nonla_artist_origin.merge(non_latines, on='demonym')

In [353]:
# Number of demonyms that could be linked to a country, and the number of columns.
nonla_geographicStatistics.shape

(42, 6)

In [354]:
# Order the statistics alphabetically by country name.
nonla_geographicStatistics = nonla_geographicStatistics.sort_values('Country Name')

In [355]:
# Renumber the rows 0..n-1 to match the new sort order, discarding the old index.
nonla_geographicStatistics = nonla_geographicStatistics.reset_index(drop=True)

In [356]:
# Preview the first 15 countries of the sorted statistics table.
nonla_geographicStatistics.head(15)

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,Albanian,1.5e-05,AL,EU,Albania,Europe
1,Armenian,5e-06,AM,AS,Armenia,Asia
2,Australian,0.000347,AU,OC,Australia,Oceania
3,Austrian,0.001889,AT,EU,Austria,Europe
4,Belgian,0.000774,BE,EU,Belgium,Europe
5,Bulgarian,3.5e-05,BG,EU,Bulgaria,Europe
6,Canadian,0.00064,CA,,Canada,North America
7,Chinese,0.001939,CN,AS,China,Asia
8,Croatian,0.000134,HR,EU,Croatia,Europe
9,Danish,0.000585,DK,EU,Denmark,Europe


### Proportion of Countries per Continent
Real vs Non-LA Dataset Distributions

In [357]:
# Swap the continent codes in the index for continent names; the marker label is not a
# continent code, so it is kept as it is.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    label if label in remove_from_index else pc.convert_continent_code_to_continent_name(label)
    for label in total_by_continent.index
]

In [358]:
# Each continent's share of all countries counted above.
actual_proportion_of_countries = total_by_continent.div(total_by_continent.sum())

In [359]:
# Combine the raw counts and the shares into a single table indexed by continent.
total_by_continent = total_by_continent.to_frame('Countries').assign(
    proportion_of_countries=actual_proportion_of_countries
)

In [360]:
# Label the table. NOTE(review): this looks like an ad-hoc attribute on the DataFrame object
# rather than a column; confirm anything relies on it, since derived frames will not carry it.
total_by_continent.name = 'Actual Distribution'

In [361]:
# Recode the continent code 'NA' as 'NoA'.
# NOTE(review): presumably so that the code is not read back as a missing value after a CSV
# round trip - confirm.
nonla_geographicStatistics['Continent'] = nonla_geographicStatistics['Continent'].replace({'NA': 'NoA'})

In [362]:
# Number of rows (countries present in the dataset) per continent.
nonla_continentCounts = nonla_geographicStatistics.groupby('Continent Name').size()

 After using the demonym dataset to link the geographic naming data
 with the pct_country_NGA data, some datapoints were lost (the sum of pct is now ~96%) and the number of countries was reduced to ~ 52 (one duplicate)

In [363]:
# Give the per-continent count Series a descriptive name.
nonla_continentCounts = nonla_continentCounts.rename('Countrys_in_Continents')

In [364]:
# Show how many countries of each continent are present in the dataset.
nonla_continentCounts

Continent Name
Africa            6
Asia              6
Europe           25
North America     2
Oceania           3
Name: Countrys_in_Continents, dtype: int64

#### Adding pct_continent_in_NGA to nonla_ContinentCounts using the pct_in_NGA column and Continent Name from nonla_geographicStatistics

In [365]:
# Add up the country shares within each continent to get the continent's share of the dataset.
pct_continent_NGA = nonla_geographicStatistics.groupby('Continent Name')['pct_country_NGA'].agg('sum')

In [366]:
# Show each continent's share of the dataset.
pct_continent_NGA

Continent Name
Africa           0.000233
Asia             0.004086
Europe           0.249498
North America    0.705530
Oceania          0.000362
Name: pct_country_NGA, dtype: float64

In [367]:
# Per-continent country counts again, for comparison with the shares above.
nonla_continentCounts

Continent Name
Africa            6
Asia              6
Europe           25
North America     2
Oceania           3
Name: Countrys_in_Continents, dtype: int64

The goal is to make nonla_continentCounts a dataframe similar to total_by_continent and use this to measure distribution by geography

In [368]:
# World-wide reference table: countries per continent and their share of all countries.
total_by_continent

Unnamed: 0,Countries,proportion_of_countries
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


#### No artists in the NGA dataset were credited as having Antarctic/South American (after LatinAmerican data was removed) nationality & the non-transformable index was leftover from try-catching the PyCountry transformation of ISO codes, so they will be removed.

In [369]:
# Remove the rows that have no counterpart in the dataset: Antarctica, South America and the
# marker row for country codes that could not be converted.
# errors='ignore' keeps the cell re-runnable: on a second run the labels are already gone,
# and the individual drops would raise KeyError.
total_by_continent.drop(
    index=['Antarctica', 'non-transformable', 'South America'],
    errors='ignore',
    inplace=True,
)

In [370]:
# Countries present per continent, and that count as a fraction of all countries on the continent.
_present_share = nonla_continentCounts.div(total_by_continent['Countries'])
nonla_continentCounts = pd.DataFrame(
    {
        'countries_present': nonla_continentCounts,
        'proportion_of_continent': _present_share,
    }
)

#### This table shows world-wide share of countries and their proportion out of the total number of countries.

In [371]:
# Reference table after removing the continents without data.
total_by_continent

Unnamed: 0,Countries,proportion_of_countries
Africa,57,0.228916
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386


#### This table shows countries and continents present in the NGA dataset and the proportion of countries present from the continent. (e.g. 50% of countries from europe are not present in the NGA)

In [372]:
# Add each continent's share of the dataset as a third column (aligned on the continent index).
nonla_continentCounts = nonla_continentCounts.assign(pct_continent_NGA=pct_continent_NGA)

In [373]:
# Per continent: countries present, fraction of the continent's countries, share of the dataset.
nonla_continentCounts

Unnamed: 0_level_0,countries_present,proportion_of_continent,pct_continent_NGA
Continent Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,6,0.105263,0.000233
Asia,6,0.113208,0.004086
Europe,25,0.5,0.249498
North America,2,0.05,0.70553
Oceania,3,0.125,0.000362


#### To find out how many countries of each continent are not included in the NGA dataset, I will take the total number of countries per continent in the PyCountry library and subtract the countries visible within the dataset for that continent. The difference will be called nonla_continentCounts_missing.

In [374]:
# Countries per continent that do not appear in the dataset: all countries minus those present.
nonla_continentCounts_missing = total_by_continent['Countries'].sub(nonla_continentCounts['countries_present'])

In [375]:
# Give the Series of missing-country counts a descriptive name.
nonla_continentCounts_missing = nonla_continentCounts_missing.rename('Missing_from_Actual_Distribution')

In [376]:
# Missing countries per continent, and that count as a fraction of all countries on the continent.
_missing_share = nonla_continentCounts_missing.div(total_by_continent['Countries'])
nonla_continentCounts_missing = pd.DataFrame(
    {
        'countries_missing': nonla_continentCounts_missing,
        'proportion_of_continent': _missing_share,
    }
)

In [381]:
# Turn the continent index into an ordinary column so the table can be merged on it.
nonla_continentCounts_missing = nonla_continentCounts_missing.reset_index()

In [383]:
# Name the columns explicitly; the first one holds the continent name used as the merge key.
nonla_continentCounts_missing = nonla_continentCounts_missing.set_axis(
    ['Continent Name', 'countries_missing', 'proportion_of_continent'], axis=1
)

#### The number of columns before adding information about the specific geographical details of the artists' nationalities as well as additional statistical information about the countries and continents representation within the dataset

In [385]:
# Attach the number of missing countries of its continent to every country row.
_missing_per_continent = nonla_continentCounts_missing[['Continent Name', 'countries_missing']]
nonla_geographicStatistics = nonla_geographicStatistics.merge(
    _missing_per_continent, how='inner', on='Continent Name'
)

In [386]:
# Turn the continent index into an ordinary 'Continent Name' column for the merge that follows.
nonla_continentCounts = nonla_continentCounts.reset_index()

In [387]:
# Put the continent-level statistics in front of the country-level columns, matched by continent.
nonla_geographicStatistics = nonla_continentCounts.merge(nonla_geographicStatistics, on='Continent Name')

In [389]:
# Dimensions of the art table before the geographic statistics are merged in.
non_latinamerican_art.shape

(463799, 40)

I will merge the information about the missing continent/country data with the information about the data which is contained and use this to track if there is a naming issue causing datapoints to be dropped and non-EU or non-NA images and artists to not be included in the dataset - this is a given as I have noticed artists from asian and african countries featured in the NGA but that are now missing

In [390]:
# Inner join of the art table with the geographic statistics, matching each row's nationality
# to a demonym. Rows whose nationality has no matching demonym are dropped by the inner join.
non_latinamerican_art = non_latinamerican_art.merge(
    nonla_geographicStatistics,
    how='inner',
    left_on='nationality',
    right_on='demonym',
)

#### Feature Engineering IIIFUrl Links to view the data at the desired resolution

In [393]:
# Build a larger-image URL from the thumbnail URL by swapping the '!200,200' size token for
# '!640,640'.
# The vectorised .str accessor leaves missing URLs as missing instead of raising AttributeError
# as a per-element x.replace(...) does. regex=False makes the match a literal substring.
non_latinamerican_art['expanded_url'] = non_latinamerican_art['iiifthumburl'].str.replace(
    '!200,200', '!640,640', regex=False
)

In [394]:
# Dimensions after the merge and the added 'expanded_url' column.
non_latinamerican_art.shape

(387064, 51)

After merging the main non_latinamerican_art dataset with the external geographical information that I created, I will write the updated dataset back to the original filepath and overwrite it as it contains valuable information

In [73]:
# Preview the merged table with the geographic columns attached.
non_latinamerican_art.head()

Unnamed: 0,uuid,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,...,constituentid,Continent Name,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name
0,00004dec-8300-4487-8d89-562d0126b6a1,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,...,2163.0,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France
1,00004dec-8300-4487-8d89-562d0126b6a1,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,...,2163.0,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France
2,00018ee2-2b87-444d-afbf-b5d916306d2b,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890.0,...,4397.0,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France
3,00018ee2-2b87-444d-afbf-b5d916306d2b,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890.0,...,4397.0,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France
4,0002991a-98fa-42bd-bb33-fd0e2ff1ed1f,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,1,Fourth Book: Daphnis Plays to His Goats (Daphn...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861.0,...,2163.0,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France


### Step 3 - Sampling from the large dataset for batch training

In [306]:
# Draw the row labels for one training/validation batch: 34930 rows in total, of which 30970
# (~8% of the merged table) are meant for training and 3960 (~1%) for validation/testing.
# replace=False guarantees that no row is selected twice.
# A seeded Generator makes the draw reproducible after a kernel restart.
# NOTE(review): every row is weighted by its country's share, so large countries are favoured
# twice (more rows AND a higher weight per row). Drop `p` if the sample should simply mirror
# the country distribution of the table.
SAMPLE_SEED = 42
SAMPLE_SIZE = 34930
_row_weights = non_latinamerican_art['pct_country_NGA'].to_numpy(dtype=float)
matching_distribution = np.random.default_rng(SAMPLE_SEED).choice(
    non_latinamerican_art.index.to_numpy(),
    size=SAMPLE_SIZE,
    replace=False,
    p=_row_weights / _row_weights.sum(),
)

In [307]:
# Unpack the sampled row labels from the NumPy array into a plain list.
index_matching = [*matching_distribution]

In [308]:
# The sampled values are index labels drawn from non_latinamerican_art.index, so select the rows
# by label with .loc. Positional .iloc only returns the same rows while the index happens to be
# 0..n-1, and silently picks the wrong rows otherwise.
subsample_nonla = non_latinamerican_art.loc[index_matching, :]

In [309]:
# Number of sampled rows per country.
subsample_geography = subsample_nonla.groupby('Country Name').size()

In [310]:
# Show how many sampled rows each country contributes.
subsample_geography

Country Name
France              763
Germany             139
Italy               154
Netherlands          22
Spain                 1
Switzerland           1
United Kingdom       62
United States     33788
dtype: int64

In [311]:
# Row count per country together with that count as a fraction of the whole sample.
_sample_share = subsample_geography.div(subsample_geography.sum())
subsample_geography = pd.DataFrame(
    {
        'counts': subsample_geography,
        'proportion': _sample_share,
    }
)

### Using the normalized probabilities in pct_country_NGA to sample ~10% made the sample less representative of the overall dataset (for example, the United States makes up ~96.7% of the sample versus ~70.5% of the dataset). Since this is only a ~10% sample, it is not fully representative, but I expect to train with more samples/batches.

In [312]:
# Countries in the sample, largest share first (display only; the table itself is not reordered).
subsample_geography.sort_values(by='proportion', ascending=False)

Unnamed: 0_level_0,counts,proportion
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,33788,0.967306
France,763,0.021844
Italy,154,0.004409
Germany,139,0.003979
United Kingdom,62,0.001775
Netherlands,22,0.00063
Spain,1,2.9e-05
Switzerland,1,2.9e-05


In [396]:
# The ten countries with the largest share of the full dataset, for comparison with the sample.
nonla_geographicStatistics.sort_values(by='pct_country_NGA', ascending=False).head(10)

Unnamed: 0,Continent Name,countries_present,proportion_of_continent,pct_continent_NGA,demonym,pct_country_NGA,Country,Continent,Country Name,countries_missing
38,North America,2,0.05,0.70553,American,0.70489,US,NoA,United States,38
19,Europe,25,0.5,0.249498,French,0.102478,FR,EU,France,25
24,Europe,25,0.5,0.249498,Italian,0.045463,IT,EU,Italy,25
20,Europe,25,0.5,0.249498,German,0.041107,DE,EU,Germany,25
36,Europe,25,0.5,0.249498,British,0.029466,GB,EU,United Kingdom,25
27,Europe,25,0.5,0.249498,Dutch,0.016821,NL,EU,Netherlands,25
33,Europe,25,0.5,0.249498,Spanish,0.003908,ES,EU,Spain,25
35,Europe,25,0.5,0.249498,Swiss,0.003898,CH,EU,Switzerland,25
7,Asia,6,0.113208,0.004086,Chinese,0.001939,CN,AS,China,47
13,Europe,25,0.5,0.249498,Austrian,0.001889,AT,EU,Austria,25


### Step 4 - Outputting the final combined NGA/FE data

In [395]:
# Write the merged table back to the same CSV path that the first load cell reads from.
# NOTE(review): because input and output share one path, re-running the notebook from the top
# would load this already-merged table and merge the geographic statistics onto it a second
# time. Consider a separate output path or keeping a copy of the raw file.
non_latinamerican_art.to_csv('../../../data_samples/art_tables/non_latinamerican_art.csv', index=False)

In [397]:
# Renumber the sampled rows 0..n-1, discarding the labels they had in the full table.
subsample_nonla = subsample_nonla.reset_index(drop=True)

In [399]:
# Save the per-country / per-continent statistics table as CSV, without the row index.
nonla_geographicStatistics.to_csv('../../../data_samples/nonLaArt/nonla_geographicStatistics.csv', index=False)

In [400]:
# Save the sampled batch as CSV, without the row index.
subsample_nonla.to_csv('../../../data_samples/results/processed_subset_results/non_latinamericanart_sample.csv', index=False)