In [1]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import random

### Step 1: Importing/Exporting of Data

The non_latinamerican_art dataset has 471542 rows in the database; skipping the 'bad lines' on import removes 7743 of them, leaving 463799 rows.

In [10]:
# low_memory=False parses the file in one pass rather than in chunks, which avoids
# mixed-type inference without having to pass dtype= (the original note says only
# one row is affected). on_bad_lines='skip' silently drops rows the parser rejects.
non_latin_art_path = '../../../data_samples/art_tables/non_latin_art.csv'
non_latinamerican_art = pd.read_csv(
    non_latin_art_path,
    low_memory=False,
    on_bad_lines='skip',
)

In [19]:
non_latinamerican_art.shape

(463799, 40)

 Saving this column to its own dataframe allows the EDA notebook 'Geography and Demography' to be run and used later.

In [20]:
# Write only the 'nationality' column (with header, without the row index)
# to the constituents nationalities CSV file.
nationalities_path = '../../../data_samples/nonLaArt/constituents_nationalities.csv'
non_latinamerican_art['nationality'].to_csv(nationalities_path, index=False)

### Step 2 of the La Art Pipeline: Feature Engineering New Geographical Features

#### List of Continent Codes for Determining which non Latin American countries are present in the Gallery

In [21]:
# Collect the ISO 3166 alpha-2 and alpha-3 code of every country pycountry knows,
# in pycountry's iteration order.
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [22]:
# Wrap both code lists in pandas Series so element-wise .apply can be used on them.
cname_alpha_2, cname_alpha_3 = (
    pd.Series(codes) for codes in (cname_alpha_2, cname_alpha_3)
)

In [23]:
# Alpha-2 codes that are mapped to the 'non-transformable' placeholder instead of
# being passed to pycountry_convert.
error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']


def _continent_code_or_placeholder(alpha2):
    """Return the continent code for an ISO alpha-2 code, or 'non-transformable'.

    Codes in ``error_list`` are never looked up. Any other code that
    pycountry_convert rejects with a KeyError also gets the placeholder, so a
    pycountry release that adds a country does not crash the cell.
    """
    if alpha2 in error_list:
        return 'non-transformable'
    try:
        return pc.country_alpha2_to_continent_code(alpha2)
    except KeyError:
        return 'non-transformable'


continent_names = cname_alpha_2.apply(_continent_code_or_placeholder)

In [24]:
# Two-column lookup table: one row per alpha-2 country code with its continent code.
country_and_continent = pd.DataFrame({
    'Country': cname_alpha_2,
    'Continent': continent_names,
})

This is an important aspect of the data as it will pair the countries with their continents and the nationalities of the artists in the dataset

In [25]:
country_and_continent.head()

Unnamed: 0,Country,Continent
0,AW,
1,AF,AS
2,AO,AF
3,AI,
4,AX,EU


In [26]:
# Number of country codes per continent code. 'Country' is selected explicitly so
# the result keeps a single count column even if this cell is re-run after the
# 'Country Name' / 'Continent Name' columns have been added to the frame.
total_by_continent = country_and_continent.groupby('Continent')[['Country']].count()

### Distribution of Countries by Continent, World Wide

In [27]:
# Translate each alpha-2 code into its country name via pycountry_convert.
country_and_continent['Country Name'] = country_and_continent['Country'].map(
    pc.country_alpha2_to_country_name
)

In [28]:
def _continent_label(code):
    """Return the continent name for a continent code; pass the placeholder through."""
    if code == 'non-transformable':
        return code
    return pc.convert_continent_code_to_continent_name(code)


country_and_continent['Continent Name'] = country_and_continent['Continent'].map(_continent_label)

In [29]:
country_and_continent.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,,Anguilla,North America
4,AX,EU,Åland Islands,Europe


In [30]:
# Independent copy of the nationality column, so later work does not touch the art table.
constituent_nationalities = non_latinamerican_art['nationality'].copy()

In [31]:
# Relative frequency of each nationality value (fractions summing to 1 over the
# non-missing entries), ordered from most to least common.
nonla_artist_origin = constituent_nationalities.value_counts(normalize=True)

In [32]:
constituent_nationalities.shape

(463799,)

In [35]:
nonla_artist_origin.head()

American    0.704890
French      0.102478
Italian     0.045463
German      0.041107
British     0.029466
Name: nationality, dtype: float64

In [36]:
# Move the nationality labels out of the index into a regular column.
nonla_artist_origin = nonla_artist_origin.reset_index()

In [37]:
# Rename the two columns by position: the nationality label becomes 'demonym' (the
# join key used against the demonyms table) and its share becomes 'pct_country_NGA'.
nonla_artist_origin.columns = ['demonym', 'pct_country_NGA']

In [38]:
#the assistance of an outside demonyms table which has a key to connect Country to Demonym
# External lookup table linking countries to demonyms; the merges that follow join
# on its 'Country Name' and 'demonym' columns.
demonyms_path = '../../../data_samples/results/processed_subset_results/demonyms.csv'
demonyms = pd.read_csv(demonyms_path)

In [39]:
# Attach demonyms to the country/continent table; the inner join keeps only
# countries whose 'Country Name' appears in both tables.
non_latines = country_and_continent.merge(
    demonyms,
    how='inner',
    on='Country Name',
)

In [40]:
# Link each nationality share to its country and continent through the demonym;
# nationalities without a matching demonym are dropped by the inner join.
nonla_geographicStatistics = nonla_artist_origin.merge(
    non_latines,
    how='inner',
    on='demonym',
)

#### Notice: Some of the datapoints in this table are duplicated and may need to be removed, due to the existence of several demonyms for each country (one country, many demonyms). This could be resolved by choosing a single demonym per country.

In [52]:
# Remove the row labelled 11 and renumber the index from 0.
# NOTE(review): 11 is a hard-coded label, presumably the duplicated row from the
# notice above; running this cell twice removes a second, different row. Confirm
# the target row, or deduplicate on a column instead.
nonla_geographicStatistics = (
    nonla_geographicStatistics
    .drop(index=11)
    .reset_index(drop=True)
)

In [57]:
nonla_geographicStatistics.head(15)

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.70489,US,NoA,United States,North America
1,French,0.102478,FR,EU,France,Europe
2,Italian,0.045463,IT,EU,Italy,Europe
3,German,0.041107,DE,EU,Germany,Europe
4,British,0.029466,GB,EU,United Kingdom,Europe
5,Dutch,0.016821,NL,EU,Netherlands,Europe
6,Spanish,0.003908,ES,EU,Spain,Europe
7,Swiss,0.003898,CH,EU,Switzerland,Europe
8,Chinese,0.001939,CN,AS,China,Asia
9,Austrian,0.001889,AT,EU,Austria,Europe


### Proportion of Countries per Continent
Real vs Non-LA Dataset Distributions

In [43]:
# Replace continent codes in the index with continent names, leaving the
# 'non-transformable' placeholder untouched.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    label if label in remove_from_index else pc.convert_continent_code_to_continent_name(label)
    for label in total_by_continent.index
]

In [44]:
# Each continent's share of all country codes (the shares sum to 1).
country_counts = total_by_continent['Country']
total_by_continent['proportion'] = country_counts.div(country_counts.sum())

In [45]:
# Tag the table with a descriptive label.
# NOTE(review): total_by_continent is a DataFrame, which unlike a Series has no
# built-in `name`; this looks like an ad-hoc attribute that pandas neither displays
# nor carries over to derived frames. Confirm that anything reads it.
total_by_continent.name = 'Actual Distribution'

In [46]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


In [47]:
# Recode the North America continent code from 'NA' to 'NoA'.
# Presumably this keeps it from being read back as a missing value after a CSV round trip; TODO confirm.
nonla_geographicStatistics['Continent'] = nonla_geographicStatistics['Continent'].replace({'NA': 'NoA'})

In [78]:
# Number of rows (countries matched through a demonym) per continent name.
# GroupBy.size() returns the same per-group row count as apply(len), without
# calling a Python function per group or triggering the groupby-apply
# deprecation warning in recent pandas.
nonla_continentCounts = nonla_geographicStatistics.groupby('Continent Name').size()

 After using the demonym dataset to link the geographic naming data
 with the pct_country_NGA data, some datapoints were lost (the sum of pct is now ~96%) and the number of countries was reduced to ~52 (including one duplicate).

In [58]:
nonla_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.70489,US,NoA,United States,North America
1,French,0.102478,FR,EU,France,Europe
2,Italian,0.045463,IT,EU,Italy,Europe
3,German,0.041107,DE,EU,Germany,Europe
4,British,0.029466,GB,EU,United Kingdom,Europe
5,Dutch,0.016821,NL,EU,Netherlands,Europe
6,Spanish,0.003908,ES,EU,Spain,Europe
7,Swiss,0.003898,CH,EU,Switzerland,Europe
8,Chinese,0.001939,CN,AS,China,Asia
9,Austrian,0.001889,AT,EU,Austria,Europe


In [79]:
# Name the per-continent count Series; the name appears in its printed output.
nonla_continentCounts.name = 'Countrys_in_Continents'

In [80]:
nonla_continentCounts

Continent Name
Africa            6
Asia              6
Europe           25
North America     6
Oceania           3
South America     7
Name: Countrys_in_Continents, dtype: int64

The goal is to make nonla_continentCounts a dataframe similar to total_by_continent and use this to measure distribution by geography

In [228]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


#### No artists in the NGA dataset were credited as having Antarctic nationality, and the non-transformable index entry was left over from handling the ISO codes that the PyCountry transformation could not convert, so both will be removed.

In [84]:
# Remove the Antarctica row in place; this raises KeyError if the cell is run a second time.
total_by_continent.drop(index='Antarctica', inplace=True)

In [86]:
# Remove the placeholder row in place; this raises KeyError if the cell is run a second time.
total_by_continent.drop(index='non-transformable', inplace=True)

In [92]:
# Rebuild nonla_continentCounts as a DataFrame: the raw count per continent plus
# that count divided by the continent's total number of countries.
# The name is rebound from a Series to a DataFrame, so run this cell only once.
present_counts = nonla_continentCounts
present_share = present_counts / total_by_continent['Country']
nonla_continentCounts = pd.DataFrame({
    'countries_present': present_counts,
    'proportion_of_continent': present_share,
})

#### This table shows world-wide share of countries and their proportion out of the total number of countries.

In [94]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241


#### This table shows the countries present in the NGA dataset and the proportion of each continent's countries that are present (e.g. 50% of the countries in Europe are not present in the NGA).

In [93]:
nonla_continentCounts

Unnamed: 0_level_0,countries_present,proportion_of_continent
Continent Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,6,0.105263
Asia,6,0.113208
Europe,25,0.5
North America,6,0.15
Oceania,3,0.125
South America,7,0.466667


#### To find out how many countries are not included in the NGA dataset, I will take the total number of countries per continent in the PyCountry library and subtract the countries visible within the dataset for that continent. The difference will be called nonla_continentCounts_missing.

In [95]:
# Countries per continent that do not appear in the dataset: total minus present,
# aligned on the continent-name index.
nonla_continentCounts_missing = total_by_continent['Country'].sub(
    nonla_continentCounts['countries_present']
)

In [96]:
# Name the Series of missing-country counts.
nonla_continentCounts_missing.name = 'Missing_from_Actual_Distribution'

In [97]:
# Rebuild nonla_continentCounts_missing as a DataFrame: the missing count per
# continent plus that count divided by the continent's total number of countries.
# The name is rebound from a Series to a DataFrame, so run this cell only once.
missing_counts = nonla_continentCounts_missing
missing_share = missing_counts / total_by_continent['Country']
nonla_continentCounts_missing = pd.DataFrame({
    'countries_missing': missing_counts,
    'proportion_of_continent': missing_share,
})

In [98]:
nonla_continentCounts_missing

Unnamed: 0,countries_missing,proportion_of_continent
Africa,51,0.894737
Asia,47,0.886792
Europe,25,0.5
North America,34,0.85
Oceania,21,0.875
South America,8,0.533333


In [99]:
# Attach the country/continent columns to every artwork row by matching the
# artist's nationality against the demonym lookup. The inner join keeps only rows
# whose nationality has a matching demonym.
non_latinamerican_art = non_latinamerican_art.merge(
    non_latines,
    how='inner',
    left_on='nationality',
    right_on='demonym',
)

In [100]:
non_latinamerican_art.shape

(388230, 45)

After merging the main non_latinamerican_art dataset with the external geographical information that I created, I will write the updated dataset back to the original filepath and overwrite it as it contains valuable information

In [101]:
non_latinamerican_art.head()

Unnamed: 0,uuid,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,...,maxpixels,assistivetext,depictstmsobjectid,objectid,constituentid,Country,Continent,Country Name,Continent Name,demonym
0,00004dec-8300-4487-8d89-562d0126b6a1,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861,...,640,,11975,11975,2163.0,FR,EU,France,Europe,French
1,00004dec-8300-4487-8d89-562d0126b6a1,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,https://api.nga.gov/iiif/00004dec-8300-4487-8d...,1,First Book: Daphnis Playing His Pipe for Chloe...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861,...,640,,11975,11975,2163.0,FR,EU,France,Europe,French
2,00018ee2-2b87-444d-afbf-b5d916306d2b,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890,...,640,,43122,43122,4397.0,FR,EU,France,Europe,French
3,00018ee2-2b87-444d-afbf-b5d916306d2b,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,https://api.nga.gov/iiif/00018ee2-2b87-444d-af...,1,Rose and Romaine,"French, 1890 - 1968",artist,artist,Valentine Hugo,1890,...,640,,43122,43122,4397.0,FR,EU,France,Europe,French
4,0002991a-98fa-42bd-bb33-fd0e2ff1ed1f,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,https://api.nga.gov/iiif/0002991a-98fa-42bd-bb...,1,Fourth Book: Daphnis Plays to His Goats (Daphn...,"French, 1861 - 1944",artist,artist,Aristide Maillol,1861,...,640,,12011,12011,2163.0,FR,EU,France,Europe,French


In [311]:
# Write the merged table back over the CSV it was loaded from, without the row index.
# NOTE(review): this is the same path the import step at the top of the notebook
# reads, so a later Restart & Run All starts from the already merged, inner-join
# filtered table instead of the raw export. Confirm that is intended, or write
# to a separate file.
non_latinamerican_art.to_csv('../../../data_samples/art_tables/non_latin_art.csv', index=False)

In [259]:
# Share of artwork rows per continent. The Series is named explicitly because
# pandas >= 2.0 names value_counts(normalize=True) results 'proportion' rather than
# after the source column, and the following cells read the 'Continent Name' column.
continent_share = (
    non_latinamerican_art['Continent Name']
    .value_counts(normalize=True)
    .rename('Continent Name')
)
# Place the shares next to the per-continent country counts, aligned on continent
# name, and discard rows where every value is missing.
continent_probabilities = pd.concat(
    [nonla_continentCounts, continent_share],
    axis=1,
).dropna(how='all')

In [261]:
# Copy the per-continent share into a descriptively named column, using 0 for
# continents that have no value in the 'Continent Name' share column.
continent_probabilities['pct_continent_NGA'] = continent_probabilities['Continent Name'].fillna(0)

In [263]:
# The share now lives in 'pct_continent_NGA', so remove the original column in place.
continent_probabilities.drop(columns='Continent Name', inplace=True)

In [267]:
# Rename all three columns by position; this requires the frame to have exactly
# three columns at this point.
continent_probabilities.columns = ['countries_present', 'pct_countries_from_continent_NGA', 'pct_from_continent_NGA']

In [268]:
continent_probabilities

Unnamed: 0,countries_present,pct_countries_from_continent_NGA,pct_from_continent_NGA
Africa,0,0.0,0.0
Antarctica,0,0.0,0.0
Asia,0,0.0,0.0
Europe,9,0.18,0.591293
North America,1,0.025,0.408707
Oceania,0,0.0,0.0
South America,0,0.0,0.0
non-transformable,0,0.0,0.0


In [287]:
# Rename both columns by position so the proportion column states what it measures;
# this requires the frame to have exactly two columns at this point.
nonla_continentCounts_missing.columns = ['countries_missing', 'pct_countries_missing_from_continent_NGA']

In [294]:
# Flag continents with no country in the dataset: 0 when the missing share is
# below 1, otherwise 1 (a missing share that is NaN is also flagged as 1).
missing_share = nonla_continentCounts_missing['pct_countries_missing_from_continent_NGA']
nonla_continentCounts_missing['is_missing'] = missing_share.lt(1).map({True: 0, False: 1})

I will merge the information about the missing continent/country data with the information about the data that is present, and use the result to check whether a naming issue is causing datapoints to be dropped, so that non-EU and non-NA images and artists are excluded from the dataset. This is likely, as I have noticed artists from Asian and African countries who are featured in the NGA but are now missing.

In [295]:
nonla_continentCounts_missing

Unnamed: 0,countries_missing,pct_countries_missing_from_continent_NGA,is_missing
Africa,57,1.0,1
Antarctica,2,1.0,1
Asia,53,1.0,1
Europe,41,0.82,0
North America,39,0.975,0
Oceania,24,1.0,1
South America,15,1.0,1
non-transformable,8,1.0,1


Below is the code that samples data from the dataset, using the missing/not-missing information for country and continent data to influence its decision (not ready yet, as the information is not yet an accurate representation).

In [299]:
nonla_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,American,0.378527,US,NoA,United States,North America
1,French,0.347459,FR,EU,France,Europe
2,German,0.119132,DE,EU,Germany,Europe
3,Italian,0.03438,IT,EU,Italy,Europe
4,British,0.028555,GB,EU,United Kingdom,Europe
5,Spanish,0.009595,ES,EU,Spain,Europe
6,Dutch,0.006967,NL,EU,Netherlands,Europe
7,Greek,0.000742,GR,EU,Greece,Europe
8,Swiss,0.000457,CH,EU,Switzerland,Europe
9,Austrian,0.000343,AT,EU,Austria,Europe


In [301]:
# Combine the present and missing statistics into one table, side by side,
# aligned on the continent-name index.
nonla_continentStatistics = pd.concat(
    (continent_probabilities, nonla_continentCounts_missing),
    axis='columns',
)

In [302]:
# The continent names exist only in the index of this table, so the index is
# written out as a labelled first column; with index=False the rows in the CSV
# could not be attributed to a continent.
nonla_continentStatistics.to_csv(
    '../../../data_samples/nonLaArt/nonla_continentStatistics.csv',
    index=True,
    index_label='Continent Name',
)

In [121]:
#1580 rows for training (0.008%) & 396 (0.002%) for validation/testing == 1976 for one iteration of sample (1/100 of total dataset) // make sure not being selected with replacement to remove duplicate issue resolved in download_nonLa_art (sampling only for nonLa)
#matching_distribution = np.random.choice(non_latinamerican_art.index, p=percent_from_nonLaContinent, size=1976, replace=False)

In [123]:
#index_matching = list(matching_distribution)

In [125]:
#nonLa_art_subsample = nonLa_art.iloc[index_matching, :]

In [126]:
#subsample_countryCounts = nonLa_art_subsample.groupby('Continent').apply(lambda x: len(x))

In [127]:
#subsample_countryCounts

Continent
Africa           480
Asia             453
Europe           469
North America    384
Oceania          190
dtype: int64

In [128]:
#subsample_countryCounts = pd.DataFrame({'counts': subsample_countryCounts,'proportion':subsample_countryCounts / subsample_countryCounts.sum()})

### Using the probabilities in percent_from_nonLaContinent (with minor correction due to removing the la art causing probabilities to not sum to 1) made the sample about equally representative to overall dataset

In [129]:
#subsample_countryCounts

Unnamed: 0_level_0,counts,proportion
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,480,0.242915
Asia,453,0.229251
Europe,469,0.237348
North America,384,0.194332
Oceania,190,0.096154


In [130]:
#nonla_countryCounts

Unnamed: 0,Countries,proportion
Africa,57,0.247826
Antarctica,2,0.008696
Asia,53,0.230435
Europe,50,0.217391
North America,36,0.156522
Oceania,24,0.104348
South America,0,0.0
non-transformable,8,0.034783


### Step 3 - Feature Engineering IIIFUrl Links to view the data at the desired resolution

### Step 4 - Outputting the final combined NGA/FE data

### Downloading the art and dropping into non_laImages folder TBD in download_nonLA_art notebook!

In [24]:
#nonla_geographicStatistics.to_csv('../../../data_samples/nonLaArt/nonla_geographicStatistics.csv', index=False)

In [132]:
#nonLa_art_subsample.to_csv('../../data_samples/results/processed_subset_results/nonLa_art_sample.csv', index=False)