In [2]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import matplotlib

### Step 1: Importing/Exporting of Data

In [18]:
# Newer pandas versions expect an explicit dtype on import; as of now only one row is impacted.
# Rows that cannot be parsed are skipped rather than raising (on_bad_lines='skip').
latinamerican_art = pd.read_csv(
    '../../../data_samples/art_tables/latin_art.csv',
    on_bad_lines='skip',
)

Saving this column to its own dataframe will allow the EDA notebook 'Geography and Demography' to be run and utilized later.

In [20]:
# Persist only the 'nationality' feature to the constituents-nationalities CSV file
# (index=False keeps the row index out of the file).
latinamerican_art['nationality'].to_csv(
    '../../../data_samples/LaArt/constituents_nationalities.csv',
    index=False,
)

In [24]:
# Load a standalone copy of the constituents' 'nationality' column that was
# exported from the latin_art.csv dataset.
constituent_nationalities = pd.read_csv(
    '../../../data_samples/LaArt/constituents_nationalities.csv',
)

In [25]:
# Sanity check: (rows, columns) of the table loaded from latin_art.csv.
latinamerican_art.shape

(342, 46)

### Step 2 of the La Art Pipeline: Feature Engineering New Geographical Features

#### List of American Continent Codes for Determining which Latin American countries are present in the Gallery.

In [26]:
# Collect the alpha-2 and alpha-3 codes of every country known to pycountry,
# in pycountry's own iteration order (both lists stay position-aligned).
cname_alpha_2 = [entry.alpha_2 for entry in pycountry.countries]
cname_alpha_3 = [entry.alpha_3 for entry in pycountry.countries]

In [27]:
# Promote both code lists to pandas Series so they can be transformed with .apply().
cname_alpha_2, cname_alpha_3 = map(pd.Series, (cname_alpha_2, cname_alpha_3))

In [28]:
# Alpha-2 codes that are excluded from the continent lookup and labelled
# 'non-transformable' instead (presumably because pycountry_convert cannot map them — confirm).
error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']


def _continent_code_or_placeholder(alpha_2):
    """Return the continent code for ``alpha_2``, or 'non-transformable' if it is in ``error_list``."""
    if alpha_2 in error_list:
        return 'non-transformable'
    return pc.country_alpha2_to_continent_code(alpha_2)


continent_names = cname_alpha_2.apply(_continent_code_or_placeholder)

In [29]:
# Two-column lookup table: one row per country code with its continent code.
country_and_continent = pd.DataFrame({
    'Country': cname_alpha_2,
    'Continent': continent_names,
})

In [30]:
# Number of non-null entries per continent code, for every remaining column.
total_by_continent = (
    country_and_continent
    .groupby(by='Continent')
    .count()
)

In [31]:
# Relabel the index from continent codes to continent names; labels listed in
# remove_from_index are not continent codes and are kept as they are.
remove_from_index = ['non-transformable']
total_by_continent.index = [
    label if label in remove_from_index else pc.convert_continent_code_to_continent_name(label)
    for label in total_by_continent.index
]

### Estimated Actual Distribution of Countries by Continent, Worldwide

In [32]:
# Add the country name that pycountry_convert reports for each alpha-2 code.
country_and_continent['Country Name'] = [
    pc.country_alpha2_to_country_name(alpha_2)
    for alpha_2 in country_and_continent['Country']
]

In [33]:
# Add the continent name for each continent code; the 'non-transformable'
# placeholder is not a real code and is copied through unchanged.
country_and_continent['Continent Name'] = [
    code if code == 'non-transformable' else pc.convert_continent_code_to_continent_name(code)
    for code in country_and_continent['Continent']
]

In [34]:
# The string 'NA' is read back as null by many tools, so the
# North America continent code is stored as 'NoAm' instead.
country_and_continent['Continent'] = country_and_continent['Continent'].mask(
    country_and_continent['Continent'] == 'NA',
    'NoAm',
)

In [None]:
# Show the per-continent country counts (index relabelled with continent names).
total_by_continent

Unnamed: 0,Country
Africa,57
Antarctica,2
Asia,53
Europe,50
North America,40
Oceania,24
South America,15
non-transformable,8


In [36]:
# Alpha-2 codes of the North American countries that are counted as Latin American here.
Latin_in_NA = ['BZ', 'CR', 'CU', 'DO', 'SV', 'GT', 'HT', 'HN', 'JM', 'MX', 'NI', 'PA', 'LC']

# Row masks: hand-picked North American countries, plus everything on continent code 'SA'.
latin_in_NA_map = country_and_continent['Country'].isin(Latin_in_NA)
latin_in_SA_map = country_and_continent['Continent'].eq('SA')

# Blank out the non-matching rows, drop them, and renumber the remaining rows from 0.
latins = (
    country_and_continent
    .where(latin_in_SA_map | latin_in_NA_map)
    .dropna()
    .reset_index(drop=True)
)
latins.name = 'Latin Countries'
latins['Country Name'] = latins['Country'].apply(pc.country_alpha2_to_country_name)

### Latin American Group (Feature Engineering)

In [60]:
# Preview the first rows of the country/continent lookup table.
country_and_continent.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,NoAm,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,NoAm,Anguilla,North America
4,AX,EU,Åland Islands,Europe


In [37]:
# Preview the first rows of the Latin American subset.
latins.head()

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AR,SA,Argentina,South America
1,BZ,NoAm,Belize,North America
2,BO,SA,"Bolivia, Plurinational State of",South America
3,BR,SA,Brazil,South America
4,CL,SA,Chile,South America


In [39]:
# Share of rows per distinct nationality (normalize=True yields proportions, not counts).
# constituent_nationalities is a DataFrame, so each index entry of the result is a tuple.
la_artist_origin = constituent_nationalities.value_counts(normalize=True)

In [41]:
# DataFrame.value_counts() labels each row with a tuple (one element per column);
# unwrap the single element so the index holds plain nationality strings.
# The isinstance guard makes this cell safe to re-run: labels that are already
# plain strings are left alone instead of being cut down to their first character.
la_artist_origin.index = [
    label[0] if isinstance(label, tuple) else label
    for label in la_artist_origin.index
]

In [57]:
# Show the nationality shares.
# NOTE(review): the saved output has 'demonym' / 'pct_country_NGA' columns, which are only
# assigned in later cells — this cell was last executed out of order.
la_artist_origin

Unnamed: 0,demonym,pct_country_NGA
0,Mexican,0.596491
1,Brazilian,0.108187
2,Argentinean,0.078947
3,Guatemalan,0.076023
4,Chilean,0.061404
5,Venezuelan,0.046784
6,Cuban,0.008772
7,Peruvian,0.008772
8,Colombian,0.005848
9,Uruguayan,0.005848


In [42]:
# Demonym for each alpha-2 country code in `latins`.
# Keyed by code rather than assigned by row position, so the result stays correct
# even if the order or number of rows in `latins` changes.
demonym_by_alpha_2 = {
    'AR': 'Argentinean',
    'BZ': 'Belizean',
    'BO': 'Bolivian',
    'BR': 'Brazilian',
    'CL': 'Chilean',
    'CO': 'Colombian',
    'CR': 'Costa Rican',
    'CU': 'Cuban',
    'DO': 'Dominican',
    'EC': 'Ecuadorian',
    'FK': 'Falkland Islander',
    'GT': 'Guatemalan',
    'GF': 'Guianese',
    'GY': 'Guyanese',
    'HN': 'Honduran',
    'HT': 'Haitian',
    'JM': 'Jamaican',
    'LC': 'Saint Lucian',
    'MX': 'Mexican',
    'NI': 'Nicaraguan',
    'PA': 'Panamanian',
    'PE': 'Peruvian',
    'PY': 'Paraguayan',
    'GS': 'South Georgian',
    'SV': 'Salvadoran',
    'SR': 'Surinamese',
    'UY': 'Uruguayan',
    'VE': 'Venezuelan',
}

# Fail loudly (ValueError, as a length mismatch did before) if a country has no demonym yet.
codes_without_demonym = sorted(set(latins['Country']) - set(demonym_by_alpha_2))
if codes_without_demonym:
    raise ValueError(f'No demonym defined for country codes: {codes_without_demonym}')

latins['demonym'] = latins['Country'].map(demonym_by_alpha_2)

In [43]:
# Move the nationality labels out of the index into a regular column.
la_artist_origin = la_artist_origin.reset_index(drop=False)

In [44]:
# Name the two columns: the nationality label and its share of rows.
la_artist_origin = la_artist_origin.set_axis(
    ['demonym', 'pct_country_NGA'],
    axis='columns',
)

In [45]:
# Attach country/continent details to each nationality share; the inner join
# keeps only demonyms that appear in both tables.
la_geographicStatistics = la_artist_origin.merge(
    latins,
    how='inner',
    on='demonym',
)

### Output Latin American Art data present within the NGA database (Data Prep)

In [41]:
### Important: This file is used to add geographic data to other tables in DB

In [47]:
# Write the merged geographic statistics to CSV without the row index.
la_geographicStatistics.to_csv(
    '../../../data_samples/LaArt/la_geographicStatistics.csv',
    index=False,
)

In [59]:
# Show the merged table: nationality share plus country/continent details.
la_geographicStatistics

Unnamed: 0,demonym,pct_country_NGA,Country,Continent,Country Name,Continent Name
0,Mexican,0.596491,MX,NoAm,Mexico,North America
1,Brazilian,0.108187,BR,SA,Brazil,South America
2,Argentinean,0.078947,AR,SA,Argentina,South America
3,Guatemalan,0.076023,GT,NoAm,Guatemala,North America
4,Chilean,0.061404,CL,SA,Chile,South America
5,Venezuelan,0.046784,VE,SA,"Venezuela, Bolivarian Republic of",South America
6,Cuban,0.008772,CU,NoAm,Cuba,North America
7,Peruvian,0.008772,PE,SA,Peru,South America
8,Colombian,0.005848,CO,SA,Colombia,South America
9,Uruguayan,0.005848,UY,SA,Uruguay,South America


### Output data with links to view the art (the amount shrank because only a subset was uploaded to the internet / is publicly available) (Data Prep)

In [53]:
# Build a full-size image URL from the IIIF thumbnail URL by swapping the
# '!200,200' size segment for 'full'.
# The vectorised .str accessor leaves missing thumbnails as NaN instead of raising
# AttributeError, and regex=False makes the pattern a literal string.
latinamerican_art['expanded_url'] = latinamerican_art['iiifthumburl'].str.replace(
    '!200,200',
    'full',
    regex=False,
)

In [54]:
# Write the table, now including 'expanded_url', to CSV without the row index.
latinamerican_art.to_csv(
    '../../../data_samples/LaArt/latinamerican_art.csv',
    index=False,
)

In [55]:
# Final (rows, columns) check after the 'expanded_url' column was added.
latinamerican_art.shape

(342, 47)