In [33]:
import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import random

In [17]:
nonLa_art = pd.read_csv('../../data_samples/results/whole_set_results/pa_nonLa_art.csv')
la_geographicStatistics = pd.read_csv('../../data_samples/la_geographicStatistics.csv')

### Goal: Make a similar dataset to la_geographicStatistics but with nonLa art this time, then using this information to make a representative sample of the data according to continent, which we will use to download about 2000 images for the models.

In [18]:
cname_alpha_2 = []
cname_alpha_3 = []
for country in pycountry.countries:
    cname_alpha_2.append(country.alpha_2)
    cname_alpha_3.append(country.alpha_3)

cname_alpha_2 = pd.Series(cname_alpha_2)
cname_alpha_3 = pd.Series(cname_alpha_3)

error_list = ['AQ', 'TF', 'EH', 'PN', 'SX', 'TL', 'UM', 'VA']
continent_names = cname_alpha_2.apply(lambda x: pc.country_alpha2_to_continent_code(x) if x not in error_list else 'non-transformable')

country_and_continent = pd.DataFrame([cname_alpha_2, continent_names], index=['Country', 'Continent']).T

total_by_continent = country_and_continent.groupby('Continent').count()

remove_from_index = ['non-transformable']
total_by_continent.index = pd.Series(list(total_by_continent.index)).apply(lambda x: pc.convert_continent_code_to_continent_name(x) if x not in remove_from_index else x)

In [19]:
country_and_continent['Country Name'] = country_and_continent.Country.apply(pc.country_alpha2_to_country_name)
country_and_continent['Continent Name'] = country_and_continent.Continent.apply(lambda x: pc.convert_continent_code_to_continent_name(x) if x != 'non-transformable' else x)

In [50]:
country_and_continent

Unnamed: 0,Country,Continent,Country Name,Continent Name
0,AW,,Aruba,North America
1,AF,AS,Afghanistan,Asia
2,AO,AF,Angola,Africa
3,AI,,Anguilla,North America
4,AX,EU,Åland Islands,Europe
...,...,...,...,...
244,WS,OC,Samoa,Oceania
245,YE,AS,Yemen,Asia
246,ZA,AF,South Africa,Africa
247,ZM,AF,Zambia,Africa


### Proportion of Countries per Continent
Real vs Non-LA Dataset Distributions

In [21]:
total_by_continent['proportion'] = total_by_continent['Country'] / total_by_continent['Country'].sum()

In [32]:
total_by_continent

Unnamed: 0,Country,proportion
Africa,57,0.228916
Antarctica,2,0.008032
Asia,53,0.212851
Europe,50,0.200803
North America,40,0.160643
Oceania,24,0.096386
South America,15,0.060241
non-transformable,8,0.032129


In [23]:
la_geographicStatistics['Continent'] = la_geographicStatistics['Continent'].fillna('NoA')

In [24]:
la_continentCounts = la_geographicStatistics.groupby('Continent').apply(lambda x: len(x))

In [25]:
la_continentCounts.index = ['North America', 'South America']
# 8 non-transformable country/continent pairs during labeling added to zero the south america group
la_continentCounts['South America'] += 8

In [26]:
la_continentCounts = la_continentCounts.append(pd.Series({'Africa': 0, 'Antarctica': 0, 'Asia': 0, 'Europe':0, 'Oceania':0, 'non-transformable':0}))

In [27]:
la_continentCounts = la_continentCounts.reindex(total_by_continent.index)
la_continentCounts.name = 'Country'

In [89]:
nonla_countryCounts = total_by_continent['Country'] - la_continentCounts

In [90]:
nonla_countryCounts = pd.DataFrame({'Countries':nonla_countryCounts, 'proportion': (nonla_countryCounts / nonla_countryCounts.sum())})

In [93]:
elementwise_probabilities = pd.concat([nonla_countryCounts, nonLa_art.Continent.value_counts()], axis = 1).dropna(how='any')

In [94]:
elementwise_probabilities

Unnamed: 0,Countries,proportion,Continent
Africa,57,0.247826,49.0
Asia,53,0.230435,828.0
Europe,50,0.217391,50910.0
North America,36,0.156522,145665.0
Oceania,24,0.104348,78.0
South America,0,0.0,54.0


In [95]:
elementwise_probabilities.columns = ['countries', 'proportion', 'numRows']

In [97]:
elementwise_probabilities['prob_per_entry'] = elementwise_probabilities['proportion'] / elementwise_probabilities['numRows']

In [113]:
percent_from_nonLaContinent = nonLa_art['Continent'].apply(lambda x: elementwise_probabilities.prob_per_entry[x])

In [114]:
missing_la_percent = 1 - percent_from_nonLaContinent.sum()

In [117]:
percent_from_nonLaContinent += missing_la_percent / len(percent_from_nonLaContinent)

In [121]:
#1580 rows for training (0.008%) & 396 (0.002%) for validation/testing == 1976 for one iteration of sample (1/100 of total dataset) // make sure not being selected with replacement to remove duplicate issue resolved in download_nonLa_art (sampling only for nonLa)
matching_distribution = np.random.choice(nonLa_art.index, p=percent_from_nonLaContinent, size=1976, replace=False)

In [123]:
index_matching = list(matching_distribution)

In [125]:
nonLa_art_subsample = nonLa_art.iloc[index_matching, :]

In [126]:
subsample_countryCounts = nonLa_art_subsample.groupby('Continent').apply(lambda x: len(x))

In [127]:
subsample_countryCounts

Continent
Africa           480
Asia             453
Europe           469
North America    384
Oceania          190
dtype: int64

In [128]:
subsample_countryCounts = pd.DataFrame({'counts': subsample_countryCounts,'proportion':subsample_countryCounts / subsample_countryCounts.sum()})

### Using the probabilities in percent_from_nonLaContinent (with minor correction due to removing the la art causing probabilities to not sum to 1) made the sample about equally representative to overall dataset

In [129]:
subsample_countryCounts

Unnamed: 0_level_0,counts,proportion
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,480,0.242915
Asia,453,0.229251
Europe,469,0.237348
North America,384,0.194332
Oceania,190,0.096154


In [130]:
nonla_countryCounts

Unnamed: 0,Countries,proportion
Africa,57,0.247826
Antarctica,2,0.008696
Asia,53,0.230435
Europe,50,0.217391
North America,36,0.156522
Oceania,24,0.104348
South America,0,0.0
non-transformable,8,0.034783


### Downloading the art and dropping into non_laImages folder TBD in download_nonLA_art notebook!

In [132]:
nonLa_art_subsample.to_csv('../../data_samples/results/processed_subset_results/nonLa_art_sample.csv', index=False)