# Aquire and join data soureces

In [1]:
%matplotlib inline
import pandas as pd

In [20]:
drinks = pd.read_csv('national-drinking-habits.csv')

### Note: if no header you would not have columns

In [16]:
wiki_page = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
dfs = pd.read_html(wiki_page, header=0, index_col=0)

In [5]:
len(dfs)

4

In [6]:
dfs[0].shape

(1, 2)

In [9]:
dfs[2].shape

(251, 6)

In [17]:
dfs[2]

Unnamed: 0_level_0,Country (or dependent territory),Population,Date,% of world population,Source
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,China[Note 2],1378640000,"September 8, 2016",18.8%,Official population clock
2,India,1330730000,"September 8, 2016",18.1%,Official population clock
3,United States[Note 3],324429000,"September 8, 2016",4.42%,Official population clock
4,Indonesia,260581000,"July 1, 2016",3.55%,Official projection
5,Brazil,206625000,"September 8, 2016",2.81%,Official population clock
6,Pakistan,194196000,"September 8, 2016",2.64%,Official population clock
7,Nigeria,186988000,"July 1, 2016",2.55%,UN Projection
8,Bangladesh,161047000,"September 8, 2016",2.19%,Official population clock
9,Russia[Note 4],146640000,"May 1, 2016",2%,Official estimate
10,Mexico,128632000,"July 1, 2016",1.75%,Official projection


In [36]:
ugly_country_names = dfs[2]['Country (or dependent territory)']

In [48]:
populations = dfs[2].copy()
populations['Name'] = ugly_country_names.str.extract("([^[(]*)").str.strip()

  from ipykernel import kernelapp as app


In [52]:
populations[populations.Name == 'China'].Population.median()

1378640000.0

In [58]:
mdf = drinks.merge(populations, left_on='country', right_on='Name')
mdf

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent,Country (or dependent territory),Population,Date,% of world population,Source,Name
0,Afghanistan,0,0,0,0.0,AS,Afghanistan,27657145,"July 1, 2016",0.38%,Annual official estimate,Afghanistan
1,Albania,89,132,54,4.9,EU,Albania,2886026,"January 1, 2016",0.039%,Annual official estimate,Albania
2,Algeria,25,0,14,0.7,AF,Algeria,40400000,"July 1, 2016",0.55%,Official annual projection,Algeria
3,Andorra,245,138,312,12.4,EU,Andorra,78014,"December 31, 2015",0.0011%,Annual official estimate,Andorra
4,Angola,217,57,45,5.9,AF,Angola,24383301,"May 16, 2014",0.33%,Preliminary 2014 census result,Angola
5,Argentina,193,25,221,8.3,SA,Argentina,43590400,"July 1, 2016",0.59%,Official annual projection,Argentina
6,Armenia,21,179,11,3.8,EU,Armenia,2995100,"July 1, 2016",0.041%,Quarterly official estimate,Armenia
7,Australia,261,72,212,10.4,OC,Australia,24182100,"September 8, 2016",0.329%,Official population clock,Australia
8,Austria,279,75,191,9.7,EU,Austria,8741753,"July 1, 2016",0.119%,Quarterly provisional figure,Austria
9,Azerbaijan,21,46,5,1.3,EU,Azerbaijan,9755500,"July 1, 2016",0.13%,Official estimate,Azerbaijan


In [59]:
mdf.country.count()

177

In [60]:
drinks.country.count()

193

In [68]:
drinks_set = set(drinks.country)
mdf_set = set(mdf.country)
drinks_set - mdf_set

{'Antigua & Barbuda',
 'Bosnia-Herzegovina',
 'Cabo Verde',
 'Congo',
 "Cote d'Ivoire",
 'DR Congo',
 'Gambia',
 'Micronesia',
 'Russian Federation',
 'Sao Tome & Principe',
 'St. Kitts & Nevis',
 'St. Lucia',
 'St. Vincent & the Grenadines',
 'Timor-Leste',
 'Trinidad & Tobago',
 'USA'}

# DBs disagree about the names of countries. We have three options:

## Option 1 | Repair after the fact
we will use and outer join instead of an inner join.

In [71]:
animals = pd.DataFrame(
    data = {
        'name': ['Chong', 'Alex', 'Byron', 'James'],
        'fav_animal': ['frog', 'panda', 'dog', 'dog']
    }
)

In [72]:
animals

Unnamed: 0,fav_animal,name
0,frog,Chong
1,panda,Alex
2,dog,Byron
3,dog,James


In [74]:
fav_colors = pd.DataFrame(
    data = {
        'name': ['Guarav', 'James', 'Chong', 'Alex'],
        'color': ['Green', 'Blue', 'Orange', 'Blue']
    }
)

fav_colors

Unnamed: 0,color,name
0,Green,Guarav
1,Blue,James
2,Orange,Chong
3,Blue,Alex


In [76]:
animals.merge(fav_colors, on='name', how='left')

Unnamed: 0,fav_animal,name,color
0,frog,Chong,Orange
1,panda,Alex,Blue
2,dog,Byron,
3,dog,James,Blue


In [89]:
drinks_outer = drinks.merge(populations,
                           left_on='country',
                           right_on='Name',
                           how='left').copy()

In [90]:
len(drinks)

193

In [99]:
set(drinks_outer[drinks_outer.Population.isnull()].country)

{'Antigua & Barbuda',
 'Bosnia-Herzegovina',
 'Cabo Verde',
 'Congo',
 "Cote d'Ivoire",
 'DR Congo',
 'Gambia',
 'Micronesia',
 'Russian Federation',
 'Sao Tome & Principe',
 'St. Kitts & Nevis',
 'St. Lucia',
 'St. Vincent & the Grenadines',
 'Timor-Leste',
 'Trinidad & Tobago',
 'USA'}

In [100]:
mappings = {'Congo': 'DRC', 'USA': 'United States'}

In [None]:
##########What to do here???????

In [None]:
######Not sure if I follow this example

In [92]:
populations.Name[populations.Name.str.startswith('Antigua')]

Rank
184    Antigua and Barbuda
Name: Name, dtype: object

In [95]:
populations.Population[populations.Name.str.startswith('Antigua')].median()

86295.0

In [97]:
antiqua_pop = populations.Population[populations.Name.str.startswith('Antigua')].median()
drinks_outer[drinks_outer.country.str.startswith('Antigua')].Population = antiqua_pop

In [None]:
######Not sure if I follow this example

## Option 2 | Fix the drinks table

In [101]:
def cor_wrong_drinks_name(x):
    if x in mappings:
        retrun mappings[x]
    return x

SyntaxError: invalid syntax (<ipython-input-101-2ce910509c60>, line 3)

## Option 3 | Fix the Wikipedia table