## Gapminder Project

#### Import libraries

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import imageio

#### Dataset Readout

In [3]:
# Dataframe for Fertility
fert = pd.read_csv('/Users/andrea/Desktop/Final_Projects/Week1/data/gapminder_total_fertility.csv', index_col=0)
fert.head(5)

Unnamed: 0_level_0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Total fertility rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,6.7,6.46,6.2,5.93,5.66,5.4,5.14,4.9,4.68,4.47
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,...,1.85,1.8,1.76,1.74,1.74,1.75,1.76,1.77,1.78,1.78
Algeria,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,...,2.58,2.66,2.73,2.78,2.82,2.83,2.82,2.8,2.76,2.71


In [4]:
# Dataframe for Life Expectancy
life = pd.read_excel('/Users/andrea/Desktop/Final_Projects/Week1/data/gapminder_lifeexpectancy.xlsx', index_col=0)
#life.head(5)

In [5]:
# Dataframe for Population
pop = pd.read_excel('/Users/andrea/Desktop/Final_Projects/Week1/data/gapminder_population.xlsx', index_col=0)
#popl.head(5)

In [6]:
# Dataframe for Continents
cont = pd.read_csv('/Users/andrea/Desktop/Final_Projects/Week1/data/continents.csv', sep=';')
#cont.head(5)

#### Data Inspection

In [7]:
print("Fertility dataframe shape:", fert.shape)
print("Life Expectancy dataframe shape:", life.shape)
print("Population dataframe shape:", pop.shape)
print("Continents dataframe shape:", cont.shape)

Fertility dataframe shape: (260, 216)
Life Expectancy dataframe shape: (260, 217)
Population dataframe shape: (275, 81)
Continents dataframe shape: (194, 2)


#### Data Wrangling: Fertility 

In [8]:
fert.columns = fert.columns.astype('int64')
fert.columns

Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015],
           dtype='int64', length=216)

In [9]:
fert.index.name = 'country'
fert.index

Index(['Abkhazia', 'Afghanistan', 'Akrotiri and Dhekelia', 'Albania',
       'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua and Barbuda',
       ...
       'Vietnam', 'Virgin Islands (U.S.)', 'North Yemen (former)',
       'South Yemen (former)', 'Yemen', 'Yugoslavia', 'Zambia', 'Zimbabwe',
       'Åland', 'Åland'],
      dtype='object', name='country', length=260)

In [10]:
fert = fert.reset_index()
fert = fert.melt(id_vars=['country'], var_name='year', value_name='fertility rate')

In [11]:
fert.sample(5)

Unnamed: 0,country,year,fertility rate
43479,Denmark,1967,2.35
50583,Mayotte,1994,4.93
38958,Sweden,1949,2.37
30865,Russia,1918,5.72
25200,United States,1896,4.03


#### Data Wrangling: Life Expectancy

In [12]:
life.columns = life.columns.astype('int64')
life.columns

Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016],
           dtype='int64', length=217)

In [13]:
life.index.name = 'country'
life.index

Index(['Abkhazia', 'Afghanistan', 'Akrotiri and Dhekelia', 'Albania',
       'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua and Barbuda',
       ...
       'Vietnam', 'Virgin Islands (U.S.)', 'North Yemen (former)',
       'South Yemen (former)', 'Yemen', 'Yugoslavia', 'Zambia', 'Zimbabwe',
       'Åland', 'South Sudan'],
      dtype='object', name='country', length=260)

In [14]:
life.head

<bound method NDFrame.head of                         1800   1801   1802   1803   1804   1805   1806   1807  \
country                                                                         
Abkhazia                 NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
Afghanistan            28.21  28.20  28.19  28.18  28.17  28.16  28.15  28.14   
Akrotiri and Dhekelia    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
Albania                35.40  35.40  35.40  35.40  35.40  35.40  35.40  35.40   
Algeria                28.82  28.82  28.82  28.82  28.82  28.82  28.82  28.82   
...                      ...    ...    ...    ...    ...    ...    ...    ...   
Yugoslavia               NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN   
Zambia                 32.60  32.60  32.60  32.60  32.60  32.60  32.60  32.60   
Zimbabwe               33.70  33.70  33.70  33.70  33.70  33.70  33.70  33.70   
Åland                    NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  

In [15]:
life = life.reset_index()
life = life.melt(id_vars=['country'], var_name='year', value_name='life expectancy')

In [16]:
life.sample(5)

Unnamed: 0,country,year,life expectancy
5179,United Kingdom,1819,40.52
28417,French Guiana,1909,30.5
10786,Liberia,1841,31.1
22849,Transnistria,1887,
49131,Virgin Islands (U.S.),1988,74.09


#### Data Wrangling: Population

In [17]:
pop.columns = pop.columns.astype('int64')
pop.columns

Int64Index([1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900,
            1910, 1920, 1930, 1940, 1950, 1951, 1952, 1953, 1954, 1955, 1956,
            1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
            1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
            1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
            1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
            2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
            2012, 2013, 2014, 2015],
           dtype='int64')

In [18]:
pop.index.name = 'country'
pop.index

Index(['Abkhazia', 'Afghanistan', 'Akrotiri and Dhekelia', 'Albania',
       'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua and Barbuda',
       ...
       'British Indian Ocean Territory', 'Clipperton',
       'French Southern and Antarctic Lands', 'Gaza Strip',
       'Heard and McDonald Islands', 'Northern Marianas',
       'South Georgia and the South Sandwich Islands',
       'US Minor Outlying Islands', 'Virgin Islands', 'West Bank'],
      dtype='object', name='country', length=275)

In [19]:
pop = pop.reset_index()
pop = pop.melt(id_vars=['country'], var_name='year', value_name='population')

In [20]:
pop.sample(5)

Unnamed: 0,country,year,population
803,South Yemen (former),1820,
19213,United Arab Emirates,2004,3975945.0
13780,British Virgin Islands,1985,13308.0
15755,Gambia,1992,979701.0
1343,Uzbekistan,1840,2308592.0


In [21]:
pop.shape

(22275, 3)

#### Data Wrangling: Merging Dataset

In [22]:
df = fert
df = df.merge(life)
df = df.merge(pop)
df = df.merge(cont)
df.sample(5)

Unnamed: 0,country,year,fertility rate,life expectancy,population,continent
10448,Qatar,2015,1.98,79.7,2235355.0,Asia
1777,Bosnia and Herzegovina,2011,1.26,78.2,3832310.0,Europe
11027,Serbia,1910,5.23,35.5,4726618.0,Europe
990,Bahrain,1953,6.95,43.26,123593.0,Asia
3518,Djibouti,1970,6.8,51.75,159667.0,Africa


In [23]:
df.shape

(14175, 6)

In [24]:
df.describe()

Unnamed: 0,fertility rate,life expectancy,population
count,13446.0,13524.0,14175.0
mean,4.629932,57.230718,21950500.0
std,2.005239,15.9343,94556130.0
min,1.13,4.0,1834.0
25%,2.61,44.6375,633236.0
50%,5.01,60.615,3628079.0
75%,6.46,70.7425,10893500.0
max,9.22,84.8,1376049000.0


In [25]:
# Add a flag column to be used later for plotting
it_mask = df['country'] == 'Italy'
de_mask = df['country'] == 'Germany'
ch_mask = df['country'] == 'China'
tot_mask = it_mask | de_mask | ch_mask
df['flag'] = tot_mask

df.loc[df["flag"] == False, ["flag"]] = ''

mask_it = (df['flag'] == True) & (df['country'] == 'Italy')
mask_de = (df['flag'] == True) & (df['country'] == 'Germany')
mask_ch = (df['flag'] == True) & (df['country'] == 'China')
df.loc[mask_it, 'flag'] = 'Italy'
df.loc[mask_de, 'flag'] = 'Germany'
df.loc[mask_ch, 'flag'] = 'China'

#df.head()

Unnamed: 0,country,year,fertility rate,life expectancy,population,continent,flag
0,Afghanistan,1800,7.0,28.21,3280000.0,Asia,
1,Afghanistan,1810,7.0,28.11,3280000.0,Asia,
2,Afghanistan,1820,7.0,28.01,3323519.0,Asia,
3,Afghanistan,1830,7.0,27.9,3448982.0,Asia,
4,Afghanistan,1840,7.0,27.8,3625022.0,Asia,


#### Plotting 

In [None]:
# Example Scatterplot for year 2000
year = 2000
df_yearly = df.loc[df['year'] ==year]
df_yearly.sample(5)

In [None]:
gp = sns.scatterplot(x='life expectancy',
                y='fertility rate',
                data=df_yearly,
                size='population',
                sizes=((10, 1000)),
                hue='continent',
                alpha=0.6)

plt.title('Life expectancy per country')
plt.xlabel('life expectancy [year]')
plt.ylabel('fertility rate [child]')
plt.axis([20, 100, 0, 10])

f-two-legends-from-seaborn-scatterplot
h,l = gp.get_legend_handles_labels()
leg = plt.legend(h[0:7],l[0:7],bbox_to_anchor=(0.75, 1), loc=2, frameon=False, fontsize=8)
leg.get_texts()[4].set_text('N. America')
leg.get_texts()[5].set_text('S. America')
leg.get_texts()[6].set_text('Aus. & Oc.')
#plt.gca().get_legend().remove()

x_min = plt.gca().get_xlim()[0]
y_min = plt.gca().get_ylim()[0]

for idx, row in df_yearly.iterrows():
    plt.gca().annotate(row['flag'], (row['life expectancy'], row['fertility rate']), fontsize=8)

plt.text(x_min+5, y_min+1, str(year), fontsize='large', weight='semibold', color='black')

In [None]:
# Plotting over years

for yy in range(1960,2016,1):
    df_yearly = df.loc[df['year'] == yy]    
    gp = sns.scatterplot(x='life expectancy',
                    y='fertility rate',
                    data=df_yearly,
                    size='population',
                    sizes=((10, 1000)),
                    hue='continent',
                    alpha=0.6)
    plt.title('Life expectancy per country')
    plt.xlabel('life expectancy [year]')
    plt.ylabel('fertility rate [child]')
    plt.axis([20, 100, 0, 10])
    h,l = gp.get_legend_handles_labels()
    leg = plt.legend(h[0:7],l[0:7],bbox_to_anchor=(0.75, 1), loc=2, frameon=False, fontsize=8)
    leg.get_texts()[4].set_text('N. America')
    leg.get_texts()[5].set_text('S. America')
    leg.get_texts()[6].set_text('Aus. & Oc.')
    x_min = plt.gca().get_xlim()[0]
    y_min = plt.gca().get_ylim()[0]
    for idx, row in df_yearly.iterrows():
        plt.gca().annotate(row['flag'], (row['life expectancy'], row['fertility rate']))
    plt.text(x_min+5, y_min+1, str(yy), fontsize='large', weight='semibold', color='black')
    plt.text(x_min+67, y_min+0.2, fontsize='xx-small', weight='normal', style='italic', color='black')
    plt.savefig('DF')
    plt.close()

In [None]:
images = []

for i in range(1960,2016,1):
     filename = 'DF.png'.format(i)
     images.append(imageio.imread(filename))

imageio.mimsave('../output/gapminder_output.gif', images, fps=20)

In [None]:
from IPython.display import Image
#Image('../output/gapminder_output.gif', width = 500, height = 300)
Image('../output/gapminder_output.gif')