## More with dataframes

In [1]:
import pandas as pd

In [2]:
# Load the European GDP-per-capita table, using the 'country' column as the row index
data = pd.read_csv("gapminder_gdp_europe.csv", index_col="country")

In [3]:
# Print the frame to check that it loaded with 'country' as the row index
print(data)

                        gdpPercap_1952  gdpPercap_1957  gdpPercap_1962  \
country                                                                  
Albania                    1601.056136     1942.284244     2312.888958   
Austria                    6137.076492     8842.598030    10750.721110   
Belgium                    8343.105127     9714.960623    10991.206760   
Bosnia and Herzegovina      973.533195     1353.989176     1709.683679   
Bulgaria                   2444.286648     3008.670727     4254.337839   
Croatia                    3119.236520     4338.231617     5477.890018   
Czech Republic             6876.140250     8256.343918    10136.867130   
Denmark                    9692.385245    11099.659350    13583.313510   
Finland                    6424.519071     7545.415386     9371.842561   
France                     7029.809327     8662.834898    10560.485530   
Germany                    7144.114393    10187.826650    12902.462910   
Greece                     3530.690067

In [4]:
# Positional lookup: .iloc takes zero-based [row, column] integer positions,
# so [0, 0] is the top-left cell of the table.
data_cell = data.iloc[0, 0]
print(data_cell)

1601.056136


In [5]:
# Label lookup: .loc takes the row (index) label and the column name,
# which is often more straightforward than counting integer positions.
data_cell = data.loc["Finland", "gdpPercap_1957"]
print(data_cell)

7545.415386


In [6]:
# A ':' in the column position selects every column, so this extracts
# the whole row for Sweden as a Series indexed by column name.
print(data.loc["Sweden", :])

gdpPercap_1952     8527.844662
gdpPercap_1957     9911.878226
gdpPercap_1962    12329.441920
gdpPercap_1967    15258.296970
gdpPercap_1972    17832.024640
gdpPercap_1977    18855.725210
gdpPercap_1982    20667.381250
gdpPercap_1987    23586.929270
gdpPercap_1992    23880.016830
gdpPercap_1997    25266.594990
gdpPercap_2002    29341.630930
gdpPercap_2007    33859.748350
Name: Sweden, dtype: float64


In [7]:
# Likewise, a ':' in the row position selects every row, so this extracts
# the whole 'gdpPercap_1977' column as a Series indexed by country.
print(data.loc[:, "gdpPercap_1977"])

country
Albania                    3533.003910
Austria                   19749.422300
Belgium                   19117.974480
Bosnia and Herzegovina     3528.481305
Bulgaria                   7612.240438
Croatia                   11305.385170
Czech Republic            14800.160620
Denmark                   20422.901500
Finland                   15605.422830
France                    18292.635140
Germany                   20512.921230
Greece                    14195.524280
Hungary                   11674.837370
Iceland                   19654.962470
Ireland                   11150.981130
Italy                     14255.984750
Montenegro                 9595.929905
Netherlands               21209.059200
Norway                    23311.349390
Poland                     9508.141454
Portugal                  10172.485720
Romania                    9356.397240
Serbia                    12980.669560
Slovak Republic           10922.664040
Slovenia                  15277.030170
Spain            

In [8]:
# Slice the GDP columns for 1962 through 1972, for the rows Italy through Poland.
# Label slices with .loc are inclusive: both end labels are part of the result.
data_cells = data.loc["Italy":"Poland", "gdpPercap_1962":"gdpPercap_1972"]

In [9]:
# Show the sliced rows and columns
print(data_cells)

             gdpPercap_1962  gdpPercap_1967  gdpPercap_1972
country                                                    
Italy           8243.582340    10022.401310    12269.273780
Montenegro      4649.593785     5907.850937     7778.414017
Netherlands    12790.849560    15363.251360    18794.745670
Norway         13450.401510    16361.876470    18965.055510
Poland          5338.752143     6557.152776     8006.506993


In [10]:
# A slice is itself a DataFrame, so further operations can be chained onto it;
# .max() reports the largest value in each column of the slice.
print(data.loc["Italy":"Poland", "gdpPercap_1962":"gdpPercap_1972"].max())

gdpPercap_1962    13450.40151
gdpPercap_1967    16361.87647
gdpPercap_1972    18965.05551
dtype: float64


In [11]:
# .min() reports the smallest value in each column of the same slice
print(data.loc["Italy":"Poland", "gdpPercap_1962":"gdpPercap_1972"].min())

gdpPercap_1962    4649.593785
gdpPercap_1967    5907.850937
gdpPercap_1972    7778.414017
dtype: float64


In [12]:
# Keep the Italy-to-Poland, 1962-to-1972 slice under a name so later cells can reuse it
subset = data.loc["Italy":"Poland", "gdpPercap_1962":"gdpPercap_1972"]
print("Subset of data is:\n", subset)

Subset of data is:
              gdpPercap_1962  gdpPercap_1967  gdpPercap_1972
country                                                    
Italy           8243.582340    10022.401310    12269.273780
Montenegro      4649.593785     5907.850937     7778.414017
Netherlands    12790.849560    15363.251360    18794.745670
Norway         13450.401510    16361.876470    18965.055510
Poland          5338.752143     6557.152776     8006.506993


In [13]:
# Element-wise comparison: yields a Boolean frame with the same shape as 'subset',
# True wherever the value is greater than or equal to 10000.
# The printed label says "or equal to" so that it matches the >= operator used.
print("Values greater than or equal to 10000\n", subset >= 10000)

Values greater than 10000
              gdpPercap_1962  gdpPercap_1967  gdpPercap_1972
country                                                    
Italy                 False            True            True
Montenegro            False           False           False
Netherlands            True            True            True
Norway                 True            True            True
Poland                False           False           False


In [14]:
# A frame of Boolean values like this is called a mask:
# it is True wherever the corresponding value in 'subset' is >= 10000
mask = subset >= 10000

In [15]:
# Index 'subset' with the Boolean mask: cells where the mask is True keep
# their value, and cells where it is False are shown as NaN (not a number).
print(subset[mask])

             gdpPercap_1962  gdpPercap_1967  gdpPercap_1972
country                                                    
Italy                   NaN     10022.40131     12269.27378
Montenegro              NaN             NaN             NaN
Netherlands     12790.84956     15363.25136     18794.74567
Norway          13450.40151     16361.87647     18965.05551
Poland                  NaN             NaN             NaN


In [16]:
# NaN entries are skipped by statistical operations such as .describe(),
# so 'count' reports only the values that passed the >= 10000 filter.
print(subset[subset >= 10000].describe())

       gdpPercap_1962  gdpPercap_1967  gdpPercap_1972
count        2.000000        3.000000        3.000000
mean     13120.625535    13915.843047    16676.358320
std        466.373656     3408.589070     3817.597015
min      12790.849560    10022.401310    12269.273780
25%      12955.737547    12692.826335    15532.009725
50%      13120.625535    15363.251360    18794.745670
75%      13285.513523    15862.563915    18879.900590
max      13450.401510    16361.876470    18965.055510


In [17]:
# Exercise: look up Serbia's GDP per capita in 2007 by row and column label
print(data.loc["Serbia", "gdpPercap_2007"])

9786.534714


In [18]:
# Exercise: compare positional and label slicing.
# .iloc slices exclude the end position, so 0:2 gives two rows and two columns.
print(data.iloc[0:2, 0:2])
# .loc slices include the end label, so Belgium and gdpPercap_1962 are kept,
# giving three rows and three columns.
print(data.loc["Albania":"Belgium", "gdpPercap_1952":"gdpPercap_1962"])

         gdpPercap_1952  gdpPercap_1957
country                                
Albania     1601.056136     1942.284244
Austria     6137.076492     8842.598030
         gdpPercap_1952  gdpPercap_1957  gdpPercap_1962
country                                                
Albania     1601.056136     1942.284244     2312.888958
Austria     6137.076492     8842.598030    10750.721110
Belgium     8343.105127     9714.960623    10991.206760


In [20]:
# For each column, report the index label (country) of the row holding the smallest value
print(data.idxmin())

gdpPercap_1952    Bosnia and Herzegovina
gdpPercap_1957    Bosnia and Herzegovina
gdpPercap_1962    Bosnia and Herzegovina
gdpPercap_1967    Bosnia and Herzegovina
gdpPercap_1972    Bosnia and Herzegovina
gdpPercap_1977    Bosnia and Herzegovina
gdpPercap_1982                   Albania
gdpPercap_1987                   Albania
gdpPercap_1992                   Albania
gdpPercap_1997                   Albania
gdpPercap_2002                   Albania
gdpPercap_2007                   Albania
dtype: object


In [21]:
# For each column, report the index label (country) of the row holding the largest value
print(data.idxmax())

gdpPercap_1952    Switzerland
gdpPercap_1957    Switzerland
gdpPercap_1962    Switzerland
gdpPercap_1967    Switzerland
gdpPercap_1972    Switzerland
gdpPercap_1977    Switzerland
gdpPercap_1982    Switzerland
gdpPercap_1987         Norway
gdpPercap_1992         Norway
gdpPercap_1997         Norway
gdpPercap_2002         Norway
gdpPercap_2007         Norway
dtype: object
