# Introduction to Pandas
* de facto data analysis tool in Python
* developed by Wes McKinney
* Pandas = "__Pan__el" + "__Da__ta"

In [7]:
import pandas as pd
import numpy as np

AttributeError: module 'pandas' has no attribute 'core'

## Fundamental __`pandas`__ data structures

### `Series` objects in pandas

In [10]:
import pandas as pd
import numpy as np

In [11]:
series_example = pd.Series([-0.5, 0.75, 1.0, -2])
series_example

0   -0.50
1    0.75
2    1.00
3   -2.00
dtype: float64

In [12]:
series_example.values

array([-0.5 ,  0.75,  1.  , -2.  ])

In [13]:
series_example.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
series_example[1]

0.75

In [15]:
series_example[1:3]

1    0.75
2    1.00
dtype: float64

### Explicit Indices

In [16]:
# change the row headers (index) to hold other names
popularity = pd.Series([17.25, 16.09, 10.31, 6.20, 4.80], index=['Java', 'C', 'Python', 'C++', 'C#'])
popularity

Java      17.25
C         16.09
Python    10.31
C++        6.20
C#         4.80
dtype: float64

In [17]:
popularity['Python']

10.31

### Exercise

In [22]:
# There are multiple ways to do indexing
# Do explicit Series indices work *exactly* the way you might expect?
# Try slicing popularity using its explicit index and find out.


## Indexers: `loc` and `iloc`
* __`loc`__ = location
* __`iloc`__ = integer index

**Think, Pair, Share**

In [23]:
popularity.loc['Python'] # you can ask for an explicit index - I want the index "Python"

10.31

In [39]:
# we might want to preserve the original order of the data set
popularity = pd.Series([17.25, 16.09, 10.31, 6.20, 4.80, 5.40], index=['4', 5, '4', '1', '2', 4])
popularity

4    17.25
5    16.09
4    10.31
1     6.20
2     4.80
4     5.40
dtype: float64

In [42]:
popularity.loc['4']

4    17.25
4    10.31
dtype: float64

In [44]:
popularity.iloc[4]

4.8

In [31]:
# loc = location, what we named
# iloc = integer index, what the actual index is

In [25]:
popularity.loc['Java':'Python'] # in both cases, I'm specifying an index which is a string

Java      17.25
C         16.09
Python    10.31
dtype: float64

**Share**

In [26]:
popularity.iloc[0] # give me the first value in the series. Even though the indices are different, we can use numerical indices.

17.25

In [27]:
popularity.iloc[:2]

Java    17.25
C       16.09
dtype: float64

## Series vs. Dictionary

**Think, Pair, Share** 

In [45]:
# here is a python dictionary of countries mapped to population number
population_dict = {'France': 65429495,
                   'Germany': 82408706,
                   'Russia': 143910127,
                   'Japan': 126922333}
population_dict

{'France': 65429495,
 'Germany': 82408706,
 'Japan': 126922333,
 'Russia': 143910127}

In [47]:
population = pd.Series(population_dict)
population

France      65429495
Germany     82408706
Japan      126922333
Russia     143910127
dtype: int64

### Interacting with Series

In [48]:
population['Russia']

143910127

### Exercise

In [51]:
# Try slicing on the population Series on your own.
# Would slicing be possible if Series keys were not ordered?

population['France':'Japan']
# we can replicate the order from the original series, because the series itself has an order, like a dictionary
# the slicing matches the order we would expect

France      65429495
Germany     82408706
Japan      126922333
dtype: int64

In [52]:
population.values # to get to the actual data - Panda keeps an array

array([ 65429495,  82408706, 126922333, 143910127,   2937590])

In [53]:
population.values, population.index # the index is different than the array

(array([ 65429495,  82408706, 126922333, 143910127,   2937590]),
 Index(['France', 'Germany', 'Japan', 'Russia', 'Albania'], dtype='object'))

In [54]:
# Try running population['Albania'] = 2937590 (or another country of your choice)
# What order do the keys appear in when you run population? Is it what you expected?
population['Albania'] = 2937590

In [55]:
population

France      65429495
Germany     82408706
Japan      126922333
Russia     143910127
Albania      2937590
dtype: int64

In [56]:
pop2 = pd.Series({'Spain': 46432074, 'France': 102321, 'Albania': 50532}) # add more items to the dictionary
# for keys that appear in , the sum is undefined
# we only get values that appear in both series
# if they appear in one serie but not the other - they'll appear but we won't know what they are (we'll get NaN = Not a Number)
# there are NaNs only in floats, not in integers 
# Panda doesn't crash with missing data
population + pop2

Albania     2988122.0
France     65531816.0
Germany           NaN
Japan             NaN
Russia            NaN
Spain             NaN
dtype: float64

In [57]:
# let's make a dictionary in Python 3
# order - do the items in the order I inserted them (we shouldn't rely in this, bc someone else might use a different python order)
## Panda indeed doesn't rely on dictionaries, so it won't be affected
dictionary_example = {'Carmel':5.4, 'Yarden':5.2, 'Lior':5.2}
dictionary_example 

{'Carmel': 5.4, 'Lior': 5.2, 'Yarden': 5.2}

In [60]:
%%python2 
## does not retain the order of the dictionary
dictionary_example = {'Carmel':5.4, 'Yarden':5.2, 'Lior':5.2}
print(dictionary_example)

{'Lior': 5.2, 'Carmel': 5.4, 'Yarden': 5.2}


## `DataFrame` object in pandas

In [None]:
## series is a glorified list/array, since it has indices

In [61]:
area_dict = {'Albania': 28748,
             'France': 643801,
             'Germany': 357386,
             'Japan': 377972,
             'Russia': 17125200}
area = pd.Series(area_dict)
area

Albania       28748
France       643801
Germany      357386
Japan        377972
Russia     17125200
dtype: int64

In [66]:
# in the DataFrame method, we're passing a dictionary that maps column headers to the dictionary
# we get a two-dimensional table, or spreadsheet
countries = pd.DataFrame({'Population': population, 'Area': area})
countries

Unnamed: 0,Area,Population
Albania,28748,2937590
France,643801,65429495
Germany,357386,82408706
Japan,377972,126922333
Russia,17125200,143910127


In [63]:
countries['Capital'] = ['Tirana', 'Paris', 'Berlin', 'Tokyo', 'Moscow']
countries
# since the column 'Capital' didn't exist before, it will add it as a new column
# this way we can manipulate our 'spreadsheet' programmatically

Unnamed: 0,Area,Population,Capital
Albania,28748,2937590,Tirana
France,643801,65429495,Paris
Germany,357386,82408706,Berlin
Japan,377972,126922333,Tokyo
Russia,17125200,143910127,Moscow


In [64]:
countries = countries[['Capital', 'Area', 'Population']]
countries

Unnamed: 0,Capital,Area,Population
Albania,Tirana,28748,2937590
France,Paris,643801,65429495
Germany,Berlin,357386,82408706
Japan,Tokyo,377972,126922333
Russia,Moscow,17125200,143910127


In [67]:
countries['Population Density'] = countries['Population'] / countries['Area']
countries

## it doesn't work if you run the "Capital" thing first, since it's not a numerical column
## so you can go back and run the first thing, then skip the "Capital" and run this thing here

Unnamed: 0,Area,Population,Population Density
Albania,28748,2937590,102.184152
France,643801,65429495,101.629999
Germany,357386,82408706,230.587393
Japan,377972,126922333,335.798242
Russia,17125200,143910127,8.403413


In [72]:
countries['Area']

Albania       28748
France       643801
Germany      357386
Japan        377972
Russia     17125200
Name: Area, dtype: int64

### Exercise

In [69]:
# Now try accessing row data with a command like countries['Japan']

countries.['Japan'] # won't work because Japan is not a key


SyntaxError: invalid syntax (<ipython-input-69-6f75564c55e7>, line 3)

**Think, Pair, Share**

In [70]:
countries.loc['Japan']

Area                  3.779720e+05
Population            1.269223e+08
Population Density    3.357982e+02
Name: Japan, dtype: float64

In [71]:
countries.loc['Japan']['Area']

377972.0

### Exercise

In [None]:
# Can you think of a way to return the area of Japan without using .iloc?
# Hint: Try putting the column index first.
# Can you slice along these indices as well?


### DataSeries Creation

In [None]:
countries['Debt-to-GDP Ratio'] = np.nan ##braided column
## np.nan is how you get a non-numerical value
countries

In [73]:
## I've got the data for Russia and Japan indices
debt = pd.Series([0.19, 2.36], index=['Russia', 'Japan'])
## we'll put the values into the indices, but since we have a partial dataset - it will assign them with NaN
countries['Debt-to-GDP Ratio'] = debt
countries

Unnamed: 0,Area,Population,Population Density,Debt-to-GDP Ratio
Albania,28748,2937590,102.184152,
France,643801,65429495,101.629999,
Germany,357386,82408706,230.587393,
Japan,377972,126922333,335.798242,2.36
Russia,17125200,143910127,8.403413,0.19


In [None]:
# remove 'Capital'
del countries['Capital']
countries

In [74]:
# T (transpose) - switches the columns and the rows
# each column is consistent in its data type
# when we transpose it, the rows will be the same data type as before, so we need to modify this as well
countries.T

Unnamed: 0,Albania,France,Germany,Japan,Russia
Area,28748.0,643801.0,357386.0,377972.0,17125200.0
Population,2937590.0,65429500.0,82408710.0,126922300.0,143910100.0
Population Density,102.1842,101.63,230.5874,335.7982,8.403413
Debt-to-GDP Ratio,,,,2.36,0.19


In [75]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.417913,0.789495
b,0.476605,0.90772
c,0.364008,0.059643


## Manipulating data in pandas

### Index objects in pandas

In [76]:
popularity = pd.Series([17.25, 16.09, 10.31, 6.20, 4.80], index=['Java', 'C', 'Python', 'C++', 'C#'])
index = popularity.index
index

Index(['Java', 'C', 'Python', 'C++', 'C#'], dtype='object')

In [77]:
index[2]

'Python'

In [78]:
index[::2]

Index(['Java', 'Python', 'C#'], dtype='object')

**Share**

In [79]:
index[1] = 'See'

TypeError: Index does not support mutable operations

## Data Selection in Series

In [80]:
series_example2 = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
series_example2

a   -0.50
b    0.75
c    1.00
d   -2.00
dtype: float64

In [81]:
series_example2['b']

0.75

In [82]:
'a' in series_example2

True

In [83]:
series_example2.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [84]:
list(series_example2.items())

[('a', -0.5), ('b', 0.75), ('c', 1.0), ('d', -2.0)]

In [85]:
series_example2['e'] = 1.25
series_example2

a   -0.50
b    0.75
c    1.00
d   -2.00
e    1.25
dtype: float64

## Data Selection in DataFrames

In [86]:
area = pd.Series({'Albania': 28748,
                  'France': 643801,
                  'Germany': 357386,
                  'Japan': 377972,
                  'Russia': 17125200})
population = pd.Series ({'Albania': 2937590,
                         'France': 65429495,
                         'Germany': 82408706,
                         'Russia': 143910127,
                         'Japan': 126922333})
countries = pd.DataFrame({'Area': area, 'Population': population})
countries

Unnamed: 0,Area,Population
Albania,28748,2937590
France,643801,65429495
Germany,357386,82408706
Japan,377972,126922333
Russia,17125200,143910127


In [87]:
countries['Area']

Albania       28748
France       643801
Germany      357386
Japan        377972
Russia     17125200
Name: Area, dtype: int64

In [88]:
countries['Population Density'] = countries['Population'] / countries['Area']
countries

Unnamed: 0,Area,Population,Population Density
Albania,28748,2937590,102.184152
France,643801,65429495,101.629999
Germany,357386,82408706,230.587393
Japan,377972,126922333,335.798242
Russia,17125200,143910127,8.403413


### DataFrame as two-dimensional array

In [89]:
countries.values

array([[2.87480000e+04, 2.93759000e+06, 1.02184152e+02],
       [6.43801000e+05, 6.54294950e+07, 1.01629999e+02],
       [3.57386000e+05, 8.24087060e+07, 2.30587393e+02],
       [3.77972000e+05, 1.26922333e+08, 3.35798242e+02],
       [1.71252000e+07, 1.43910127e+08, 8.40341292e+00]])

In [90]:
countries.T

Unnamed: 0,Albania,France,Germany,Japan,Russia
Area,28748.0,643801.0,357386.0,377972.0,17125200.0
Population,2937590.0,65429500.0,82408710.0,126922300.0,143910100.0
Population Density,102.1842,101.63,230.5874,335.7982,8.403413


In [91]:
countries.iloc[:3, :2]

Unnamed: 0,Area,Population
Albania,28748,2937590
France,643801,65429495
Germany,357386,82408706


In [92]:
countries.loc[:'Germany', :'Population']

Unnamed: 0,Area,Population
Albania,28748,2937590
France,643801,65429495
Germany,357386,82408706


### Exercise

In [93]:
# Can you think of how to combine masking and fancy indexing in one line?
# Your masking could be something like countries['Population Density'] > 200
# Your fancy indexing could be something like ['Population', 'Population Density']
# Be sure to put the the masking and fancy indexing inside the square brackets: countries.loc[]


# Operating on Data in Pandas

**Think, Pair, Share** For each of these Sections.

## Index alignment with Series

For our first example, suppose we are combining two different data sources and find only the top five countries by *area* and the top five countries by *population*:

In [94]:
area = pd.Series({'Russia': 17075400, 'Canada':  9984670,
                  'USA': 9826675, 'China': 9598094, 
                  'Brazil': 8514877})
population = pd.Series({'China': 1409517397, 'India': 1339180127,
                        'USA': 324459463, 'Indonesia': 322179605, 
                        'Brazil': 207652865})

In [95]:
# Now divide these to compute the population density
pop_density = area/population
pop_density

Brazil       0.041005
Canada            NaN
China        0.006809
India             NaN
Indonesia         NaN
Russia            NaN
USA          0.030286
dtype: float64

In [96]:
series1 = pd.Series([2, 4, 6], index=[0, 1, 2])
series2 = pd.Series([3, 5, 7], index=[1, 2, 3])
series1 + series2

0     NaN
1     7.0
2    11.0
3     NaN
dtype: float64

In [97]:
series1.add(series2, fill_value=0)

0     2.0
1     7.0
2    11.0
3     7.0
dtype: float64

Much better!

## Index Alignment with DataFrames

In [98]:
rng = np.random.RandomState(42)
df1 = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                   columns=list('AB'))
df1

Unnamed: 0,A,B
0,6,19
1,14,10


In [99]:
df2 = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                   columns=list('BAC'))
df2

Unnamed: 0,B,A,C
0,7,4,6
1,9,2,6
2,7,4,3


In [100]:
# Add df1 and df2. Is the output what you expected?
df1 + df2

Unnamed: 0,A,B,C
0,10.0,26.0,
1,16.0,19.0,
2,,,


In [101]:
fill = df1.stack().mean()
df1.add(df2, fill_value=fill)

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


## Operations Between DataFrames and Series
* Index and column alignment gets maintained in operations between a `DataFrame` and a `Series` as well. To see this, consider a common operation in data science, wherein we find the difference of a `DataFrame` and one of its rows. Because pandas inherits ufuncs from NumPy, pandas will compute the difference row-wise by default:

In [102]:
df3 = pd.DataFrame(rng.randint(10, size=(3, 4)), columns=list('WXYZ'))
df3

Unnamed: 0,W,X,Y,Z
0,7,7,2,5
1,4,1,7,5
2,1,4,0,9


In [13]:
df3 - df3.iloc[0]

Unnamed: 0,W,X,Y,Z
0,0,0,0,0
1,-3,-6,5,0
2,-6,-3,-2,4


In [14]:
df3.subtract(df3['X'], axis=0)

Unnamed: 0,W,X,Y,Z
0,0,0,-5,-2
1,3,0,6,4
2,-3,0,-4,5


In [15]:
halfrow = df3.iloc[0, ::2]
halfrow

W    7
Y    2
Name: 0, dtype: int64

In [16]:
df3 - halfrow

Unnamed: 0,W,X,Y,Z
0,0.0,,0.0,
1,-3.0,,5.0,
2,-6.0,,-2.0,
