In [293]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import sys

# Day 7

## Index Objects

pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels used when constructing
a Series or DataFrame is internally converted to an Index:

In [328]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

index = obj.index
index

a    0
b    1
c    2
dtype: int64

Index(['a', 'b', 'c'], dtype='object')

In [295]:
index[1:]

Index(['b', 'c'], dtype='object')

In [296]:
# Index objects are immutable and thus can’t be modified by the user:
index[1] = 'd'

TypeError: Index does not support mutable operations

In [379]:
# Immutability is important so that Index objects can be safely shared among data structures:
index = pd.Index(np.arange(3))
obj2 = pd.Series([1.5, -2.5, 0], index=index)

index
obj2

obj2.index is index

Int64Index([0, 1, 2], dtype='int64')

0    1.5
1   -2.5
2    0.0
dtype: float64

True

## Essential Functionality
In this section, I’ll walk you through the fundamental mechanics of interacting with
the data contained in a Series or DataFrame. In next sessions we will delve more deeply
into data analysis and manipulation topics using pandas.

#### Reindexing
A critical method on pandas objects is reindex, which means to create a new object
with the data conformed to a new index. Consider a simple example from above:

In [381]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [334]:
# Calling reindex on this Series rearranges the data according to the new index, introducing
# missing values if any index values were not already present:

obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [384]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value='')

a   -5.3
b    7.2
c    3.6
d    4.5
e       
dtype: object

In [385]:
# For ordered data like time series, it may be desirable to do some interpolation or filling
# of values when reindexing. The method option allows us to do this, using a method such
# as ffill which forward fills the values:

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

obj3
obj3.reindex(range(6), method='ffill')

0      blue
2    purple
4    yellow
dtype: object

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [302]:
# With DataFrame, reindex can alter either the (row) index, columns, or both. When
# passed just a sequence, the rows are reindexed in the result:

frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])

frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [303]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [304]:
# The columns can be reindexed using the columns keyword:

states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [344]:
# Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):

frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


#### Dropping entries from an axis

Dropping one or more entries from an axis is easy if you have an index array or list
without those entries. As that can require a bit of munging and set logic, the drop
method will return a new object with the indicated value or values deleted from an axis:

In [306]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [307]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [346]:
# With DataFrame, index values can be deleted from either axis:

data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

data
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [309]:
# Dropping columns

data.drop('two', axis = 1)
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


#### Indexing, selection, and filtering
Series indexing (obj[...]) works analogously to NumPy array indexing, except you can
use the Series’s index values instead of only integers. Here are some examples this:

In [389]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

obj

obj['b']
obj[1]
obj[2:4]

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

1.0

1.0

c    2.0
d    3.0
dtype: float64

In [390]:
obj < 2

a     True
b     True
c    False
d    False
dtype: bool

In [392]:
obj[['b', 'a', 'd']]
obj[[1, 3]]
obj[obj < 2]

b    1.0
a    0.0
d    3.0
dtype: float64

b    1.0
d    3.0
dtype: float64

a    0.0
b    1.0
dtype: float64

In [393]:
# Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:
obj['b':'c']

# Setting using these methods works just as you would expect:
obj['b':'c'] = 5
obj

b    1.0
c    2.0
dtype: float64

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [313]:
# As you’ve seen above, indexing into a DataFrame is for retrieving one or more columns either with a single 
# value or sequence:

data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [314]:
data['two']
data[['three', 'one']]

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [394]:
data['three'] > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [348]:
# Indexing like this has a few special cases. First selecting rows by slicing or a boolean array:
data[:2]
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [349]:
# This might seem inconsistent but this syntax arose out of practicality and nothing more. 
# Another use case is in indexing with a boolean DataFrame, such as one produced by a scalar comparison
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [350]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [319]:
data.one

Ohio         0
Colorado     0
Utah         8
New York    12
Name: one, dtype: int32

In [320]:
data['three']

Ohio         0
Colorado     6
Utah        10
New York    14
Name: three, dtype: int32

In [321]:
data.loc['Utah']

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [322]:
data.iloc[0:3,1]

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

## Arithmetic and data alignment

One of the most important pandas features is the behavior of arithmetic between objects
with different indexes. When adding together objects, if any index pairs are not
the same, the respective index in the result will be the union of the index pairs. Let’s
look at a simple example:

In [66]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

s1
s2

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [67]:
# Adding these together yields:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [69]:
# The internal data alignment introduces NA values in the indices that don’t overlap.
# Missing values propagate in arithmetic computations.

#In the case of DataFrame, alignment is performed on both the rows and the columns:

df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])

df1
df2

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [70]:
# Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:

df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


#### Arithmetic methods with fill values
In arithmetic operations between differently-indexed objects, you might want to fill
with a special value, like 0, when an axis label is found in one object but not the other:

In [395]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

df1
df2

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [396]:
#  Adding these together results in NA values in the locations that don’t overlap:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [397]:
# Using the add method on df1, I pass df2 and an argument to fill_value:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [398]:
df2.columns

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [75]:
# Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [None]:
# Some more flexible arithmetic methods

# Method     Description
# add        Method for addition (+)
# sub        Method for subtraction (-)
# div        Method for division (/)
# mul        Method for multiplication (*)

#### Operations between DataFrame and Series
As with NumPy arrays, arithmetic between DataFrame and Series is well-defined. First,
as a motivating example, consider the difference between a 2D array and one of its rows:

In [76]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [79]:
arr[0]
np.array([ 0., 1., 2., 3.])

array([0., 1., 2., 3.])

array([0., 1., 2., 3.])

In [80]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [82]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [84]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [85]:
# By default, arithmetic between DataFrame and Series matches the index of the Series
# on the DataFrame's columns, broadcasting down the rows

frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [89]:
# If an index value is not found in either the DataFrame’s columns or the Series’s index,
# the objects will be reindexed to form the union:

series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


#### Function application and mapping

In [399]:
# NumPy ufuncs (element-wise array methods) work fine with pandas objects:

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.801248,-0.079584,0.904604
Ohio,-0.090931,-0.539557,-0.002064
Texas,-0.120773,-1.386287,0.620191
Oregon,1.501839,-1.330355,-0.459878


Unnamed: 0,b,d,e
Utah,0.801248,0.079584,0.904604
Ohio,0.090931,0.539557,0.002064
Texas,0.120773,1.386287,0.620191
Oregon,1.501839,1.330355,0.459878


In [400]:
# Another frequent operation is applying a function on 1D arrays to each column or row. 
# DataFrame’s apply method does exactly this:

f = lambda x: x.max() - x.min()

frame.apply(f)
frame.apply(f, axis=1)

b    1.622613
d    1.306702
e    1.364482
dtype: float64

Utah      0.984189
Ohio      0.537492
Texas     2.006478
Oregon    2.832195
dtype: float64

In [363]:
# The function passed to apply need not return a scalar value, it can also return a Series
# with multiple values:

def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

In [369]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-1.538037,-0.731637,-1.852203
max,1.827822,0.410738,-0.380022


#### Sorting and ranking
Sorting a data set by some criterion is another important built-in operation. To sort
lexicographically by row or column index, use the sort_index method, which returns
a new, sorted object:

In [104]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [401]:
# With a DataFrame, you can sort by index on either axis:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])

frame
frame.sort_index()
frame.sort_index(axis=1)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [107]:
# The data is sorted in ascending order by default, but can be sorted in descending order, too:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [117]:
# To sort a Series by its values:
obj = pd.Series([4, 7, -3, 2])

obj.sort_values()
obj.sort_values(ascending = False)

2   -3
3    2
0    4
1    7
dtype: int64

1    7
0    4
3    2
2   -3
dtype: int64

In [402]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

obj
obj.sort_values()

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [125]:
# On DataFrame, you may want to sort by the values in one or more columns. To do so, 
# pass one or more column names to the by option:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

frame.sort_index(axis = 0)

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [127]:
# sort by column
frame.sort_values(by = 'b') 

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [129]:
#To sort by multiple columns, pass a list of names:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


#### Axis indexes with duplicate values
Up until now all of the examples I’ve showed you have had unique axis labels (index
values). While many pandas functions (like reindex) require that the labels be unique,
it’s not mandatory. Let’s consider a small Series with duplicate indices:

In [136]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [137]:
# The index’s is_unique property can tell you whether its values are unique or not:
obj.index.is_unique

False

In [138]:
# Data selection is one of the main things that behaves differently with duplicates. Indexing
# a value with multiple entries returns a Series while single entries return a scalar value:
obj['a']
obj['c']

a    0
a    1
dtype: int64

4

In [140]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

Unnamed: 0,0,1,2
a,-2.416306,-0.679961,-0.780025
a,-0.391383,0.22518,0.405252
b,0.401572,0.624115,-0.087554
b,-0.442068,-0.769922,-0.153757


In [141]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.401572,0.624115,-0.087554
b,-0.442068,-0.769922,-0.153757


### Summarizing and Computing Descriptive Statistics
pandas objects are equipped with a set of common mathematical and statistical methods.
Most of these fall into the category of reductions or summary statistics, methods
that extract a single value (like the sum or mean) from a Series or a Series of values from
the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla
NumPy arrays, they are all built from the ground up to exclude missing data. Consider
a small DataFrame:

In [404]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],columns=['one', 'two'])

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [405]:
# Calling DataFrame’s sum method returns a Series containing column sums
df.sum()

one    9.25
two   -5.80
dtype: float64

In [406]:
# Passing axis=1 sums over the rows instead:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [407]:
# NA values are excluded unless the entire slice (row or column in this case) is NA. This
# can be disabled using the skipna option:

df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [148]:
# Options for reduction methods

# METHOD           DESCRIPTION
# axis             Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
# skipna           Exclude missing values, True by default.

In [408]:
# Some methods, like idxmin and idxmax, return indirect statistics like the index value
# where the minimum or maximum values are attained:
df.idxmax()

one    b
two    d
dtype: object

In [409]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [150]:
# Other methods are accumulations:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [151]:
# Another type of method is neither a reduction nor an accumulation. describe is one
# such example, producing multiple summary statistics in one shot:

df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [327]:
# On non-numeric data, describe produces alternate summary statistics:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

obj
obj.describe()

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

count     16
unique     3
top        a
freq       8
dtype: object

In [None]:
# Descriptive and summary statistics

# METHOD                 DESCRIPTION
# count                  Number of non-NA values
# describe               Compute set of summary statistics for Series or each DataFrame column
# min, max               Compute minimum and maximum values
# argmin, argmax         Compute index locations (integers) at which minimum or maximum value obtained, respectively
# idxmin, idxmax         Compute index values at which minimum or maximum value obtained, respectively
# quantile               Compute sample quantile ranging from 0 to 1
# sum                    Sum of values
# mean                   Mean of values
# median                 Arithmetic median (50% quantile) of values
# mad                    Mean absolute deviation from mean value
# var                    Sample variance of values
# std                    Sample standard deviation of values
# skew                   Sample skewness (3rd moment) of values
# kurt                   Sample kurtosis (4th moment) of values
# cumsum                 Cumulative sum of values
# cummin, cummax         Cumulative minimum or maximum of values, respectively
# cumprod                Cumulative product of values
# diff                   Compute 1st arithmetic difference (useful for time series)
# pct_change             Compute percent changes

### Unique Values, Value Counts, and Membership
Another class of related methods extracts information about the values contained in a
one-dimensional Series. To illustrate these, consider this example:

In [373]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

# The first function is unique, which gives you an array of the unique values in a Series:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [374]:
# The unique values are not necessarily returned in sorted order, but could be sorted after
# the fact if needed (uniques.sort()). Relatedly, value_counts computes a Series containing
# value frequencies:

obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [376]:
# The Series is sorted by value in descending order as a convenience. value_counts is also
# available as a top-level pandas method that can be used with any array or sequence:

pd.value_counts(obj.values, sort=False)

a    3
d    1
b    2
c    3
dtype: int64

In [None]:
# Lastly, isin is responsible for vectorized set membership and can be very useful in
# filtering a data set down to a subset of values in a Series or column in a DataFrame

In [161]:
mask = obj.isin(['b', 'c'])

mask
obj[mask]

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

0    c
5    b
6    b
7    c
8    c
dtype: object

In [163]:
# Unique, value counts, and binning methods

# Method              Description
# isin                Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
# unique              Compute array of unique values in a Series, returned in the order observed.
# value_counts        Return a Series containing unique values as its index and frequencies as its values, ordered count in descending order.

### Handling Missing Data
Missing data is common in most data analysis applications. One of the goals in designing
pandas was to make working with missing data as painless as possible. For
example, all of the descriptive statistics on pandas objects exclude missing data. pandas uses the floating point value NaN (Not a Number) to represent missing data in
both floating as well as in non-floating point arrays. It is just used as a sentinel that can
be easily detected

In [377]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

In [170]:
# The built-in Python None value is also treated as NA in object arrays:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [171]:
# NA handling methods
# Argument              Description
# dropna                Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
# fillna                Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
# isnull                Return like-type object containing boolean values indicating which values are missing / NA.
# notnull               Negation of isnull.

### Filtering Out Missing Data
You have a number of options for filtering out missing data. While doing it by hand is
always an option, dropna can be very helpful. On a Series, it returns the Series with only
the non-null data and index values:

In [173]:
from numpy import nan as NA

In [174]:
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [175]:
# Naturally, you could have computed this yourself by boolean indexing:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [178]:
# With DataFrame objects, these are a bit more complex. You may want to drop rows
# or columns which are all NA or just those containing any NAs. dropna by default drops
# any row containing a missing value:

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [179]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [180]:
# Passing how='all' will only drop rows that are all NA:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [182]:
# Dropping columns in the same way is only a matter of passing axis=1:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [183]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [184]:
# A related way to filter out DataFrame rows tends to concern time series data. Suppose you want to keep only rows 
# containing a certain number of observations. You can indicate this with the thresh argument:

df = pd.DataFrame(np.random.randn(7, 3))

In [187]:
df.iloc[:4, 1] = NA; df.iloc[:2, 2] = NA

In [188]:
df

Unnamed: 0,0,1,2
0,-0.470949,,
1,-0.676064,,
2,0.338393,,-1.169157
3,1.412685,,1.683623
4,-1.780245,-1.324202,-0.783069
5,-0.851906,0.559228,-0.648063
6,-1.249362,-0.063537,-1.039349


In [189]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2
4,-1.780245,-1.324202,-0.783069
5,-0.851906,0.559228,-0.648063
6,-1.249362,-0.063537,-1.039349


### Filling in Missing Data
Rather than filtering out missing data (and potentially discarding other data along with
it), you may want to fill in the “holes” in any number of ways. For most purposes, the
fillna method is the workhorse function to use. Calling fillna with a constant replaces
missing values with that value:

In [190]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.470949,0.0,0.0
1,-0.676064,0.0,0.0
2,0.338393,0.0,-1.169157
3,1.412685,0.0,1.683623
4,-1.780245,-1.324202,-0.783069
5,-0.851906,0.559228,-0.648063
6,-1.249362,-0.063537,-1.039349


In [192]:
# Calling fillna with a dict you can use a different fill value for each column:
df.fillna({1: 0.5, 3: -1})

Unnamed: 0,0,1,2
0,-0.470949,0.5,
1,-0.676064,0.5,
2,0.338393,0.5,-1.169157
3,1.412685,0.5,1.683623
4,-1.780245,-1.324202,-0.783069
5,-0.851906,0.559228,-0.648063
6,-1.249362,-0.063537,-1.039349


In [194]:
# fillna returns a new object, but you can modify the existing object in place:
# always returns a reference to the filled object

_ = df.fillna(0, inplace=True)

df

Unnamed: 0,0,1,2
0,-0.470949,0.0,0.0
1,-0.676064,0.0,0.0
2,0.338393,0.0,-1.169157
3,1.412685,0.0,1.683623
4,-1.780245,-1.324202,-0.783069
5,-0.851906,0.559228,-0.648063
6,-1.249362,-0.063537,-1.039349


In [197]:
# The same interpolation methods available for reindexing can be used with fillna
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA; df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.258082,-1.344454,-1.565255
1,-0.18191,1.30715,0.990387
2,-0.258835,,0.862337
3,0.855741,,-3.045072
4,1.021922,,
5,-0.88796,,


In [198]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.258082,-1.344454,-1.565255
1,-0.18191,1.30715,0.990387
2,-0.258835,1.30715,0.862337
3,0.855741,1.30715,-3.045072
4,1.021922,1.30715,-3.045072
5,-0.88796,1.30715,-3.045072


In [199]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.258082,-1.344454,-1.565255
1,-0.18191,1.30715,0.990387
2,-0.258835,1.30715,0.862337
3,0.855741,1.30715,-3.045072
4,1.021922,,-3.045072
5,-0.88796,,-3.045072


In [202]:
# With fillna you can do lots of other things with a little creativity. For example, you
# might pass the mean or median value of a Series:

data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [203]:
# fillna function arguments

# Argument                Description
# value                   Scalar value or dict-like object to use to fill missing values
# method                  Interpolation, by default 'ffill' if function called with no other arguments
# axis                    Axis to fill on, default axis=0
# inplace                 Modify the calling object without producing a copy
# limit                   For forward and backward filling, maximum number of consecutive periods to fill