In [1]:
import pandas as pd
import numpy as np

'''
There is another data combination situation that can’t be expressed as either a merge
or concatenation operation. You may have two datasets whose indexes overlap in full
or part. As a motivating example, consider NumPy’s where function, which performs
the array-oriented equivalent of an if-else expression
'''

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])

b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [2]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [3]:
np.where(a.isnull, b , a)

array([ 0.,  1.,  2.,  3.,  4., nan])

In [15]:
'''
Series has a combine_first method, which performs the equivalent of this operation
along with pandas’s usual data alignment logic
'''

new_b = b[:-2]
new_b

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [16]:
new_a = a[2:]
new_a

d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [17]:
new_b.combine_first(new_a)

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [12]:
'''
With DataFrames, combine_first does the same thing column by column, so you
can think of it as “patching” missing data in the calling object with data from the
object you pass
'''

df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [13]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [14]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,
