In [None]:
# Pandas Merge_Ordered
https://www.skytowner.com/explore/pandas_merge_ordered_method

In [1]:
import pandas as pd

In [2]:
# Left table: products and the name of whoever bought each one, labelled A-C.
# "david" has no matching row in the customers table below.
df_products = pd.DataFrame(
    data={
        "product": ["computer", "smartphone", "headphones"],
        "bought_by": ["bob", "alex", "david"],
    },
    index=list("ABC"),
)

# Right table: customers and their ages (default integer index).
# "cathy" does not appear as a buyer in the products table above.
df_customers = pd.DataFrame(
    data={
        "name": ["alex", "bob", "cathy"],
        "age": [10, 20, 30],
    },
)

In [3]:
# Display the products table (left side of the joins below).
df_products

Unnamed: 0,product,bought_by
A,computer,bob
B,smartphone,alex
C,headphones,david


In [4]:
# Display the customers table (right side of the joins below).
df_customers

Unnamed: 0,name,age
0,alex,10
1,bob,20
2,cathy,30


In [5]:
# Outer join: match the left "bought_by" column against the right "name" column
# and keep the unmatched rows from both sides (filled with NaN).
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
)

Unnamed: 0,product,bought_by,name,age
0,smartphone,alex,alex,10.0
1,computer,bob,bob,20.0
2,,,cathy,30.0
3,headphones,david,,


In [6]:
# Unlike merge(~), merge_ordered(~) lets you fill the missing values that arise from the join.
# By default, fill_method=None, which means that the resulting NaN are left as is:
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
)

Unnamed: 0,product,bought_by,name,age
0,smartphone,alex,alex,10.0
1,computer,bob,bob,20.0
2,,,cathy,30.0
3,headphones,david,,


In [8]:
# To fill those NaN, set fill_method="ffill": each missing entry is replaced
# with the value from the row above it in the merged result.
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
    fill_method="ffill",
)

Unnamed: 0,product,bought_by,name,age
0,smartphone,alex,alex,10
1,computer,bob,bob,20
2,computer,bob,cathy,30
3,headphones,david,cathy,30


Note that this example only illustrates how the filling works - in practice we would never fill unrelated rows like this. The filling logic is mainly useful for time series, where it makes sense to fill a missing entry with the most recently recorded value.

In [9]:
# By default, left_by=None, which means that the resulting DataFrame is
# constructed using a traditional join:
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
)

Unnamed: 0,product,bought_by,name,age
0,smartphone,alex,alex,10.0
1,computer,bob,bob,20.0
2,,,cathy,30.0
3,headphones,david,,


In [10]:
# With left_by="product", the left DataFrame is split into one group per product
# and each group is merged with the right DataFrame separately. The pieces are
# then stacked, so every product ends up paired with every customer.
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
    left_by="product",
)

Unnamed: 0,product,bought_by,name,age
0,computer,,alex,10.0
1,computer,bob,bob,20.0
2,computer,,cathy,30.0
3,smartphone,alex,alex,10.0
4,smartphone,,bob,20.0
5,smartphone,,cathy,30.0
6,headphones,,alex,10.0
7,headphones,,bob,20.0
8,headphones,,cathy,30.0
9,headphones,david,,


In [11]:
# Rebuild the products table with an extra "age" column so that both frames
# share a column label; "bob" now appears as the buyer of two products.
df_products = pd.DataFrame(
    data={
        "product": ["computer", "smartphone", "headphones"],
        "age": [7, 8, 9],
        "bought_by": ["bob", "alex", "bob"],
    },
    index=list("ABC"),
)

# Customers table, with its own "age" column (default integer index).
df_customers = pd.DataFrame(
    data={
        "name": ["alex", "bob", "cathy"],
        "age": [10, 20, 30],
    },
)

In [12]:
# Notice how the two DataFrames have an overlapping column label - age.
# By default, suffixes=("_x", "_y"), which means that if the merged DataFrame
# has overlapping column labels, the suffix "_x" is appended to the label from
# the left DataFrame and "_y" to the label from the right DataFrame:
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
)

Unnamed: 0,product,age_x,bought_by,name,age_y
0,smartphone,8.0,alex,alex,10
1,computer,7.0,bob,bob,20
2,headphones,9.0,bob,bob,20
3,,,,cathy,30


In [13]:
# Custom suffixes replace the default "_x"/"_y" on the overlapping "age" column:
# the left column becomes "age_A" and the right column becomes "age_B".
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
    suffixes=["_A", "_B"],
)

Unnamed: 0,product,age_A,bought_by,name,age_B
0,smartphone,8.0,alex,alex,10
1,computer,7.0,bob,bob,20
2,headphones,9.0,bob,bob,20
3,,,,cathy,30


In [14]:
# Passing None as a suffix leaves that side's overlapping label unchanged:
# the left column stays "age", while the right column becomes "age_B".
pd.merge_ordered(
    left=df_products,
    right=df_customers,
    how="outer",
    left_on="bought_by",
    right_on="name",
    suffixes=[None, "_B"],
)

Unnamed: 0,product,age,bought_by,name,age_B
0,smartphone,8.0,alex,alex,10
1,computer,7.0,bob,bob,20
2,headphones,9.0,bob,bob,20
3,,,,cathy,30
