## Combining datasets
Pandas provides various facilities for easily combining Series and DataFrames:
- **Merge:** combining DataFrames across rows or columns
- **Join:** combining data on a key column or index
- **Concatenate:** combining data on common columns or indexes

In [1]:
from helpers import sample_df, hdisplay, nowrap_display
import pandas as pd

### Concat
The pandas `pd.concat()` function concatenates DataFrames row-wise or column-wise,  
with optional set logic (union or intersection) of the indexes on the other axis.


Concat simply stacks multiple DataFrames together, either vertically (aligning on column labels) or horizontally (aligning on row labels).

In [10]:
# Sample data: two frames built from the same "A0".."D3" range, differing only
# in the value prefix ("L_" for left, "R_" for right).
left, right = (sample_df("A0", "D3", prefix) for prefix in ("L_", "R_"))

hdisplay(
    [left, right],
    ["left", "right"],
)


Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,A,B,C,D
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3


In [16]:
# Concatenate the same pair of frames along each axis in turn:
# "index" stacks the rows, "columns" places the frames side by side.
hdisplay(
    [pd.concat([left, right], axis=axis) for axis in ("index", "columns")],
    ["axis='index'", "axis='columns'"],
)

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,L_A0,L_B0,L_C0,L_D0,R_A0,R_B0,R_C0,R_D0
1,L_A1,L_B1,L_C1,L_D1,R_A1,R_B1,R_C1,R_D1
2,L_A2,L_B2,L_C2,L_D2,R_A2,R_B2,R_C2,R_D2
3,L_A3,L_B3,L_C3,L_D3,R_A3,R_B3,R_C3,R_D3


In [27]:
# Compare how `ignore_index` treats the labels along the concatenation axis:
# False keeps the original labels, True replaces them with 0..n-1.
# Other ways to re-label the result: chain `.reset_index()` or
# `.set_index("C")` onto `pd.concat([left, right])`.
hdisplay(
    [
        pd.concat([left, right], **options)
        for options in (
            {"ignore_index": False},
            {"ignore_index": True},
            {"axis": "columns", "ignore_index": True},
        )
    ],
    ["ignore_index=False", "ignore_index=True", "axis=columns, ignore_index=True"],
)



Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
0,R_A0,R_B0,R_C0,R_D0
1,R_A1,R_B1,R_C1,R_D1
2,R_A2,R_B2,R_C2,R_D2
3,R_A3,R_B3,R_C3,R_D3

Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3
4,R_A0,R_B0,R_C0,R_D0
5,R_A1,R_B1,R_C1,R_D1
6,R_A2,R_B2,R_C2,R_D2
7,R_A3,R_B3,R_C3,R_D3

Unnamed: 0,0,1,2,3,4,5,6,7
0,L_A0,L_B0,L_C0,L_D0,R_A0,R_B0,R_C0,R_D0
1,L_A1,L_B1,L_C1,L_D1,R_A1,R_B1,R_C1,R_D1
2,L_A2,L_B2,L_C2,L_D2,R_A2,R_B2,R_C2,R_D2
3,L_A3,L_B3,L_C3,L_D3,R_A3,R_B3,R_C3,R_D3


In [31]:
# New sample data: the frames now overlap only partially
# (shared columns C and D, shared row labels 2 and 3).
left = sample_df("A0", "D3", "L_")
right = sample_df("C2", "F5", "R_")

hdisplay([left, right], ["Left", "Right"])


Unnamed: 0,A,B,C,D
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,C,D,E,F
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


In [39]:
# `join` controls the set logic applied to the labels of the *other* axis:
# "outer" (the default) keeps the union, "inner" keeps only the intersection.
hdisplay(
    [
        pd.concat([left, right], **options)
        for options in (
            {"axis": "index"},
            {"axis": "columns", "join": "outer"},  # union (default)
            {"axis": "columns", "join": "inner"},  # intersect
        )
    ],
    ["axis='index'", "axis='columns'", "axis='columns', join='inner'"],
)

Unnamed: 0,A,B,C,D,E,F
0,L_A0,L_B0,L_C0,L_D0,,
1,L_A1,L_B1,L_C1,L_D1,,
2,L_A2,L_B2,L_C2,L_D2,,
3,L_A3,L_B3,L_C3,L_D3,,
2,,,R_C2,R_D2,R_E2,R_F2
3,,,R_C3,R_D3,R_E3,R_F3
4,,,R_C4,R_D4,R_E4,R_F4
5,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,A,B,C,D,C.1,D.1,E,F
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3


In [58]:
# `keys` adds an outer label level that records which input each piece came from.
# For the column-wise result, `.loc[:, "right"]` then selects only the columns
# that were contributed by the frame labelled "right".
hdisplay(
    [
        pd.concat([left, right], keys=["left", "right"], axis="index"),
        pd.concat([left, right], keys=["left", "right"], axis="columns").loc[:, "right"],
    ],
    ["axis='index', keys=[left, right]", "axis='columns', keys=[left, right]"],
)


Unnamed: 0,Unnamed: 1,A,B,C,D,E,F
left,0,L_A0,L_B0,L_C0,L_D0,,
left,1,L_A1,L_B1,L_C1,L_D1,,
left,2,L_A2,L_B2,L_C2,L_D2,,
left,3,L_A3,L_B3,L_C3,L_D3,,
right,2,,,R_C2,R_D2,R_E2,R_F2
right,3,,,R_C3,R_D3,R_E3,R_F3
right,4,,,R_C4,R_D4,R_E4,R_F4
right,5,,,R_C5,R_D5,R_E5,R_F5

Unnamed: 0,C,D,E,F
0,,,,
1,,,,
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


### Join  

A join first aligns the indexes of the two DataFrames, and then picks up the remaining columns from the aligned rows of each DataFrame.

In [70]:
# Rebuild the partially overlapping frames and prefix every column name
# ("L" / "R") so the two frames share no column names
# (presumably so that `join` needs no lsuffix/rsuffix -- confirm).
left = sample_df("A0", "D3", "L_").add_prefix("L")
right = sample_df("C2", "F5", "R_").add_prefix("R")

hdisplay(
    [left, right],
    ["Left", "Right"],
)

Unnamed: 0,LA,LB,LC,LD
0,L_A0,L_B0,L_C0,L_D0
1,L_A1,L_B1,L_C1,L_D1
2,L_A2,L_B2,L_C2,L_D2
3,L_A3,L_B3,L_C3,L_D3

Unnamed: 0,RC,RD,RE,RF
2,R_C2,R_D2,R_E2,R_F2
3,R_C3,R_D3,R_E3,R_F3
4,R_C4,R_D4,R_E4,R_F4
5,R_C5,R_D5,R_E5,R_F5


In [78]:
# Index-based joins: how="left" keeps the row labels of `left`,
# how="right" keeps the row labels of `right`; unmatched cells become NaN.
# One label per displayed frame (the leftover third label "new" had no frame).
hdisplay(
    [left.join(right, how=how) for how in ("left", "right")],
    ["how='left'", "how='right'"],
)

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5


In [81]:
# how="inner" keeps only the row labels present in both frames,
# how="outer" keeps the union of the row labels.
hdisplay(
    [left.join(right, how=how) for how in ("inner", "outer")],
    ["how='inner'", "how='outer'"],
)

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3

Unnamed: 0,LA,LB,LC,LD,RC,RD,RE,RF
0,L_A0,L_B0,L_C0,L_D0,,,,
1,L_A1,L_B1,L_C1,L_D1,,,,
2,L_A2,L_B2,L_C2,L_D2,R_C2,R_D2,R_E2,R_F2
3,L_A3,L_B3,L_C3,L_D3,R_C3,R_D3,R_E3,R_F3
4,,,,,R_C4,R_D4,R_E4,R_F4
5,,,,,R_C5,R_D5,R_E5,R_F5
