## Combining Datasets: Concat and Append
Eber David Gaytan Medina


Some of the most interesting studies of data come from combining different data sources. These operations can involve anything from very straightforward concatenation of two different datasets, to more complicated database-style joins and merges that correctly handle any overlaps between the datasets. Series and DataFrames are built with this type of operation in mind, and Pandas includes functions and methods that make this sort of data wrangling fast and straightforward.


In [None]:
import pandas as pd
import numpy as np

def make_df(cols, ind):
    """Build a small labelled DataFrame for demonstrations.

    Every cell holds its column label followed by its row label, so the
    value under column ``'A'`` in row ``0`` is the string ``'A0'``.

    Parameters
    ----------
    cols : iterable
        Column labels. A string yields one column per character.
    ind : sequence
        Row labels; also passed to pandas as the index of the result.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols`` and one row per entry of ``ind``.
    """
    columns = {}
    for label in cols:
        prefix = str(label)
        columns[label] = [prefix + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

# example DataFrame
make_df('ABC', range(3))
A	B	C
0	A0	B0	C0
1	A1	B1	C1
2	A2	B2	C2

class display(object):
    """Show the results of several expressions side by side.

    Each positional argument is a *string* containing a Python expression.
    The expression text is used as a caption and its evaluated result is
    rendered underneath: as HTML in a notebook (``_repr_html_``) or as
    plain text elsewhere (``__repr__``).

    Expressions are evaluated in the global namespace of the module that
    defines this class, so they may refer to any global variable there.

    Parameters
    ----------
    *args : str
        Expressions to evaluate and display, in order.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Evaluate ``expr`` against the module globals only."""
        # SECURITY: eval() executes arbitrary code. Only pass expressions
        # written by the notebook author, never text from an outside source.
        # Passing globals() explicitly stops local variables of this class's
        # methods from shadowing a user's global of the same name.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one floating ``<div>`` per expression, joined by newlines."""
        from html import escape  # local import keeps the class self-contained

        blocks = []
        for expr in self.args:
            value = self._evaluate(expr)
            renderer = getattr(value, '_repr_html_', None)
            body = renderer() if callable(renderer) else None
            if body is None:
                # No rich HTML form available (plain lists, arrays, scalars):
                # fall back to the escaped text repr.
                body = '<pre>{0}</pre>'.format(escape(repr(value)))
            # Escape the caption so '<' or '&' in an expression cannot break
            # the markup; quotes are left as typed.
            blocks.append(self.template.format(escape(expr, quote=False), body))
        return '\n'.join(blocks)

    def __repr__(self):
        """Return each expression followed by its repr, blank-line separated."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)
    

# Three plain Python lists joined end to end into a single 1-D array.
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])
array([1, 2, 3, 4, 5, 6, 7, 8, 9])

# A 2x2 nested list. With axis=1 the second copy is placed to the right of
# the first, so each row becomes longer instead of new rows being added.
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)
array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

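# Signature in Pandas v0.18 (shown for reference only; `objs` is a placeholder, so this cell is not meant to be run. Note that `join_axes` was removed in pandas 1.0.)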
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
pd.concat() can be used for a simple concatenation of Series or DataFrame objects, just as np.concatenate() can be used for simple concatenations of arrays:

# Two Series with non-overlapping integer labels; concat stacks them and
# each element keeps its original label.
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

# Same columns, different row labels: the default concat stacks the frames
# row-wise, one below the other.
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')
df1

A	B
1	A1	B1
2	A2	B2
df2

A	B
3	A3	B3
4	A4	B4
pd.concat([df1, df2])

A	B
1	A1	B1
2	A2	B2
3	A3	B3
4	A4	B4

# Two frames that share row labels but have disjoint columns.
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
# 'columns' (equivalently axis=1) is the documented axis name. The shorter
# 'col' spelling is not a recognised alias and current pandas rejects it
# with a ValueError, so the full name is used here.
display('df3', 'df4', "pd.concat([df3, df4], axis='columns')")
df3

A	B
0	A0	B0
1	A1	B1
df4

C	D
0	C0	D0
1	C1	D1
pd.concat([df3, df4], axis='col')

A	B	C	D
0	A0	B0	C0	D0
1	A1	B1	C1	D1

# Build two frames, force them to share the same row labels, and show that
# concat keeps the repeated labels in its result.
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index  # make duplicate indices!
display('x', 'y', 'pd.concat([x, y])')
x

A	B
0	A0	B0
1	A1	B1
y

A	B
0	A2	B2
1	A3	B3
pd.concat([x, y])

A	B
0	A0	B0
1	A1	B1
0	A2	B2
1	A3	B3

# Ask concat to refuse overlapping row labels instead of keeping both
# copies; the resulting error is caught and printed rather than raised.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

# ignore_index=True discards the incoming row labels and numbers the rows
# of the result afresh, starting from 0.
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')
x

A	B
0	A0	B0
1	A1	B1
y

A	B
0	A2	B2
1	A3	B3
pd.concat([x, y], ignore_index=True)

A	B
0	A0	B0
1	A1	B1
2	A2	B2
3	A3	B3

# keys=[...] tags the rows of each input with an outer label, so the result
# carries a two-level (hierarchical) row index.
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")
x

A	B
0	A0	B0
1	A1	B1
y

A	B
0	A2	B2
1	A3	B3
pd.concat([x, y], keys=['x', 'y'])

A	B
x	0	A0	B0
	1	A1	B1
y	0	A2	B2
	1	A3	B3

# Frames whose columns only partly overlap. By default the result has every
# column found in either input (A, B, C, D), with NaN wherever a frame had
# no such column.
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
display('df5', 'df6', 'pd.concat([df5, df6])')
df5

A	B	C
1	A1	B1	C1
2	A2	B2	C2
df6

B	C	D
3	B3	C3	D3
4	B4	C4	D4
pd.concat([df5, df6])

A	B	C	D
1	A1	B1	C1	NaN
2	A2	B2	C2	NaN
3	NaN	B3	C3	D3
4	NaN	B4	C4	D4

# join='inner' keeps only the columns present in both frames (B and C).
display('df5', 'df6',
        "pd.concat([df5, df6], join='inner')")
df5

A	B	C
1	A1	B1	C1
2	A2	B2	C2
df6

B	C	D
3	B3	C3	D3
4	B4	C4	D4
pd.concat([df5, df6], join='inner')

B	C
1	B1	C1
2	B2	C2
3	B3	C3
4	B4	C4

# Keep exactly the columns of df5 in the result. The `join_axes` argument
# that used to do this was deprecated in pandas 0.25 and removed in 1.0;
# reindexing the concatenated frame on df5's columns gives the same table
# and works on old and new pandas alike.
display('df5', 'df6',
        "pd.concat([df5, df6]).reindex(columns=df5.columns)")
df5

A	B	C
1	A1	B1	C1
2	A2	B2	C2
df6

B	C	D
3	B3	C3	D3
4	B4	C4	D4
pd.concat([df5, df6], join_axes=[df5.columns])

A	B	C
1	A1	B1	C1
2	A2	B2	C2
3	NaN	B3	C3
4	NaN	B4	C4

# DataFrame.append returns a new frame with df2's rows placed after df1's;
# the output shown matches pd.concat([df1, df2]).
# NOTE(review): DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0 -- confirm the target pandas version; on newer releases use pd.concat.
display('df1', 'df2', 'df1.append(df2)')
df1

A	B
1	A1	B1
2	A2	B2
df2

A	B
3	A3	B3
4	A4	B4
df1.append(df2)

A	B
1	A1	B1
2	A2	B2
3	A3	B3
4	A4	B4
