In [1]:
import pandas as pd
import numpy as np

In [2]:
#For convenience, we’ll define this function, which creates a DataFrame of a particular
#form that will be useful below
def make_df(cols,ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Each cell holds the column label followed by the index label, so the
    cell in column 'A' at index 2 is the string 'A2'.

    Parameters
    ----------
    cols : iterable
        Column labels; a string such as 'ABC' yields one column per character.
    ind : iterable
        Row index labels. One-shot iterators (generators, ``iter(...)``) are
        materialised into a list first, because the labels are needed once
        per column and once more for the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` with one string column per entry of ``cols``.
    """
    if iter(ind) is ind:
        # A one-shot iterator would be exhausted after the first column,
        # leaving the remaining columns (and the index) empty.
        ind=list(ind)
    data={c:[str(c)+str(i) for i in ind] for c in cols}
    return pd.DataFrame(data,ind)

In [3]:
# Smoke-test the helper: columns A-C, index labels 0-2
make_df(cols='ABC', ind=range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


# Simple Concatenation with pd.concat

In [4]:
# Two small Series with disjoint integer labels (1-3 and 4-6)
s1 = pd.Series(list('ABC'), index=[1, 2, 3])
s2 = pd.Series(list('DEF'), index=[4, 5, 6])

In [5]:
pd.concat([s1,s1])

1    A
2    B
3    C
1    A
2    B
3    C
dtype: object

In [6]:
#working with dataframes

In [7]:
# First frame: columns A and B, rows labelled 1 and 2
df1 = make_df(cols='AB', ind=[1, 2])

In [8]:
# Display df1 (columns A and B, index labels 1 and 2)
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [12]:
# Second frame: same columns as df1, rows labelled 3 and 4
df2 = make_df(cols='AB', ind=[3, 4])

In [13]:
# Display df2 (columns A and B, index labels 3 and 4)
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [15]:
# Row-wise stacking: axis=0 is the default, spelled out here for clarity
pd.concat([df1, df2], axis=0)

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [21]:
# Same index labels, disjoint column sets: inputs for a column-wise concat
df1 = make_df(cols='AB', ind=[1, 2])
df2 = make_df(cols='CD', ind=[1, 2])

In [22]:
# Display df1 (columns A and B, index labels 1 and 2)
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [23]:
# Display df2 (columns C and D, index labels 1 and 2)
df2

Unnamed: 0,C,D
1,C1,D1
2,C2,D2


In [24]:
#concatenation column wise

In [25]:
# Place the frames side by side, aligning rows on their shared index labels
pd.concat([df1, df2], axis='columns')

Unnamed: 0,A,B,C,D
1,A1,B1,C1,D1
2,A2,B2,C2,D2


# Duplicate indices

In [26]:
#One important difference between np.concatenate and pd.concat is that Pandas
#concatenation preserves indices, even if the result will have duplicate indices!

In [27]:
# Two frames with identical columns AND identical index labels
df1 = make_df(cols='AB', ind=[1, 2])
df2 = make_df(cols='AB', ind=[1, 2])

In [28]:
# The original labels are kept, so index values 1 and 2 each appear twice
pd.concat(objs=[df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
1,A1,B1
2,A2,B2


In [29]:
#Catching the repeats as an error. If you’d like to simply verify that the indices in the
#result of pd.concat() do not overlap, you can specify the verify_integrity flag.
#With this set to True, the concatenation will raise an exception (a ValueError) if there
#are duplicate indices, e.g. pd.concat([df1,df2],verify_integrity=True)

In [30]:
#Ignoring the index. Sometimes the index itself does not matter, and you would prefer
#it to simply be ignored. You can specify this option using the ignore_index flag. With
#this set to True, the concatenation will create a new integer index for the resulting
#object (here a DataFrame)

In [31]:
# Discard the incoming labels and number the result 0..n-1
pd.concat(objs=[df1, df2], ignore_index=True)

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A1,B1
3,A2,B2


In [32]:
#Adding MultiIndex keys. Another alternative is to use the keys option to specify a label
#for the data sources; the result will be a hierarchically indexed object (here a
#DataFrame with a MultiIndex) containing the data

In [33]:
# Tag each source frame with a label; the labels become the outer index level
pd.concat(
    [df1, df2],
    keys=['df1', 'df2'],
)

Unnamed: 0,Unnamed: 1,A,B
df1,1,A1,B1
df1,2,A2,B2
df2,1,A1,B1
df2,2,A2,B2


# Concatenation with joins

In [34]:
#In the simple examples we just looked at, we were mainly concatenating DataFrames
#with shared column names. In practice, data from different sources might have
#different sets of column names, and pd.concat offers several options in this case. Consider
#the concatenation of the following two DataFrames, which have some (but not all!)
#columns in common:

In [35]:
# Columns B and C are shared; A exists only in df1 and D only in df2
df1 = make_df(cols='ABC', ind=[1, 2])
df2 = make_df(cols='BCD', ind=[3, 4])

In [36]:
# join='outer' is the default: keep the union of columns, filling gaps with NaN
pd.concat([df1, df2], join='outer')

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [43]:
#By default, the entries for which no data is available are filled with NA values. To
#change this, we can specify the join parameter of pd.concat. By default, the join is a
#union of the input columns (join='outer'), but we can change this to an intersection
#of the columns using join='inner':

In [44]:
# Keep only the columns present in every input (here B and C)
pd.concat(objs=[df1, df2], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [45]:
#The append() method

In [46]:
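#Because direct array concatenation is so common, Series and DataFrame objects
#historically had an append method that could accomplish the same thing in fewer
#keystrokes: rather than calling pd.concat([df1, df2]), you could simply call
#df1.append(df2). Note that append() was deprecated in pandas 1.4 and removed in
#pandas 2.0, so on current versions pd.concat([df1, df2]) is the way to do this: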

In [48]:
# Left-hand frame: columns A and B, rows labelled 1 and 2
df1 = make_df(cols='AB', ind=[1, 2])

In [49]:
# Right-hand frame: columns A and B, rows labelled 3 and 4
df2 = make_df(cols='AB', ind=[3, 4])

In [50]:
# Display df1 (columns A and B, index labels 1 and 2)
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [51]:
# Display df2 (columns A and B, index labels 3 and 4)
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [52]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so df1.append(df2) raises AttributeError on current releases.
# pd.concat() is the supported equivalent: it returns a new frame holding the
# rows of df1 followed by the rows of df2 and leaves both inputs unchanged.
pd.concat([df1,df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [53]:
#Keep in mind that unlike the append() and extend() methods of Python lists, the
#append() method in Pandas does not modify the original object—instead, it creates a
#new object with the combined data. It also is not a very efficient method, because it
#involves creation of a new index and data buffer. Thus, if you plan to do multiple
#append operations, it is generally better to build a list of DataFrames and pass them all
#at once to the concat() function.

# Thank You