# Combining Dataset: concat & append

In [1]:
import pandas as pd
import numpy as np

In [15]:
# Defining a function to create a DataFrame

def create_df(cols,ind):
    '''Quickly creating a dataframe'''
    data = {c:[str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data)
create_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [11]:
# Recall: cocncatenation of numpy arrays

x = [1,2,3,4]
y = [3,4,5,6]   # Combining the contents of two or more arrays into a single array
z = [4,5,6,7]

np.concatenate((x,y,z))

array([1, 2, 3, 4, 3, 4, 5, 6, 4, 5, 6, 7])

In [26]:
# Simple concatenation with pd.concat
# pd.concat() can be used concatenate series or dataframe

ser1 = pd.Series(['A','B','C'], index = [1,2,3])
ser2 = pd.Series(['E', 'F', 'G'], index = [4,5,6])
pd.concat([ser1,ser2])

# It also work with DataFrames

df1 = create_df('AB', [1,2])
df2 = create_df('CD',[3,4])

print(df1); print(df2); print(pd.concat([df1,df2]))

# Concatenating along the column

print(df1); print(df2); print(pd.concat([df1,df2], axis =1))


    A   B
0  A1  B1
1  A2  B2
    C   D
0  C3  D3
1  C4  D4
     A    B    C    D
0   A1   B1  NaN  NaN
1   A2   B2  NaN  NaN
0  NaN  NaN   C3   D3
1  NaN  NaN   C4   D4
    A   B
0  A1  B1
1  A2  B2
    C   D
0  C3  D3
1  C4  D4
    A   B   C   D
0  A1  B1  C3  D3
1  A2  B2  C4  D4
    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C2  D2
     A    B    C    D
0   A0   B0  NaN  NaN
1   A1   B1  NaN  NaN
0  NaN  NaN   C0   D0
1  NaN  NaN   C2   D2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  del sys.path[0]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [36]:

# Using verify_integrity flag in pandas to check if there are duplicates of indices in the result

x = create_df('AB',[0,1])
y = create_df('AB', [0,2])

x.index = y.index
print(x); print(y)
pd.concat([x,y], verify_integrity=True)  # This will raise an error

# We can ignore the index by allowing the concatenate create a new integer index for the series
pd.concat([x,y], ignore_index=True)

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A0  B0
1  A2  B2


Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A2,B2


In [43]:
# Concatenation with join
# Data with various column names

df3 = create_df('ABC', [1,2])
df4 = create_df('BCD', [3,4])
print(pd.concat([df3,df4], join='inner'))

# Using the join_axes argument
pd.concat([df3,df4], join_axes=[df3.columns])



    B   C
0  B1  C1
1  B2  C2
0  B3  C3
1  B4  C4


Unnamed: 0,A,B,C,B.1,C.1,D
0,A1,B1,C1,B3,C3,D3
1,A2,B2,C2,B4,C4,D4


In [45]:
# append method performs same function as the concat
x.append(y, axis = 1)

TypeError: append() got an unexpected keyword argument 'axis'