# Combining Dataset: concat & append

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Defining a function to create a DataFrame

def create_df(cols,ind):
    '''Quickly creating a dataframe'''
    data = {c:[str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data)
create_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [3]:
# Recall: cocncatenation of numpy arrays

x = [1,2,3,4]
y = [3,4,5,6]   # Combining the contents of two or more arrays into a single array
z = [4,5,6,7]

np.concatenate((x,y,z))

array([1, 2, 3, 4, 3, 4, 5, 6, 4, 5, 6, 7])

In [4]:
# Simple concatenation with pd.concat
# pd.concat() can be used concatenate series or dataframe

ser1 = pd.Series(['A','B','C'], index = [1,2,3])
ser2 = pd.Series(['E', 'F', 'G'], index = [4,5,6])
pd.concat([ser1,ser2])

# It also work with DataFrames

df1 = create_df('AB', [1,2])
df2 = create_df('CD',[3,4])

print(df1); print(df2); print(pd.concat([df1,df2]))

# Concatenating along the column

print(df1); print(df2); print(pd.concat([df1,df2], axis =1))


    A   B
0  A1  B1
1  A2  B2
    C   D
0  C3  D3
1  C4  D4
     A    B    C    D
0   A1   B1  NaN  NaN
1   A2   B2  NaN  NaN
0  NaN  NaN   C3   D3
1  NaN  NaN   C4   D4
    A   B
0  A1  B1
1  A2  B2
    C   D
0  C3  D3
1  C4  D4
    A   B   C   D
0  A1  B1  C3  D3
1  A2  B2  C4  D4


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  del sys.path[0]


In [5]:

# Using verify_integrity flag in pandas to check if there are duplicates of indices in the result

x = create_df('AB',[0,1])
y = create_df('AB', [0,2])

x.index = y.index
print(x); print(y)
# pd.concat([x,y], verify_integrity=True)  # This will raise an error

# We can ignore the index by allowing the concatenate create a new integer index for the series
pd.concat([x,y], ignore_index=True)

# Concatenation with join
# Data with various column names

df3 = create_df('ABC', [1,2])
df4 = create_df('BCD', [3,4])
print(pd.concat([df3,df4], join='inner'))

# Using the join_axes argument
pd.concat([df3,df4], join_axes=[df3.columns])



    A   B
0  A0  B0
1  A1  B1
    A   B
0  A0  B0
1  A2  B2
    B   C
0  B1  C1
1  B2  C2
0  B3  C3
1  B4  C4


Unnamed: 0,A,B,C
0,A1,B1,C1
1,A2,B2,C2
0,,B3,C3
1,,B4,C4


# Combining Data set: merge() and join()

In [20]:
# Categories of join
# one-to-one

df1 = pd.DataFrame({'employee':['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake','Sue'],
                    'hire_date':[2004,2012,2006,2014]})
df3 = pd.merge(df1,df2)

# Many-to-one

df4  = pd.DataFrame({'group':['Accounting', 'Engineeroing', 'HR'],
                     'supervisor': ['Carly', 'Guido', 'Steve']})

# Another example on many-to-one
dl1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],
                    'data1': range(7)})
dl2 = pd.DataFrame({'key': ['a','b','d'],
                    'data2': range(3)})
pd.merge(dl1,dl2, on='key')

# Seperating the value of the data if the column names are different
dl3 = pd.DataFrame({'lkey':['b','b','a','c','a','a','b'],
                    'data1': range(7)})
dl4 = pd.DataFrame({'rkey': ['a','b','d'],
                    'data2': range(3)})
merged = pd.merge(dl3,dl4, right_on='rkey', left_on='lkey')
# pd.merge(df3,df4)
# print(df3)
# print(df4)
# print(pd.merge(df3,df4))

Unnamed: 0,lkey,data1,rkey,data2
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
