Concatenation of NumPy Arrays

In [1]:
import numpy as np
# Three plain Python lists; np.concatenate joins them end to end into one 1-D array.
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]
np.concatenate((x, y, z))

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [2]:
# A 2x2 nested list joined with itself along axis=1 (columns are appended).
x = [[1, 2], [3, 4]]
np.concatenate((x, x), axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [3]:
import pandas as pd
# Two Series with non-overlapping integer labels, stacked with pd.concat.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat((ser1, ser2))

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [4]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' yields one column
        per character.
    ind : iterable
        Row labels. It is iterated once per column and then passed to
        pandas as the index.

    Returns
    -------
    pandas.DataFrame
        Frame in which the cell at (row i, column c) is str(c) + str(i),
        e.g. 'A0', 'B1'.
    """
    columns = {}
    for label in cols:
        # Each cell is the column label followed by the row label.
        columns[label] = [f"{label!s}{row!s}" for row in ind]
    return pd.DataFrame(columns, index=ind)

# Demo call: columns A, B, C with row labels 0, 1, 2
make_df(cols='ABC', ind=range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [7]:
# Same columns, disjoint row labels: the default pd.concat stacks the rows.
df1, df2 = make_df('AB', [1, 2]), make_df('AB', [3, 4])
# Show both inputs and the result, separated by blank lines.
print('\n\n'.join(str(frame) for frame in (df1, df2, pd.concat([df1, df2]))))

    A   B
1  A1  B1
2  A2  B2

    A   B
3  A3  B3
4  A4  B4

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [9]:
# Same row labels, different columns: axis=1 places the frames side by side.
df3, df4 = make_df('AB', [0, 1]), make_df('CD', [0, 1])
print('\n\n'.join(map(str, (df3, df4, pd.concat((df3, df4), axis=1)))))

    A   B
0  A0  B0
1  A1  B1

    C   D
0  C0  D0
1  C1  D1

    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


Duplicate indices

One important difference between np.concatenate and pd.concat is that Pandas concatenation preserves indices, even if the result will have duplicate indices! Consider this simple example:

In [11]:
x, y = make_df('AB', [0, 1]), make_df('AB', [2, 3])
# Give y exactly the same row labels as x so the two frames collide on index.
y.index = x.index
# pd.concat keeps the incoming labels, so the result repeats 0 and 1.
print('\n\n'.join(map(str, ("x", x, "y", y, "concat", pd.concat((x, y))))))

x

    A   B
0  A0  B0
1  A1  B1

y

    A   B
0  A2  B2
1  A3  B3

concat

    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [12]:
# ignore_index=True discards the incoming row labels and renumbers the result.
print('\n\n'.join(
    str(part)
    for part in ("x", x, "y", y, "concat", pd.concat((x, y), ignore_index=True))
))

x

    A   B
0  A0  B0
1  A1  B1

y

    A   B
0  A2  B2
1  A3  B3

concat

    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3


Adding MultiIndex keys

In [13]:
# keys=... labels each input frame, producing a two-level row index (key, original label).
print('\n\n'.join(
    str(part)
    for part in ("x", x, "y", y, "concat", pd.concat((x, y), keys=["x", "y"]))
))

x

    A   B
0  A0  B0
1  A1  B1

y

    A   B
0  A2  B2
1  A3  B3

concat

      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


Concatenation with joins

In [15]:
# The frames share columns B and C only; the default concat keeps every column
# and fills the cells a frame does not have with NaN.
df5, df6 = make_df('ABC', [1, 2]), make_df('BCD', [3, 4])
print('\n\n'.join(map(str, ("df5", df5, "df6", df6, "concat", pd.concat((df5, df6))))))

df5

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

df6

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

concat

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [16]:
# join="inner" keeps only the columns present in both frames.
print('\n\n'.join(
    str(part)
    for part in ("df5", df5, "df6", df6, "concat",
                 pd.concat((df5, df6), join="inner"))
))

df5

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

df6

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

concat

    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


In [17]:
# join="outer" keeps the union of the columns, with NaN where a frame lacks one.
print('\n\n'.join(
    str(part)
    for part in ("df5", df5, "df6", df6, "concat",
                 pd.concat((df5, df6), join="outer"))
))

df5

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

df6

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

concat

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


The append() method (deprecated in pandas 1.4 and removed in pandas 2.0; use pd.concat instead)

In [19]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the row-wise append is written with pd.concat, which gives the same frame.
# The printed label keeps the historical spelling so the cell's text output
# is unchanged.
print("df1", df1, "df2", df2, "df1.append(df2)", pd.concat([df1, df2]), sep='\n\n')

df1

    A   B
1  A1  B1
2  A2  B2

df2

    A   B
3  A3  B3
4  A4  B4

df1.append(df2)

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [21]:
# Load the first CSV file into a DataFrame.
df = pd.read_csv('1.csv')

In [22]:
# (rows, columns) of the frame read from '1.csv'
df.shape

(7, 4)

In [23]:
# Preview the first five rows (5 is also head()'s default).
df.head(n=5)


Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [24]:
# Load the second CSV file.
# NOTE: this rebinds df1, which earlier in the notebook held a make_df() demo frame.
df1 = pd.read_csv('2.csv')

In [25]:
# Preview the first five rows (5 is also head()'s default).
df1.head(n=5)

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Female,23,18,94
1,Male,64,19,3
2,Female,30,19,72
3,Male,67,19,14
4,Female,35,19,99


In [26]:
# Stack df1's rows below df's, keeping each frame's own row labels (so labels
# repeat). pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
pd.concat([df, df1])

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40
5,Female,22,17,76
6,Female,35,18,6
0,Female,23,18,94
1,Male,64,19,3
2,Female,30,19,72


In [27]:
# Same row-wise stack, but ignore_index=True renumbers the rows 0..n-1 instead
# of keeping the duplicated labels. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
pd.concat([df, df1], ignore_index=True)

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40
5,Female,22,17,76
6,Female,35,18,6
7,Female,23,18,94
8,Male,64,19,3
9,Female,30,19,72


In [28]:
# Column labels of df1 (DataFrame.keys() returns this same Index).
df1.columns

Index(['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)'], dtype='object')

In [29]:
# Load the third CSV file.
# NOTE: this rebinds df3, which earlier in the notebook held a make_df() demo frame.
df3 = pd.read_csv('3.csv')

In [30]:
# (rows, columns) of the frame read from '3.csv'
df3.shape

(10, 4)

In [31]:
# Preview the first five rows (5 is also head()'s default).
df3.head(n=5)

Unnamed: 0,Gender,Position,Level,Salary
0,Female,Business Analyst,1,45000
1,Male,Junior Consultant,2,50000
2,Female,Senior Consultant,3,60000
3,Male,Manager,4,80000
4,Female,Country Manager,5,110000


In [32]:
# Stack two frames whose columns only partly overlap: the result carries the
# union of the columns, with NaN where a frame has no such column.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([df1, df3])

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Position,Level,Salary
0,Female,23.0,18.0,94.0,,,
1,Male,64.0,19.0,3.0,,,
2,Female,30.0,19.0,72.0,,,
3,Male,67.0,19.0,14.0,,,
4,Female,35.0,19.0,99.0,,,
5,Female,58.0,20.0,15.0,,,
6,Female,24.0,20.0,77.0,,,
7,Male,37.0,20.0,13.0,,,
0,Female,,,,Business Analyst,1.0,45000.0
1,Male,,,,Junior Consultant,2.0,50000.0


In [33]:
# Same stack in the opposite order: df3's rows and columns now come first.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([df3, df1])

Unnamed: 0,Gender,Position,Level,Salary,Age,Annual Income (k$),Spending Score (1-100)
0,Female,Business Analyst,1.0,45000.0,,,
1,Male,Junior Consultant,2.0,50000.0,,,
2,Female,Senior Consultant,3.0,60000.0,,,
3,Male,Manager,4.0,80000.0,,,
4,Female,Country Manager,5.0,110000.0,,,
5,Female,Region Manager,6.0,150000.0,,,
6,Female,Partner,7.0,200000.0,,,
7,Male,Senior Partner,8.0,300000.0,,,
8,Female,C-level,9.0,500000.0,,,
9,Male,CEO,10.0,1000000.0,,,


In [34]:
# sort=True orders the combined columns alphabetically; fillna then replaces
# every missing cell with the placeholder text.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([df1, df3], sort=True).fillna("no value")

Unnamed: 0,Age,Annual Income (k$),Gender,Level,Position,Salary,Spending Score (1-100)
0,23.0,18.0,Female,no value,no value,no value,94.0
1,64.0,19.0,Male,no value,no value,no value,3.0
2,30.0,19.0,Female,no value,no value,no value,72.0
3,67.0,19.0,Male,no value,no value,no value,14.0
4,35.0,19.0,Female,no value,no value,no value,99.0
5,58.0,20.0,Female,no value,no value,no value,15.0
6,24.0,20.0,Female,no value,no value,no value,77.0
7,37.0,20.0,Male,no value,no value,no value,13.0
0,no value,no value,Female,1.0,Business Analyst,45000.0,no value
1,no value,no value,Male,2.0,Junior Consultant,50000.0,no value


In [35]:
# Combine several frames in one call and renumber the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# df.append([df, df1, df3]) is equivalent to concatenating [df, df, df1, df3].
# NOTE(review): df therefore appears twice in the result; confirm the
# duplication is intended rather than a slip for [df, df1, df3].
Newdf = pd.concat([df, df, df1, df3], ignore_index=True)
Newdf.shape

(32, 7)

Combining DataFrames using concat()

In [36]:
# Row-wise stack of the two frames with a fresh 0..n-1 index.
pd.concat((df, df1), ignore_index=True)

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40
5,Female,22,17,76
6,Female,35,18,6
7,Female,23,18,94
8,Male,64,19,3
9,Female,30,19,72


In [37]:
# Frames with partly different columns: the result holds the union of the
# columns, with NaN where a frame has no such column.
pd.concat(objs=(df, df3))

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Position,Level,Salary
0,Male,19.0,15.0,39.0,,,
1,Male,21.0,15.0,81.0,,,
2,Female,20.0,16.0,6.0,,,
3,Female,23.0,16.0,77.0,,,
4,Female,31.0,17.0,40.0,,,
5,Female,22.0,17.0,76.0,,,
6,Female,35.0,18.0,6.0,,,
0,Female,,,,Business Analyst,1.0,45000.0
1,Male,,,,Junior Consultant,2.0,50000.0
2,Female,,,,Senior Consultant,3.0,60000.0


In [40]:
# Tag the rows of each source frame with a key, giving a two-level row index.
concatdf = pd.concat((df, df3), keys=["xls 1", "xls 2"])
concatdf

Unnamed: 0,Unnamed: 1,Gender,Age,Annual Income (k$),Spending Score (1-100),Position,Level,Salary
xls 1,0,Male,19.0,15.0,39.0,,,
xls 1,1,Male,21.0,15.0,81.0,,,
xls 1,2,Female,20.0,16.0,6.0,,,
xls 1,3,Female,23.0,16.0,77.0,,,
xls 1,4,Female,31.0,17.0,40.0,,,
xls 1,5,Female,22.0,17.0,76.0,,,
xls 1,6,Female,35.0,18.0,6.0,,,
xls 2,0,Female,,,,Business Analyst,1.0,45000.0
xls 2,1,Male,,,,Junior Consultant,2.0,50000.0
xls 2,2,Female,,,,Senior Consultant,3.0,60000.0


In [41]:
# Select every row stored under the outer key "xls 1"; the outer level is dropped.
concatdf.xs("xls 1")

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Position,Level,Salary
0,Male,19.0,15.0,39.0,,,
1,Male,21.0,15.0,81.0,,,
2,Female,20.0,16.0,6.0,,,
3,Female,23.0,16.0,77.0,,,
4,Female,31.0,17.0,40.0,,,
5,Female,22.0,17.0,76.0,,,
6,Female,35.0,18.0,6.0,,,


In [42]:
# With axis=1 the frames sit side by side and the keys label the column groups.
# NOTE: this rebinds concatdf, replacing the row-keyed frame built earlier.
concatdf = pd.concat((df, df3), axis=1, keys=["xls 1", "xls 2"])
concatdf

Unnamed: 0_level_0,xls 1,xls 1,xls 1,xls 1,xls 2,xls 2,xls 2,xls 2
Unnamed: 0_level_1,Gender,Age,Annual Income (k$),Spending Score (1-100),Gender,Position,Level,Salary
0,Male,19.0,15.0,39.0,Female,Business Analyst,1,45000
1,Male,21.0,15.0,81.0,Male,Junior Consultant,2,50000
2,Female,20.0,16.0,6.0,Female,Senior Consultant,3,60000
3,Female,23.0,16.0,77.0,Male,Manager,4,80000
4,Female,31.0,17.0,40.0,Female,Country Manager,5,110000
5,Female,22.0,17.0,76.0,Female,Region Manager,6,150000
6,Female,35.0,18.0,6.0,Female,Partner,7,200000
7,,,,,Male,Senior Partner,8,300000
8,,,,,Female,C-level,9,500000
9,,,,,Male,CEO,10,1000000


In [44]:
# Inner join on columns: only the columns present in both frames are kept.
pd.concat((df, df3), join="inner")

Unnamed: 0,Gender
0,Male
1,Male
2,Female
3,Female
4,Female
5,Female
6,Female
0,Female
1,Male
2,Female


In [45]:
# Same inner join with the inputs swapped, so df3's rows come first.
pd.concat((df3, df), join="inner")

Unnamed: 0,Gender
0,Female
1,Male
2,Female
3,Male
4,Female
5,Female
6,Female
7,Male
8,Female
9,Male


In [46]:
# Outer join on columns: every column from either frame is kept, and cells a
# frame does not have are filled with NaN.
pd.concat((df, df3), join="outer")

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100),Position,Level,Salary
0,Male,19.0,15.0,39.0,,,
1,Male,21.0,15.0,81.0,,,
2,Female,20.0,16.0,6.0,,,
3,Female,23.0,16.0,77.0,,,
4,Female,31.0,17.0,40.0,,,
5,Female,22.0,17.0,76.0,,,
6,Female,35.0,18.0,6.0,,,
0,Female,,,,Business Analyst,1.0,45000.0
1,Male,,,,Junior Consultant,2.0,50000.0
2,Female,,,,Senior Consultant,3.0,60000.0
