# Pandas concat tricks

This is a notebook for the Medium article [Pandas concat() tricks you should know to speed up your data analysis](https://towardsdatascience.com/pandas-concat-tricks-you-should-know-to-speed-up-your-data-analysis-cd3d4fdfe6dd).

Please check out the article for step-by-step instructions.

**License**: [BSD 2-Clause](https://opensource.org/licenses/BSD-2-Clause)

In [1]:
import pandas as pd

In [2]:
# First sample frame: one row per student (A-D), one column per subject score.
df1 = pd.DataFrame(
    dict(
        name=list('ABCD'),
        math=[60, 89, 82, 70],
        physics=[66, 95, 83, 66],
        chemistry=[61, 91, 77, 70],
    )
)
df1

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70


In [3]:
# Second sample frame: same columns as df1, different students (E-H).
df2 = pd.DataFrame(
    data={
        'name': list('EFGH'),
        'math': [66, 95, 83, 66],
        'physics': [60, 89, 82, 70],
        'chemistry': [90, 81, 78, 90],
    },
)
df2

Unnamed: 0,name,math,physics,chemistry
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


## 1. Dealing with index and axis

In [4]:
# Default concat: stack rows (axis=0) and keep each frame's original index,
# so the labels 0-3 appear twice in the result.
pd.concat(objs=[df1, df2])

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


In [5]:
# ignore_index=True discards the original labels and renumbers rows 0..n-1.
pd.concat(
    objs=[df1, df2],
    ignore_index=True,
)

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70
4,E,66,60,90
5,F,95,89,81
6,G,83,82,78
7,H,66,70,90


In [6]:
# axis=1 places the frames side by side, aligning rows on the index.
pd.concat(
    objs=[df1, df2],
    axis=1,
)

Unnamed: 0,name,math,physics,chemistry,name.1,math.1,physics.1,chemistry.1
0,A,60,66,61,E,66,60,90
1,B,89,95,91,F,95,89,81
2,C,82,83,77,G,83,82,78
3,D,70,66,70,H,66,70,90


## 2. Avoiding duplicate indices

In [7]:
# verify_integrity=True makes concat raise instead of silently producing
# duplicate index labels; df1 and df2 both use 0-3, so this is expected to fail.
try:
    pd.concat(objs=[df1, df2], verify_integrity=True)
except ValueError as err:
    print('ValueError:', err)

ValueError: Indexes have overlapping values: Int64Index([0, 1, 2, 3], dtype='int64')


## 3. Adding a hierarchical index with keys and names options

In [8]:
# keys= adds an outer index level that tags each row with its source frame.
res = pd.concat(
    objs=[df1, df2],
    keys=['Year 1', 'Year 2'],
)
res

Unnamed: 0,Unnamed: 1,name,math,physics,chemistry
Year 1,0,A,60,66,61
Year 1,1,B,89,95,91
Year 1,2,C,82,83,77
Year 1,3,D,70,66,70
Year 2,0,E,66,60,90
Year 2,1,F,95,89,81
Year 2,2,G,83,82,78
Year 2,3,H,66,70,90


In [9]:
# Cross-section on the outer index level: the rows that came from 'Year 1'.
res.xs('Year 1')

Unnamed: 0,name,math,physics,chemistry
0,A,60,66,61
1,B,89,95,91
2,C,82,83,77
3,D,70,66,70


In [10]:
# names= labels the index levels: the outer (key) level becomes 'Class',
# the inner (original row label) level stays unnamed.
pd.concat(objs=[df1, df2], keys=['Year 1', 'Year 2'], names=['Class', None])

Unnamed: 0_level_0,Unnamed: 1_level_0,name,math,physics,chemistry
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Year 1,0,A,60,66,61
Year 1,1,B,89,95,91
Year 1,2,C,82,83,77
Year 1,3,D,70,66,70
Year 2,0,E,66,60,90
Year 2,1,F,95,89,81
Year 2,2,G,83,82,78
Year 2,3,H,66,70,90


In [11]:
# Move the outer index level (position 0, named 'Class') into a regular column.
pd.concat(
    objs=[df1, df2],
    names=['Class', None],
    keys=['Year 1', 'Year 2'],
).reset_index(level=0)

Unnamed: 0,Class,name,math,physics,chemistry
0,Year 1,A,60,66,61
1,Year 1,B,89,95,91
2,Year 1,C,82,83,77
3,Year 1,D,70,66,70
0,Year 2,E,66,60,90
1,Year 2,F,95,89,81
2,Year 2,G,83,82,78
3,Year 2,H,66,70,90


In [12]:
# Same as resetting level 0, but the level is selected by its name.
pd.concat(
    objs=[df1, df2],
    names=['Class', None],
    keys=['Year 1', 'Year 2'],
).reset_index(level='Class')

Unnamed: 0,Class,name,math,physics,chemistry
0,Year 1,A,60,66,61
1,Year 1,B,89,95,91
2,Year 1,C,82,83,77
3,Year 1,D,70,66,70
0,Year 2,E,66,60,90
1,Year 2,F,95,89,81
2,Year 2,G,83,82,78
3,Year 2,H,66,70,90


## 4. Columns matching and sorting

In [13]:
# Redefine df1 with the subject columns in a different order than df2
# (chemistry, physics, math) to demonstrate column matching.
df1 = pd.DataFrame(
    dict(
        zip(
            ['name', 'chemistry', 'physics', 'math'],
            [
                ['A', 'B', 'C', 'D'],
                [61, 91, 77, 70],
                [66, 95, 83, 66],
                [60, 89, 82, 70],
            ],
        )
    )
)
df1

Unnamed: 0,name,chemistry,physics,math
0,A,61,66,60
1,B,91,95,89
2,C,77,83,82
3,D,70,66,70


In [14]:
# df2 keeps the column order name, math, physics, chemistry.
df2 = pd.DataFrame.from_dict(
    {
        'name': list('EFGH'),
        'math': [66, 95, 83, 66],
        'physics': [60, 89, 82, 70],
        'chemistry': [90, 81, 78, 90],
    }
)
df2

Unnamed: 0,name,math,physics,chemistry
0,E,66,60,90
1,F,95,89,81
2,G,83,82,78
3,H,66,70,90


In [15]:
# Columns are matched by name, not position; the result uses df1's column order.
pd.concat((df1, df2))

Unnamed: 0,name,chemistry,physics,math
0,A,61,66,60
1,B,91,95,89
2,C,77,83,82
3,D,70,66,70
0,E,90,60,66
1,F,81,89,95
2,G,78,82,83
3,H,90,70,66


In [16]:
# sort=True orders the result's columns alphabetically by name.
pd.concat(
    objs=[df1, df2],
    sort=True,
)

Unnamed: 0,chemistry,math,name,physics
0,61,60,A,66
1,91,89,B,95
2,77,82,C,83
3,70,70,D,66
0,90,66,E,60
1,81,95,F,89
2,78,83,G,82
3,90,66,H,70


In [17]:
# Custom column order: concatenate first, then select the columns
# in the desired sequence.
res = pd.concat(objs=[df1, df2])
custom_sort = ['math', 'chemistry', 'physics', 'name']
res.loc[:, custom_sort]

Unnamed: 0,math,chemistry,physics,name
0,60,61,66,A
1,89,91,95,B
2,82,77,83,C
3,70,70,66,D
0,66,90,60,E
1,95,81,89,F
2,83,78,82,G
3,66,90,70,H


## 5. Loading and concatenating datasets from a bunch of CSV files

In [18]:
# Bad (kept on purpose as the anti-example): the result is grown one file at a
# time, so every iteration builds a brand-new frame from everything read so
# far plus the next file.
import pathlib2 as pl2
ps = pl2.Path('data/sp3')

res = None
for p in ps.glob('*.csv'):
    frame = pd.read_csv(p)
    # First file seeds the result; every later file is appended via concat.
    res = frame if res is None else pd.concat([res, frame])
res

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,0.98,625475.10,93034.20,246747.13,19103.69,266590.08,38620.02,227884.21,85.85,conventional,2015,Denver
1,1,2015-12-20,1.05,528944.54,113403.55,188263.20,24477.83,202799.96,34993.02,167806.94,0.00,conventional,2015,Denver
2,2,2015-12-13,0.83,741702.50,96222.34,321764.32,29349.34,294366.50,34244.40,260040.92,81.18,conventional,2015,Denver
3,3,2015-12-06,0.76,838225.19,111259.34,378124.28,8807.96,340033.61,35086.04,304844.07,103.50,conventional,2015,Denver
4,4,2015-11-29,1.12,429109.64,85241.45,179982.42,14146.28,149739.49,31920.41,117819.08,0.00,conventional,2015,Denver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,7,2018-02-04,1.41,283378.47,22474.66,55360.49,133.41,205409.91,70232.59,134666.91,510.41,organic,2018,West
334,8,2018-01-28,1.80,185974.53,22918.40,33051.14,93.52,129911.47,77822.23,51986.86,102.38,organic,2018,West
335,9,2018-01-21,1.83,189317.99,27049.44,33561.32,439.47,128267.76,76091.99,51947.50,228.27,organic,2018,West
336,10,2018-01-14,1.82,207999.67,33869.12,47435.14,433.52,126261.89,89115.78,37133.99,12.12,organic,2018,West


In [20]:
# Good: read every CSV lazily and concatenate them in a single call.
# The standard-library pathlib is used here; the pathlib2 backport is not
# needed on Python 3.
import pathlib

ps = pathlib.Path('data/sp3')

# Path.glob() yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the row order of the result reproducible.
# The generator reads each file only when pd.concat asks for it.
dfs = (
    pd.read_csv(p, encoding='utf8') for p in sorted(ps.glob('*.csv'))
)

# Note: pd.concat raises ValueError if the directory contains no CSV files.
res = pd.concat(dfs)
res

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,0.98,625475.10,93034.20,246747.13,19103.69,266590.08,38620.02,227884.21,85.85,conventional,2015,Denver
1,1,2015-12-20,1.05,528944.54,113403.55,188263.20,24477.83,202799.96,34993.02,167806.94,0.00,conventional,2015,Denver
2,2,2015-12-13,0.83,741702.50,96222.34,321764.32,29349.34,294366.50,34244.40,260040.92,81.18,conventional,2015,Denver
3,3,2015-12-06,0.76,838225.19,111259.34,378124.28,8807.96,340033.61,35086.04,304844.07,103.50,conventional,2015,Denver
4,4,2015-11-29,1.12,429109.64,85241.45,179982.42,14146.28,149739.49,31920.41,117819.08,0.00,conventional,2015,Denver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,7,2018-02-04,1.41,283378.47,22474.66,55360.49,133.41,205409.91,70232.59,134666.91,510.41,organic,2018,West
334,8,2018-01-28,1.80,185974.53,22918.40,33051.14,93.52,129911.47,77822.23,51986.86,102.38,organic,2018,West
335,9,2018-01-21,1.83,189317.99,27049.44,33561.32,439.47,128267.76,76091.99,51947.50,228.27,organic,2018,West
336,10,2018-01-14,1.82,207999.67,33869.12,47435.14,433.52,126261.89,89115.78,37133.99,12.12,organic,2018,West
