# Chapter 6: pandas in Depth: Data Manipulation

<div id="toc"></div>

## 6.1 Data Preparation

### Merging

In [1]:
import numpy as np
import pandas as pd

In [2]:
frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                        'price': [12.33,11.44,33.21,13.23,33.62]})
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [3]:
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],
'color': ['white','red','red','black']})
frame2

Unnamed: 0,color,id
0,white,pencil
1,red,pencil
2,red,ball
3,black,pen


In [4]:
pd.merge(frame1,frame2)

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [5]:
frame1 = pd.DataFrame( {'id':['ball','pencil','pen','mug','ashtray'],
                        'color': ['white','red','red','black','green'],
                        'brand': ['OMG','ABC','ABC','POD','POD']})
frame1

Unnamed: 0,brand,color,id
0,OMG,white,ball
1,ABC,red,pencil
2,ABC,red,pen
3,POD,black,mug
4,POD,green,ashtray


In [6]:
frame2 = pd.DataFrame( {'id':['pencil','pencil','ball','pen'],
                        'brand': ['OMG','POD','ABC','POD']})
frame2

Unnamed: 0,brand,id
0,OMG,pencil
1,POD,pencil
2,ABC,ball
3,POD,pen


In [7]:
pd.merge(frame1,frame2)

Unnamed: 0,brand,color,id


In [8]:
pd.merge(frame1,frame2,on='id')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD


In [9]:
pd.merge(frame1,frame2,on='brand')
frame2.columns = ['brand','sid']
frame2

Unnamed: 0,brand,sid
0,OMG,pencil
1,POD,pencil
2,ABC,ball
3,POD,pen


In [10]:
pd.merge(frame1, frame2, left_on='id', right_on='sid')

Unnamed: 0,brand_x,color,id,brand_y,sid
0,OMG,white,ball,ABC,ball
1,ABC,red,pencil,OMG,pencil
2,ABC,red,pencil,POD,pencil
3,ABC,red,pen,POD,pen


In [11]:
frame2.columns = ['brand','id']

In [12]:
pd.merge(frame1,frame2,on='id')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD


In [13]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD
4,POD,black,mug,
5,POD,green,ashtray,


In [14]:
pd.merge(frame1,frame2,on='id',how='left')

Unnamed: 0,brand_x,color,id,brand_y
0,OMG,white,ball,ABC
1,ABC,red,pencil,OMG
2,ABC,red,pencil,POD
3,ABC,red,pen,POD
4,POD,black,mug,
5,POD,green,ashtray,


In [15]:
pd.merge(frame1,frame2,on=['id','brand'],how='outer')

Unnamed: 0,brand,color,id
0,OMG,white,ball
1,ABC,red,pencil
2,ABC,red,pen
3,POD,black,mug
4,POD,green,ashtray
5,OMG,,pencil
6,POD,,pencil
7,ABC,,ball
8,POD,,pen


* Merging on Index

In [16]:
pd.merge(frame1,frame2,right_index=True, left_index=True)

Unnamed: 0,brand_x,color,id_x,brand_y,id_y
0,OMG,white,ball,OMG,pencil
1,ABC,red,pencil,POD,pencil
2,ABC,red,pen,ABC,ball
3,POD,black,mug,POD,pen


In [None]:
# frame1 and frame2 both carry 'brand' and 'id' columns at this point, so
# join() raises ValueError; catch it and show the message so Run All continues.
try:
    frame1.join(frame2)
except ValueError as err:
    print('ValueError:', err)

In [18]:
frame2.columns = ['brand2','id2']
frame1.join(frame2)

Unnamed: 0,brand,color,id,brand2,id2
0,OMG,white,ball,OMG,pencil
1,ABC,red,pencil,POD,pencil
2,ABC,red,pen,ABC,ball
3,POD,black,mug,POD,pen
4,POD,green,ashtray,,


## 6.2 Concatenating

In [24]:
# 3x3 grid holding 0..8 in row-major order
array1 = np.arange(9).reshape(3, 3)

In [25]:
# 3x3 grid holding 6..14 in row-major order
array2 = np.arange(6, 15).reshape(3, 3)
array2

array([[ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [26]:
np.concatenate([array1,array2],axis=1)

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11],
       [ 6,  7,  8, 12, 13, 14]])

In [27]:
np.concatenate([array1,array2],axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [28]:
ser1 = pd.Series(np.random.rand(4), index=[1,2,3,4])
ser1

1    0.910493
2    0.298214
3    0.318329
4    0.884459
dtype: float64

In [29]:
ser2 = pd.Series(np.random.rand(4), index=[5,6,7,8])
ser2

5    0.895608
6    0.423411
7    0.103493
8    0.696658
dtype: float64

In [30]:
pd.concat([ser1,ser2])

1    0.910493
2    0.298214
3    0.318329
4    0.884459
5    0.895608
6    0.423411
7    0.103493
8    0.696658
dtype: float64

In [31]:
pd.concat([ser1,ser2],axis=1)

Unnamed: 0,0,1
1,0.910493,
2,0.298214,
3,0.318329,
4,0.884459,
5,,0.895608
6,,0.423411
7,,0.103493
8,,0.696658


In [33]:
pd.concat([ser1,ser2],axis=1,join='inner')

Unnamed: 0,0,1


In [34]:
pd.concat([ser1,ser2], keys=[1,2])

1  1    0.910493
   2    0.298214
   3    0.318329
   4    0.884459
2  5    0.895608
   6    0.423411
   7    0.103493
   8    0.696658
dtype: float64

In [35]:
pd.concat([ser1,ser2], axis=1, keys=[1,2])

Unnamed: 0,1,2
1,0.910493,
2,0.298214,
3,0.318329,
4,0.884459,
5,,0.895608
6,,0.423411
7,,0.103493
8,,0.696658


In [36]:
frame1 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[1,2,3], columns=['A','B','C'])
frame2 = pd.DataFrame(np.random.rand(9).reshape(3,3), index=[4,5,6], columns=['A','B','C'])
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.47594,0.447771,0.37267
2,0.349441,0.83173,0.577195
3,0.559221,0.541314,0.877827
4,0.248061,0.890715,0.259952
5,0.687247,0.995097,0.988769
6,0.513229,0.056054,0.886335


In [37]:
pd.concat([frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.47594,0.447771,0.37267,,,
2,0.349441,0.83173,0.577195,,,
3,0.559221,0.541314,0.877827,,,
4,,,,0.248061,0.890715,0.259952
5,,,,0.687247,0.995097,0.988769
6,,,,0.513229,0.056054,0.886335


### Combining

In [38]:
ser1 = pd.Series(np.random.rand(5),index=[1,2,3,4,5])
ser1

1    0.891562
2    0.632302
3    0.881810
4    0.278750
5    0.729209
dtype: float64

In [39]:
ser2 = pd.Series(np.random.rand(4),index=[2,4,5,6])
ser2

2    0.920502
4    0.723257
5    0.138429
6    0.628079
dtype: float64

In [40]:
ser1.combine_first(ser2)

1    0.891562
2    0.632302
3    0.881810
4    0.278750
5    0.729209
6    0.628079
dtype: float64

In [41]:
ser2.combine_first(ser1)

1    0.891562
2    0.920502
3    0.881810
4    0.723257
5    0.138429
6    0.628079
dtype: float64

In [None]:
ser1[:3].combine_first(ser2[:3])

### Pivoting

* Pivoting with Hierarchical Indexing

In [43]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                      index=['white','black','red'],
                      columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [44]:
# Keep the stacked Series: the cells below call ser5.unstack().
ser5 = frame1.stack()
ser5

white  ball      0
       pen       1
       pencil    2
black  ball      3
       pen       4
       pencil    5
red    ball      6
       pen       7
       pencil    8
dtype: int32

In [None]:
ser5.unstack()

In [None]:
ser5.unstack(0)

* Pivoting from “Long” to “Wide” Format

In [46]:
longframe = pd.DataFrame({ 'color':['white','white','white',
                                    'red','red','red',
                                    'black','black','black'],
                          'item':['ball','pen','mug',
                                  'ball','pen','mug',
                                  'ball','pen','mug'],
                          'value': np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.692423
1,white,pen,0.122552
2,white,mug,0.685354
3,red,ball,0.013963
4,red,pen,0.544619
5,red,mug,0.897542
6,black,ball,0.859759
7,black,pen,0.00958
8,black,mug,0.702358


### Removing

In [47]:
# Name the arguments explicitly: recent pandas releases no longer accept
# index/columns positionally in DataFrame.pivot().
wideframe = longframe.pivot(index='color', columns='item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.859759,0.702358,0.00958
red,0.013963,0.897542,0.544619
white,0.692423,0.685354,0.122552


In [48]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                      index=['white','black','red'],
                      columns=['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [49]:
del frame1['ball']
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [None]:
frame1.drop('white')

## 6.3 Data Transformation

### Removing Duplicates

In [50]:
dframe = pd.DataFrame({ 'color': ['white','white','red','red','white'], 
                       'value': [2,1,3,3,2]})
dframe

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [52]:
dframe.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [51]:
dframe[dframe.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


In [53]:
dframe[dframe.duplicated()]

Unnamed: 0,color,value
3,red,3
4,white,2


### Mapping

In [None]:
# General shape of a mapping: each key (old label) points to its new value.
# Named `mapping` rather than `map` so the builtin map() is not shadowed.
mapping = {
    'label1': 'value1',
    'label2': 'value2',
    # ... further 'label': 'value' pairs
}

* replace(): replaces values
* map(): creates a new column
* rename(): replaces the index values

* Replacing Values via Mapping

In [54]:
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','rosso','verde','black','yellow'],
                      'price':[5.56,4.20,1.30,0.56,2.75]})
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [55]:
newcolors = {
    'rosso': 'red',
    'verde': 'green'
}

In [56]:
frame.replace(newcolors)

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [57]:
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
ser

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [58]:
ser.replace(np.nan,0)

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

* Adding Values via Mapping

In [59]:
frame = pd.DataFrame({ 'item':['ball','mug','pen','pencil','ashtray'],
                      'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,color,item
0,white,ball
1,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


In [60]:
price = {
    'ball' : 5.56,
    'mug' : 4.20,
    'bottle' : 1.30,
    'scissors' : 3.41,
    'pen' : 1.30,
    'pencil' : 0.56,
    'ashtray' : 2.75
}

In [None]:
# Look up each item's price in the `price` dict defined in the previous cell;
# items missing from the dict would get NaN.
frame['price'] = frame['item'].map(price)
frame

* Rename the Indexes of the Axes

In [62]:
frame

Unnamed: 0,color,item
0,white,ball
1,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


In [63]:
reindex = {
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth'}
frame.rename(reindex)

Unnamed: 0,color,item
first,white,ball
second,red,mug
third,green,pen
fourth,black,pencil
fifth,yellow,ashtray


In [64]:
recolumn = {
    'item':'object',
    'price': 'value'}
frame.rename(index=reindex, columns=recolumn)

Unnamed: 0,color,object
first,white,ball
second,red,mug
third,green,pen
fourth,black,pencil
fifth,yellow,ashtray


In [65]:
frame.rename(index={1:'first'}, columns={'item':'object'})

Unnamed: 0,color,object
0,white,ball
first,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


In [66]:
frame.rename(columns={'item':'object'}, inplace=True)
frame

Unnamed: 0,color,object
0,white,ball
1,red,mug
2,green,pen
3,black,pencil
4,yellow,ashtray


## 6.4 Discretization and Binning

In [67]:
results = [12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]

In [68]:
bins = [0,25,50,75,100]

In [69]:
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, object): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [None]:
# The bin intervals are exposed as .categories (the old .levels attribute is gone).
cat.categories

In [71]:
# Integer bin index of every element; .codes replaces the deprecated .labels.
cat.codes

  if __name__ == '__main__':


array([0, 1, 2, 2, 1, 3, 3, 0, 0, 2, 2, 1, 3, 0, 1, 3, 3], dtype=int8)

In [72]:
# Occurrences per bin; the top-level pd.value_counts() helper is deprecated.
pd.Series(cat).value_counts()

(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [74]:
bin_names = ['unlikely','less likely','likely','highly likely']

In [75]:
pd.cut(results, bins, labels=bin_names)

[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less likely < likely < highly likely]

In [76]:
 pd.cut(results, 5)

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99], (22.2, 41.4], (41.4, 60.6], (79.8, 99], (79.8, 99]]
Length: 17
Categories (5, object): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] < (79.8, 99]]

In [77]:
quintiles = pd.qcut(results, 5)
quintiles

[[3, 24], (24, 46], (62.6, 87], (46, 62.6], (24, 46], ..., (62.6, 87], [3, 24], (46, 62.6], (87, 99], (62.6, 87]]
Length: 17
Categories (5, object): [[3, 24] < (24, 46] < (46, 62.6] < (62.6, 87] < (87, 99]]

In [78]:
 pd.value_counts(quintiles)

(62.6, 87]    4
[3, 24]       4
(87, 99]      3
(46, 62.6]    3
(24, 46]      3
dtype: int64

### Detecting and Filtering Outliers

In [79]:
randframe = pd.DataFrame(np.random.randn(1000,3))

In [80]:
randframe.describe()

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,-0.021353,0.034104,0.041467
std,1.01294,1.006746,1.015152
min,-3.250735,-3.426949,-3.768719
25%,-0.692757,-0.672788,-0.661038
50%,-0.006635,0.036966,0.07184
75%,0.661155,0.723325,0.696483
max,3.205903,3.432151,3.725687


In [81]:
randframe.std()

0    1.012940
1    1.006746
2    1.015152
dtype: float64

In [82]:
# Rows where at least one column lies more than 3 standard deviations from zero.
# axis must be passed by keyword: recent pandas rejects a positional axis in any().
randframe[(np.abs(randframe) > (3*randframe.std())).any(axis=1)]

Unnamed: 0,0,1,2
9,-1.043426,1.53199,3.080348
18,0.077319,-1.846109,3.046813
51,3.205903,0.054044,1.437197
157,-0.914126,0.982495,-3.476512
182,0.723537,-3.426949,-1.413697
210,-1.613691,3.432151,-0.725403
541,0.729019,-1.153548,3.725687
546,-3.250735,-0.128765,0.662599
604,-0.326745,3.104579,0.962763
745,1.336574,3.082801,0.920802


## 6.5 Permutation

In [83]:
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
nframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [84]:
new_order = np.random.permutation(5)
new_order

array([3, 2, 1, 0, 4])

In [87]:
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


In [86]:
new_order = [3,4,2]
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


* Random Sampling

In [88]:
sample = np.random.randint(0, len(nframe), size=3)
sample

array([3, 0, 3])

In [89]:
nframe.take(sample)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
0,0,1,2,3,4
3,15,16,17,18,19


## 6.6 String Manipulation

### Built-in Methods for Manipulation of Strings

In [90]:
text = '16 Bolton Avenue , Boston'
text.split(',')

['16 Bolton Avenue ', ' Boston']

In [91]:
tokens = [s.strip() for s in text.split(',')]
tokens

['16 Bolton Avenue', 'Boston']

In [92]:
address, city = [s.strip() for s in text.split(',')]
address

'16 Bolton Avenue'

In [93]:
city

'Boston'

In [94]:
address + ',' + city

'16 Bolton Avenue,Boston'

In [95]:
strings = ['A+','A','A-','B','BB','BBB','C+']

In [96]:
';'.join(strings)

'A+;A;A-;B;BB;BBB;C+'

In [97]:
'Boston' in text

True

In [98]:
text.index('Boston')

19

In [99]:
text.find('Boston')

19

In [100]:
# index() raises ValueError when the substring is absent; catch it and show
# the message so the notebook keeps running.
try:
    text.index('New York')
except ValueError as err:
    print('ValueError:', err)

ValueError: substring not found

In [101]:
text.find('New York')

-1

In [102]:
text.count('e')

2

In [103]:
text.count('Avenue')

1

In [104]:
text.replace('Avenue','Street')

'16 Bolton Street , Boston'

In [105]:
text.replace('1','')

'6 Bolton Avenue , Boston'

### Regular Expressions

* pattern matching  
* substitution  
* splitting  

In [106]:
import re

In [107]:
text = "This is an\t odd \n text!"

In [108]:
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
re.split(r'\s+', text)

['This', 'is', 'an', 'odd', 'text!']

In [109]:
# Pre-compiled whitespace splitter; raw string avoids the invalid '\s' escape.
regex = re.compile(r'\s+')

In [110]:
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [111]:
text = 'This is my address: 16 Bolton Avenue, Boston'
# Words starting with an uppercase 'A'; raw string avoids the invalid '\w' escape.
re.findall(r'A\w+', text)

['Avenue']

In [112]:
# Words starting with 'A' or 'a'. A character class needs no separator:
# '[A,a]' would also match a literal comma.
re.findall(r'[Aa]\w+', text)

['address', 'Avenue']

In [113]:
# First word starting with 'A' or 'a' ('[A,a]' would also match a comma).
re.search(r'[Aa]\w+', text)

<_sre.SRE_Match object; span=(11, 18), match='address'>

In [114]:
# Keep the match object so its start/end offsets can be inspected.
# Raw pattern, and no comma inside the character class.
search = re.search(r'[Aa]\w+', text)
search.start()

11

In [115]:
search.end()

18

In [116]:
text[search.start():search.end()]

'address'

In [117]:
# match() only tests the beginning of the string, so this returns None here.
re.match(r'[Aa]\w+', text)

In [118]:
# The text begins with 'This', so match() succeeds; raw string avoids the invalid '\w' escape.
re.match(r'T\w+', text)

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [119]:
# Keep the match object so its span can be used to slice the text.
match = re.match(r'T\w+', text)

In [120]:
text[match.start():match.end()]

'This'

## 6.7 Data Aggregation

### GroupBy

### A Practical Example

In [121]:
frame = pd.DataFrame({ 'color': ['white','red','green','red','green'],
                      'object': ['pen','pencil','pencil','ashtray','pen'],
                      'price1' : [5.56,4.20,1.30,0.56,2.75],
                      'price2' : [4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.56,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [122]:
group = frame['price1'].groupby(frame['color'])
group

<pandas.core.groupby.SeriesGroupBy object at 0x000000A9E3AC4EF0>

In [123]:
group.groups

{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [124]:
group.mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [125]:
group.sum()

color
green    4.05
red      4.76
white    5.56
Name: price1, dtype: float64

### Hierarchical Grouping

In [126]:
ggroup = frame['price1'].groupby([frame['color'],frame['object']])
ggroup.groups

{('green', 'pen'): Int64Index([4], dtype='int64'),
 ('green', 'pencil'): Int64Index([2], dtype='int64'),
 ('red', 'ashtray'): Int64Index([3], dtype='int64'),
 ('red', 'pencil'): Int64Index([1], dtype='int64'),
 ('white', 'pen'): Int64Index([0], dtype='int64')}

In [127]:
ggroup.sum()

color  object 
green  pen        2.75
       pencil     1.30
red    ashtray    0.56
       pencil     4.20
white  pen        5.56
Name: price1, dtype: float64

In [128]:
frame[['price1','price2']].groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


In [129]:
# frame also holds string columns ('color', 'object'); restrict the mean to
# numeric columns, otherwise recent pandas raises TypeError.
frame.groupby(frame['color']).mean(numeric_only=True)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


## 6.8 Group Iteration

In [131]:
for name, group in frame.groupby('color'):
    print (name)
    print (group)

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen    5.56    4.75


### Chain of Transformations

In [132]:
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

pandas.core.series.Series

In [133]:
# Aggregating the whole frame yields a DataFrame; numeric_only skips the
# string columns that cannot be averaged.
result2 = frame.groupby(frame['color']).mean(numeric_only=True)
type(result2)

pandas.core.frame.DataFrame

In [134]:
frame['price1'].groupby(frame['color']).mean()
frame.groupby(frame['color'])['price1'].mean()

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [135]:
# Aggregate first, then pick the column; numeric_only skips the string columns.
(frame.groupby(frame['color']).mean(numeric_only=True))['price1']

color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64

In [136]:
# Per-colour means of the numeric columns, with column names prefixed 'mean_'.
# numeric_only skips the string 'object' column, which cannot be averaged.
means = frame.groupby('color').mean(numeric_only=True).add_prefix('mean_')
means

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,2.025,2.375
red,2.38,2.435
white,5.56,4.75


### Functions on Groups

In [137]:
group = frame.groupby('color')
group['price1'].quantile(0.6)

color
green    2.170
red      2.744
white    5.560
Name: price1, dtype: float64

In [138]:
def range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    Note: this name shadows Python's builtin ``range`` for the rest of the
    notebook; it is kept because later cells pass it to ``agg`` and the
    function name becomes the result's column label.
    """
    lowest, highest = series.min(), series.max()
    return highest - lowest
group['price1'].agg(range)

color
green    1.45
red      3.64
white    0.00
Name: price1, dtype: float64

In [139]:
# Apply the spread only to the numeric price columns: max() - min() is not
# defined for the string 'object' column.
group[['price1', 'price2']].agg(range)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.45,1.55
red,3.64,3.37
white,0.0,0.0


In [140]:
group['price1'].agg(['mean','std',range])

Unnamed: 0_level_0,mean,std,range
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,2.025,1.025305,1.45
red,2.38,2.573869,3.64
white,5.56,,0.0


## 6.9 Advanced Data Aggregation

In [141]:
frame = pd.DataFrame({ 'color':['white','red','green','red','green'],
                      'price1':[5.56,4.20,1.30,0.56,2.75],
                      'price2':[4.75,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,price1,price2
0,white,5.56,4.75
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [147]:
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

Unnamed: 0_level_0,tot_price1,tot_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,4.05,4.75
red,4.76,4.87
white,5.56,4.75


In [None]:
# Attach each colour's totals to its rows; merge must be reached through the
# pd namespace because only `import pandas as pd` is in scope.
pd.merge(frame,sums,left_on='color',right_index=True)

In [149]:
# Broadcast each colour's total back onto its rows. The aggregation is named
# as a string because passing the NumPy callable is deprecated in recent pandas.
frame.groupby('color').transform('sum').add_prefix('tot_')

Unnamed: 0,tot_price1,tot_price2
0,5.56,4.75
1,4.76,4.87
2,4.05,4.75
3,4.76,4.87
4,4.05,4.75


In [None]:
# DataFrame must be reached through the pd namespace (only `import pandas as pd` is in scope).
frame = pd.DataFrame( { 'color':['white','black','white','white','black','black'],
                    'status':['up','up','down','down','down','up'],
                    'value1':[12.33,14.55,22.34,27.84,23.40,18.33],
                    'value2':[11.23,31.80,29.99,31.18,18.25,22.44]})
frame

In [None]:
frame.groupby(['color','status']).apply( lambda x: x.max())

In [None]:
frame.rename(index=reindex, columns=recolumn)

In [None]:
# Ten hourly timestamps starting 2015-01-01; date_range lives in the pd namespace.
temp = pd.date_range('1/1/2015', periods=10, freq= 'H')
temp

In [None]:
# Random values indexed by the timestamps in `temp`; Series lives in the pd namespace.
timeseries = pd.Series(np.random.rand(10), index=temp)
timeseries

In [None]:
# Same timestamps as an ordinary 'date' column next to two random value columns;
# DataFrame lives in the pd namespace.
timetable = pd.DataFrame( {'date': temp, 'value1' : np.random.rand(10),
                        'value2' : np.random.rand(10)})
timetable

In [None]:
timetable['cat'] = ['up','down','left','left','up','up','down','right','right','up']
timetable

## 6.10 Conclusions