# Example for pandas `groupby`
## Copyright (C) Princeton Consultants, 2017-2018
### First import pandas library

In [1]:
import pandas as pd

### Read in some data

In [2]:
# Load the CSV, using its first two columns as a two-level row index.
peoplehoursperday = pd.read_csv(
    "peoplehoursperday.csv",
    index_col=[0, 1],
)
# Preview the first few rows.
peoplehoursperday.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hours
id,day,Unnamed: 2_level_1
ADK0000000000ICFPT,Mo,0
BEL0000000001JDGQU,Mo,8
CFM0000000002KEHRV,Mo,0
DGN0000000003LFISW,Mo,0
EHO0000000004MGJTX,Mo,0


### Group by the 'id'

In [3]:
# Total the hours for each 'id' and preview the first few totals.
(
    peoplehoursperday
    .groupby('id')
    .sum()
    .head()
)

Unnamed: 0_level_0,hours
id,Unnamed: 1_level_1
ADK0000000000ICFPT,6
ADK0000000026ICFPT,8
ADK0000000052ICFPT,16
ADK0000000078ICFPT,16
ADK0000000104ICFPT,25


### Group by the 'day'

In [4]:
# Total the hours recorded for each 'day'.
(
    peoplehoursperday
    .groupby('day')
    .sum()
)

Unnamed: 0_level_0,hours
day,Unnamed: 1_level_1
Fr,1110
Mo,1518
Sa,133
Su,88
Th,1696
Tu,1818
We,1757


### Group by the day, and get the sum and the count (only for entries with more than zero hours)

In [5]:
# Keep only the rows with hours > 0, then compute the total and the number
# of such rows for each day.
(
    peoplehoursperday
    .loc[lambda frame: frame.hours > 0]
    .groupby('day')
    .agg(['sum', 'count'])
)

Unnamed: 0_level_0,hours,hours
Unnamed: 0_level_1,sum,count
day,Unnamed: 1_level_2,Unnamed: 2_level_2
Fr,1110,203
Mo,1518,264
Sa,133,34
Su,88,25
Th,1696,291
Tu,1818,311
We,1757,299


### Note how the column names are a `MultiIndex`. Also, we would like the days to be ordered.
### Create a DataFrame that indicates the order of each of the days

In [6]:
# Lookup table pairing each two-letter day abbreviation with its position
# in the week (Mo=1 ... Su=7).
dayorder = pd.DataFrame(
    {
        'day': ['Mo', 'Tu', 'We', 'Th', 'Fr', 'Sa', 'Su'],
        'order': [1, 2, 3, 4, 5, 6, 7],
    }
)
dayorder

Unnamed: 0,day,order
0,Mo,1
1,Tu,2
2,We,3
3,Th,4
4,Fr,5
5,Sa,6
6,Su,7


### Merge the groupby and the day order. 

In [7]:
# Attach each day's position in the week to the per-day aggregates.
# The aggregates carry two-level column names while dayorder has single-level
# ones; that mismatch is what the text following this cell discusses.
pd.merge(
    peoplehoursperday
    .loc[lambda frame: frame.hours > 0]
    .groupby('day')
    .agg(['sum', 'count']),
    dayorder,
    left_index=True,
    right_on='day',
)



Unnamed: 0,"(hours, sum)","(hours, count)",day,order
4,1110,203,Fr,5
0,1518,264,Mo,1
5,133,34,Sa,6
6,88,25,Su,7
3,1696,291,Th,4
1,1818,311,Tu,2
2,1757,299,We,3


### The merge above produces a warning because the `dayorder` DataFrame has columns with single names, whereas the groupby operator with `agg` created columns that have hierarchical names backed by a `MultiIndex`. Let's see if we can eliminate the warning.
### This is done in 3 steps. 
1. First we create the original result that has hierarchical columns
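2. Then we rename the columns to get rid of the `MultiIndex`, using a Python list comprehension that joins each name tuple with an underscore (e.g. `('hours', 'sum')` becomes `hours_sum`)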
3. Then we merge with the dayorder table, sort by the order, and drop the order column
### Then we add an additional column via a calculation

In [8]:
# Step 1: aggregate the rows with hours > 0 per day; the result has
# two-level (hierarchical) column names.
tmp = (
    peoplehoursperday[peoplehoursperday.hours > 0]
    .groupby(by='day')
    .agg(['sum', 'count'])
)
# Step 2: flatten the two-level names by joining each tuple with an
# underscore, e.g. ('hours', 'sum') -> 'hours_sum'.
tmp.columns = ['_'.join(c) for c in tmp.columns.tolist()]
# Step 3: merge with dayorder, sort into week order, drop the helper column
# and restore 'day' as the index.
(
    tmp
    .merge(dayorder, left_index=True, right_on='day')
    .sort_values(by='order')
    .drop(columns='order')
    .set_index('day')
    # Use a callable so the ratio is computed from the frame being built in
    # this chain, rather than relying on index alignment with the outer `tmp`.
    .assign(hoursperday=lambda df: df.hours_sum / df.hours_count)
)

Unnamed: 0_level_0,hours_sum,hours_count,hoursperday
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mo,1518,264,5.75
Tu,1818,311,5.845659
We,1757,299,5.876254
Th,1696,291,5.828179
Fr,1110,203,5.46798
Sa,133,34,3.911765
Su,88,25,3.52


### Alternatively, we can use the `calendar` library built into Python, and the `map()` method on a Series

In [9]:
import calendar

In [10]:
# Show the day-of-week abbreviations provided by the calendar module as a plain list.
[abbr for abbr in calendar.day_abbr]

['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

In [11]:
# Same per-day summary, with the week order derived from calendar.day_abbr
# instead of a separate lookup DataFrame. Each abbreviation is cut to its
# first two letters so that it matches the values in the 'day' level.
# NOTE(review): calendar.day_abbr follows the current locale; this presumably
# assumes English day names -- confirm for the target environment.
(
    peoplehoursperday
    .loc[lambda frame: frame.hours > 0]
    .groupby('day')
    .agg(['sum', 'count'])
    .reset_index()
    .assign(
        order=lambda frame: frame['day'].map(
            {abbr[:2]: position for position, abbr in enumerate(calendar.day_abbr)}
        )
    )
    .sort_values('order')
    .set_index('day')
    .drop(columns='order')
    # Columns are still two-level here, so they are addressed by tuple.
    .assign(
        hoursperday=lambda frame: (
            frame.loc[:, ('hours', 'sum')] / frame.loc[:, ('hours', 'count')]
        )
    )
)


Unnamed: 0_level_0,hours,hours,hoursperday
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Mo,1518,264,5.75
Tu,1818,311,5.845659
We,1757,299,5.876254
Th,1696,291,5.828179
Fr,1110,203,5.46798
Sa,133,34,3.911765
Su,88,25,3.52
