# Aggrerations


In [1]:
import numpy as np
import pandas as pd

In [2]:
weather = pd.read_csv('weather_by_station.csv', index_col = 'date',
                      parse_dates = True)
weather.head()


Unnamed: 0_level_0,datatype,station,value,station_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,PRCP,GHCND:US1CTFR0039,0.0,"STAMFORD 4.2 S, CT US"
2018-01-01,PRCP,GHCND:US1NJBG0015,0.0,"NORTH ARLINGTON 0.7 WNW, NJ US"
2018-01-01,SNOW,GHCND:US1NJBG0015,0.0,"NORTH ARLINGTON 0.7 WNW, NJ US"
2018-01-01,PRCP,GHCND:US1NJBG0017,0.0,"GLEN ROCK 0.7 SSE, NJ US"
2018-01-01,SNOW,GHCND:US1NJBG0017,0.0,"GLEN ROCK 0.7 SSE, NJ US"


In this example cut() has three parameters, the column in which we wish to cut (Must be one dimensional array), bins that separates the values into equal parts and lastly, labels, which is used to label the three separated parts.

In [3]:
fb = pd.read_csv('fb_2018.csv', 
                 index_col = 'date', 
                 parse_dates = True
).assign(
    trading_volume = lambda x: pd.cut(x.volume, 
    bins = 3, 
    labels = ['low', 'med', 'high'])
)
fb.head() 



Unnamed: 0_level_0,open,high,low,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,177.68,181.58,177.55,181.42,18151903,low
2018-01-03,181.88,184.78,181.33,184.67,16886563,low
2018-01-04,184.9,186.21,184.0996,184.33,13880896,low
2018-01-05,185.59,186.9,184.93,186.85,13574535,low
2018-01-08,187.2,188.9,186.33,188.28,17994726,low


In [4]:
#Using .2f to provide the float with two digits

pd.set_option('display.float_format', lambda x : '%.2f' % x)

# Summarizing DataFrames

In [5]:
#Using agg to do aggregations in one go using dict

fb_agg= fb.agg(
    {
        'open' : np.mean,
        'high' : np.max,
        'low'  : np.min,
        'close' : np.mean,
        'volume' : np.sum
    }
)
fb_agg

  fb_agg= fb.agg(
  fb_agg= fb.agg(
  fb_agg= fb.agg(
  fb_agg= fb.agg(


open            171.45
high            218.62
low             123.02
close           171.51
volume   6949682394.00
dtype: float64

In [6]:
weather.query(
    'station == "GHCND:USW00094728"'
).pivot(columns= 'datatype', values = 'value')[['SNOW', 'PRCP']].agg('sum')

datatype
SNOW   1007.00
PRCP   1665.30
dtype: float64

In [7]:
fb.agg({
    'open' : 'mean',
    'high': ['min', 'max'],
    'low' : ['min' , 'max'],
    'close' : 'mean'
})

Unnamed: 0,open,high,low,close
mean,171.45,,,171.51
min,,129.74,123.02,
max,,218.62,214.27,


# Using groupby()

In [8]:
#Taking the mean by group

fb.groupby('trading_volume').mean()

  fb.groupby('trading_volume').mean()


Unnamed: 0_level_0,open,high,low,close,volume
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
low,171.36,173.46,169.31,171.43,24547207.71
med,175.82,179.42,172.11,175.14,79072559.12
high,167.73,170.48,161.57,168.16,141924023.33


In [9]:
fb.groupby('trading_volume')['close'].agg(['min', 'max', 'mean'])

  fb.groupby('trading_volume')['close'].agg(['min', 'max', 'mean'])


Unnamed: 0_level_0,min,max,mean
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
low,124.06,214.67,171.43
med,152.22,217.5,175.14
high,160.06,176.26,168.16


In [10]:
# This time using dicts to specify which aggregations to do on a column

fb_agg = fb.groupby('trading_volume').agg({
    'open' : 'mean',
    'high' : ['min', 'max'],
    'low' : ['min', 'max'],
    'close'  : 'mean'
})
fb_agg

  fb_agg = fb.groupby('trading_volume').agg({


Unnamed: 0_level_0,open,high,high,low,low,close
Unnamed: 0_level_1,mean,min,max,min,max,mean
trading_volume,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
low,171.36,129.74,216.2,123.02,212.6,171.43
med,175.82,162.85,218.62,150.75,214.27,175.14
high,167.73,161.1,180.13,149.02,173.75,168.16


In [11]:
fb_agg.columns

MultiIndex([( 'open', 'mean'),
            ( 'high',  'min'),
            ( 'high',  'max'),
            (  'low',  'min'),
            (  'low',  'max'),
            ('close', 'mean')],
           )

In [12]:
#Joining levels using join()
fb_agg.columns = ['_'.join(col_agg) for col_agg in fb_agg.columns]
fb_agg.head()


Unnamed: 0_level_0,open_mean,high_min,high_max,low_min,low_max,close_mean
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
low,171.36,129.74,216.2,123.02,212.6,171.43
med,175.82,162.85,218.62,150.75,214.27,175.14
high,167.73,161.1,180.13,149.02,173.75,168.16


In [13]:
#We can use grouper to group datetimes even if it is an index cols
weather.loc['2018-10'].query('datatype == "PRCP"').groupby(
    pd.Grouper(freq = 'D')).head()


Unnamed: 0_level_0,datatype,station,value,station_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-10-01,PRCP,GHCND:US1CTFR0039,0.00,"STAMFORD 4.2 S, CT US"
2018-10-01,PRCP,GHCND:US1NJBG0003,0.00,"TENAFLY 1.3 W, NJ US"
2018-10-01,PRCP,GHCND:US1NJBG0017,0.00,"GLEN ROCK 0.7 SSE, NJ US"
2018-10-01,PRCP,GHCND:US1NJBG0018,0.00,"PALISADES PARK 0.2 WNW, NJ US"
2018-10-01,PRCP,GHCND:US1NJBG0023,0.00,"OAKLAND 0.9 SSE, NJ US"
...,...,...,...,...
2018-10-31,PRCP,GHCND:US1CTFR0039,0.00,"STAMFORD 4.2 S, CT US"
2018-10-31,PRCP,GHCND:US1NJBG0003,0.00,"TENAFLY 1.3 W, NJ US"
2018-10-31,PRCP,GHCND:US1NJBG0015,0.00,"NORTH ARLINGTON 0.7 WNW, NJ US"
2018-10-31,PRCP,GHCND:US1NJBG0017,0.00,"GLEN ROCK 0.7 SSE, NJ US"


In [14]:
weather.index

DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01', '2018-01-01',
               '2018-01-01', '2018-01-01', '2018-01-01', '2018-01-01',
               '2018-01-01', '2018-01-01',
               ...
               '2018-12-31', '2018-12-31', '2018-12-31', '2018-12-31',
               '2018-12-31', '2018-12-31', '2018-12-31', '2018-12-31',
               '2018-12-31', '2018-12-31'],
              dtype='datetime64[ns]', name='date', length=80256, freq=None)

In [16]:
#Using the filter function

weather.groupby('station').filter(
    lambda x : 'NY' in x.name
).query(
    'datatype == "SNOW"'
).groupby('station_name').sum().squeeze() 

"""Note that squeeze() function here is a pandas func
in which it compresses a single column dataframe into a
series."""

Unnamed: 0_level_0,datatype,station,value
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ALBERTSON 0.2 SSE, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYNS0042GHCND:US1NYNS0042GHCND:US1NYN...,1087.0
"AMITYVILLE 0.1 WSW, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYSF0089GHCND:US1NYSF0089GHCND:US1NYS...,434.0
"AMITYVILLE 0.6 NNE, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYSF0092GHCND:US1NYSF0092GHCND:US1NYS...,1072.0
"ARMONK 0.3 SE, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYWC0018GHCND:US1NYWC0018GHCND:US1NYW...,1504.0
"BROOKLYN 3.1 NW, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYKN0025GHCND:US1NYKN0025GHCND:US1NYK...,305.0
"CENTERPORT 0.9 SW, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYSF0061GHCND:US1NYSF0061GHCND:US1NYS...,799.0
"ELMSFORD 0.8 SSW, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOW,GHCND:US1NYWC0019GHCND:US1NYWC0019GHCND:US1NYW...,863.0
"FLORAL PARK 0.4 W, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYNS0007GHCND:US1NYNS0007GHCND:US1NYN...,1015.0
"HICKSVILLE 1.3 ENE, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYNS0018GHCND:US1NYNS0018GHCND:US1NYN...,716.0
"JACKSON HEIGHTS 0.3 WSW, NY US",SNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSNOWSN...,GHCND:US1NYQN0026GHCND:US1NYQN0026GHCND:US1NYQ...,107.0


In [20]:
#reading the weather csv file again

weather = pd.read_csv('weather_by_station.csv', index_col= 'date',
                      parse_dates=True)
weather

Unnamed: 0_level_0,datatype,station,value,station_name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01,PRCP,GHCND:US1CTFR0039,0.00,"STAMFORD 4.2 S, CT US"
2018-01-01,PRCP,GHCND:US1NJBG0015,0.00,"NORTH ARLINGTON 0.7 WNW, NJ US"
2018-01-01,SNOW,GHCND:US1NJBG0015,0.00,"NORTH ARLINGTON 0.7 WNW, NJ US"
2018-01-01,PRCP,GHCND:US1NJBG0017,0.00,"GLEN ROCK 0.7 SSE, NJ US"
2018-01-01,SNOW,GHCND:US1NJBG0017,0.00,"GLEN ROCK 0.7 SSE, NJ US"
...,...,...,...,...
2018-12-31,WDF5,GHCND:USW00094789,130.00,"JFK INTERNATIONAL AIRPORT, NY US"
2018-12-31,WSF2,GHCND:USW00094789,9.80,"JFK INTERNATIONAL AIRPORT, NY US"
2018-12-31,WSF5,GHCND:USW00094789,12.50,"JFK INTERNATIONAL AIRPORT, NY US"
2018-12-31,WT01,GHCND:USW00094789,1.00,"JFK INTERNATIONAL AIRPORT, NY US"


In [21]:
#Checking which month has the most precipitation.

weather.query('datatype == "PRCP"').groupby(
    pd.Grouper(freq='D')
).mean().groupby(pd.Grouper(freq = 'M')).sum().value.nlargest()

TypeError: agg function failed [how->mean,dtype->object]

In [22]:
#Using the transform()

fb[['open', 'high', 'low', 'close']].transform(
    lambda x : (x - x.mean()).div(x.std())).head()


Unnamed: 0_level_0,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-02,0.32,0.41,0.41,0.5
2018-01-03,0.53,0.57,0.6,0.66
2018-01-04,0.68,0.65,0.74,0.64
2018-01-05,0.72,0.68,0.78,0.77
2018-01-08,0.8,0.79,0.85,0.84


# Pivot Tables

In [23]:
fb.pivot_table(columns='trading_volume')


  fb.pivot_table(columns='trading_volume')


trading_volume,low,med,high
close,171.43,175.14,168.16
high,173.46,179.42,170.48
low,169.31,172.11,161.57
open,171.36,175.82,167.73
volume,24547207.71,79072559.12,141924023.33


In [24]:
fb.pivot_table(index = 'trading_volume')

  fb.pivot_table(index = 'trading_volume')


Unnamed: 0_level_0,close,high,low,open,volume
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
low,171.43,173.46,169.31,171.36,24547207.71
med,175.14,179.42,172.11,175.82,79072559.12
high,168.16,170.48,161.57,167.73,141924023.33


In [25]:
weather.reset_index().pivot_table(
    index = ['date', 'station', 'station_name'],
    columns = 'datatype',
    values = 'value',
    aggfunc= 'median'
).reset_index().tail()

datatype,date,station,station_name,AWND,DAPR,MDPR,PGTM,PRCP,SNOW,SNWD,...,WSF5,WT01,WT02,WT03,WT04,WT05,WT06,WT08,WT09,WT11
28740,2018-12-31,GHCND:USW00054787,"FARMINGDALE REPUBLIC AIRPORT, NY US",5.0,,,2052.0,28.7,,,...,15.7,,,,,,,,,
28741,2018-12-31,GHCND:USW00094728,"NY CITY CENTRAL PARK, NY US",,,,,25.9,0.0,0.0,...,,1.0,,,,,,,,
28742,2018-12-31,GHCND:USW00094741,"TETERBORO AIRPORT, NJ US",1.7,,,1954.0,29.2,,,...,8.9,,,,,,,,,
28743,2018-12-31,GHCND:USW00094745,"WESTCHESTER CO AIRPORT, NY US",2.7,,,2212.0,24.4,,,...,11.2,,,,,,,,,
28744,2018-12-31,GHCND:USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",4.1,,,,31.2,0.0,0.0,...,12.5,1.0,1.0,,,,,,,


In [26]:
#Crosstabs to create a frequency table 
pd.crosstab(
    index = fb.trading_volume,
    columns= fb.index.month,
    colnames= ['month']
)

month,1,2,3,4,5,6,7,8,9,10,11,12
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
low,20,19,15,20,22,21,18,23,19,23,21,19
med,1,0,4,1,0,0,2,0,0,0,0,0
high,0,0,2,0,0,0,1,0,0,0,0,0


In [27]:
#Using normalize parameter to see the percentage of total

pd.crosstab(
    index = fb.trading_volume,
    columns= fb.index.month,
    colnames= ['month'],
    normalize= 'columns'
)

month,1,2,3,4,5,6,7,8,9,10,11,12
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
low,0.95,1.0,0.71,0.95,1.0,1.0,0.86,1.0,1.0,1.0,1.0,1.0
med,0.05,0.0,0.19,0.05,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
high,0.0,0.0,0.1,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0


In [29]:
pd.crosstab(
    index = fb.trading_volume,
    columns= fb.index.month,
    colnames=['month'],
    values = fb.close,
    aggfunc=np.mean
)


  pd.crosstab(


month,1,2,3,4,5,6,7,8,9,10,11,12
trading_volume,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
low,185.24,180.27,177.07,163.29,182.93,195.27,201.92,177.49,164.38,154.19,141.64,137.16
med,179.37,,164.76,174.16,,,194.28,,,,,
high,,,164.11,,,,176.26,,,,,


Basically, we performed an aggregation using a crosstab
by passing the column to the value params and assigning the 
numpy.mean to aggfunc params

In [33]:
#Using margins to count the subtotals of columns and rows

snow_data = weather.query('datatype == "SNOW"')

pd.crosstab(
    index = snow_data.station_name,
    columns = snow_data.index.month,
    colnames = ['month'],
    values= snow_data.value,
    aggfunc=lambda x : (x > 0).sum(),
    margins= True,
    margins_name= 'Total Observations of Snow'

)

month,1,2,3,4,5,6,7,8,9,10,11,12,Total Observations of Snow
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"ALBERTSON 0.2 SSE, NY US",3.00,1.00,3.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,9
"AMITYVILLE 0.1 WSW, NY US",1.00,0.00,1.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,3
"AMITYVILLE 0.6 NNE, NY US",3.00,1.00,3.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,8
"ARMONK 0.3 SE, NY US",6.00,4.00,6.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,3.00,23
"BLOOMINGDALE 0.7 SSE, NJ US",2.00,1.00,3.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"WESTFIELD 0.6 NE, NJ US",3.00,0.00,4.00,1.00,0.00,,0.00,0.00,0.00,,1.00,,9
"WOODBRIDGE TWP 1.1 ESE, NJ US",4.00,1.00,3.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,11
"WOODBRIDGE TWP 1.1 NNE, NJ US",2.00,1.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,7
"WOODBRIDGE TWP 3.0 NNW, NJ US",,0.00,0.00,,,0.00,,,,0.00,0.00,,0
