In [1]:
import pandas as pd

## Data 1

In [12]:
# Text columns that are loaded as pandas categoricals.
dtypes = dict.fromkeys(
    ["first_name", "gender", "type", "state", "party"],
    "category",
)

# Read only the categorical columns plus "birthday" (parsed as a date)
# and "last_name" from the legislators file.
df = pd.read_csv(
    "gropby data/legislators-historical.csv",
    usecols=[*dtypes, "birthday", "last_name"],
    dtype=dtypes,
    parse_dates=["birthday"],
)


df.tail()


Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
11970,Garrett,Thomas,1972-03-27,M,rep,VA,Republican
11971,Handel,Karen,1962-04-18,F,rep,GA,Republican
11972,Jones,Brenda,1959-10-24,F,rep,MI,Democrat
11973,Marino,Tom,1952-08-15,M,rep,PA,Republican
11974,Jones,Walter,1943-02-10,M,rep,NC,Republican


In [17]:
# Count of non-null last names per state; only the leading rows are kept via .head().
n_by_state = (
    df.groupby("state")["last_name"]
    .count()
    .head()
)
n_by_state

state
AK     16
AL    206
AR    117
AS      2
AZ     48
Name: last_name, dtype: int64

In [19]:
# Count non-null last names for every (state, gender) pair and keep the leading rows.
# Both grouping keys are categorical columns. ``observed=False`` is passed
# explicitly so that combinations with no rows still appear with a count of 0,
# regardless of which default the installed pandas version uses.
n_by_state_gender = (
    df.groupby(["state", "gender"], observed=False)["last_name"]
    .count()
    .head()
)
n_by_state_gender

state  gender
AK     F           0
       M          16
AL     F           3
       M         203
AR     F           5
Name: last_name, dtype: int64

In [21]:
 # sort=False: group keys are not sorted in the result (compare with the sort=True cell).
 df.groupby("state", sort=False)["last_name"].count().head()

state
DE      97
VA     432
SC     251
MD     305
PA    1053
Name: last_name, dtype: int64

In [22]:
 # sort=True: group keys are sorted in the result (same ordering as calling groupby("state") without sort).
 df.groupby("state", sort=True)["last_name"].count().head()

state
AK     16
AL    206
AR    117
AS      2
AZ     48
Name: last_name, dtype: int64

In [25]:
# Printing a GroupBy object only shows the object's repr, not its data,
# so iterate over its (group key, sub-frame) pairs to inspect the groups.
by_state = df.groupby("state")

for state, frame in by_state:
    print(
        f'first two entites for {state}',
        '-'*10,
        frame.head(2),
        '-'*10,
        sep='\n',
    )

first two entites for AK
----------
     last_name first_name   birthday gender type state        party
6619    Waskey      Frank 1875-04-20      M  rep    AK     Democrat
6647      Cale     Thomas 1848-09-17      M  rep    AK  Independent
----------
first two entites for AL
----------
    last_name first_name   birthday gender type state       party
912   Crowell       John 1780-09-18      M  rep    AL  Republican
991    Walker       John 1783-08-12      M  sen    AL  Republican
----------
first two entites for AR
----------
     last_name first_name   birthday gender type state party
1001     Bates      James 1788-08-25      M  rep    AR   NaN
1279    Conway      Henry 1793-03-18      M  rep    AR   NaN
----------
first two entites for AS
----------
          last_name first_name   birthday gender type state     party
10797         Sunia       Fofó 1937-03-13      M  rep    AS  Democrat
11755  Faleomavaega        Eni 1943-08-15      M  rep    AS  Democrat
----------
first two entites

In [26]:
# .groups maps each group key to the index labels of the rows belonging to that group.
by_state.groups['AK']

Int64Index([ 6619,  6647,  7442,  7501,  8039,  8236,  8877,  9819,  9951,
             9985, 10082, 10108, 10325, 11262, 11386, 11734],
           dtype='int64')

In [28]:
# This is equivalent to df.loc[df["state"] == "AK"]: the sub-frame for one group key.
by_state.get_group('AK').head()

Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
6619,Waskey,Frank,1875-04-20,M,rep,AK,Democrat
6647,Cale,Thomas,1848-09-17,M,rep,AK,Independent
7442,Grigsby,George,1874-12-02,M,rep,AK,
7501,Sulzer,Charles,1879-02-24,M,rep,AK,
8039,Sutherland,Daniel,1869-04-17,M,rep,AK,Republican


In [29]:
# Take the first (group key, sub-frame) pair produced by iterating over the groups.
state, frame = next(iter(by_state))
print(state, frame, sep="\n")

AK
        last_name first_name   birthday gender type state        party
6619       Waskey      Frank 1875-04-20      M  rep    AK     Democrat
6647         Cale     Thomas 1848-09-17      M  rep    AK  Independent
7442      Grigsby     George 1874-12-02      M  rep    AK          NaN
7501       Sulzer    Charles 1879-02-24      M  rep    AK          NaN
8039   Sutherland     Daniel 1869-04-17      M  rep    AK   Republican
8236   Wickersham      James 1857-08-24      M  rep    AK   Republican
8877       Dimond    Anthony 1881-11-30      M  rep    AK     Democrat
9819     Gruening     Ernest 1887-02-06      M  sen    AK     Democrat
9951       Rivers      Ralph 1903-05-23      M  rep    AK     Democrat
9985     Bartlett     Edward 1904-04-20      M  sen    AK     Democrat
10082     Pollock     Howard 1920-04-11      M  rep    AK   Republican
10108      Begich   Nicholas 1932-04-06      M  rep    AK     Democrat
10325      Gravel    Maurice 1930-05-13      M  sen    AK     Democrat
112

## Data 2

In [32]:
# Load the air-quality readings; the value -200 is treated as missing (na_values).
# "Date" and "Time" are read as plain strings and combined explicitly with
# pd.to_datetime, because combining columns through a nested parse_dates list
# (parse_dates=[["Date", "Time"]]) is deprecated in recent pandas releases.
# NOTE(review): the date format is inferred by pd.to_datetime, as it was by the
# original read_csv call — confirm against the file if day/month order matters.
df2 = (
    pd.read_csv(
        "gropby data/airqual.csv",
        na_values=[-200],
        usecols=["Date", "Time", "CO(GT)", "T", "RH", "AH"],
        dtype={"Date": str, "Time": str},
    )
    # Build the timestamp from "<Date> <Time>", then drop the two source columns.
    .assign(tstamp=lambda raw: pd.to_datetime(raw["Date"] + " " + raw["Time"]))
    .drop(columns=["Date", "Time"])
    .rename(
        columns={
            "CO(GT)": "co",
            "T": "temp_c",
            "RH": "rel_hum",
            "AH": "abs_hum",
        }
    )
    .set_index("tstamp")
)
df2.head()

Unnamed: 0_level_0,co,temp_c,rel_hum,abs_hum
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-03-10 18:00:00,2.6,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,11.2,59.6,0.7888


In [34]:
# Inspect the dtype of each df2 column.
df2.dtypes

co         float64
temp_c     float64
rel_hum    float64
abs_hum    float64
dtype: object

In [35]:
# Check which index class df2 ended up with after set_index("tstamp").
type(df2.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [36]:
# Weekday name for every timestamp in df2's index (one label per row); show the first five.
day_names = df2.index.day_name()
day_names[:5]

Index(['Wednesday', 'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday'], dtype='object', name='tstamp')

In [39]:
# Group the "co" column by weekday name; day_names is a derived Index of labels, not a column of df2.
by_day = df2.groupby(day_names)['co']

In [40]:
# Print each weekday followed by its "co" values, framed by separator lines.
# `frame` is a Series here because by_day selects the single column "co".
for day, frame in by_day:
    print('-'*50, day, frame, "-"*50, sep='\n')

--------------------------------------------------
Friday
tstamp
2004-03-12 00:00:00    1.7
2004-03-12 01:00:00    1.9
2004-03-12 02:00:00    1.4
2004-03-12 03:00:00    0.8
2004-03-12 04:00:00    NaN
                      ... 
2005-04-01 19:00:00    2.2
2005-04-01 20:00:00    1.4
2005-04-01 21:00:00    0.9
2005-04-01 22:00:00    0.8
2005-04-01 23:00:00    1.1
Name: co, Length: 1344, dtype: float64
--------------------------------------------------
--------------------------------------------------
Monday
tstamp
2004-03-15 00:00:00    1.8
2004-03-15 01:00:00    1.8
2004-03-15 02:00:00    1.8
2004-03-15 03:00:00    1.1
2004-03-15 04:00:00    NaN
                      ... 
2005-04-04 10:00:00    3.1
2005-04-04 11:00:00    2.4
2005-04-04 12:00:00    2.4
2005-04-04 13:00:00    2.1
2005-04-04 14:00:00    2.2
Name: co, Length: 1335, dtype: float64
--------------------------------------------------
--------------------------------------------------
Saturday
tstamp
2004-03-13 00:00:00    2.7
20

In [42]:
# Mean "co" reading for each weekday name.
(
    df2["co"]
    .groupby(day_names)
    .mean()
)

tstamp
Friday       2.543041
Monday       2.016741
Saturday     1.861077
Sunday       1.438069
Thursday     2.455505
Tuesday      2.382267
Wednesday    2.400787
Name: co, dtype: float64

In [47]:
# Mean "co" reading for every (weekday name, hour of day) pair,
# with the two index levels labelled "dow" and "hr".
day_names = df2.index.day_name()
hour = df2.index.hour
by_hour = (
    df2.groupby([day_names, hour])["co"]
    .mean()
    .rename_axis(["dow", "hr"])
)
by_hour

dow        hr
Friday     0     1.936170
           1     1.608511
           2     1.172340
           3     0.887234
           4     0.823333
                   ...   
Wednesday  19    4.146809
           20    3.844681
           21    2.897872
           22    2.102128
           23    1.938298
Name: co, Length: 168, dtype: float64

In [48]:
# Group by each observation's year and quarter, then report the min and max "co" per group.
(
    df2.groupby([df2.index.year, df2.index.quarter])["co"]
    .agg(["min", "max"])
    .rename_axis(["year", "quarter"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
year,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,1,0.3,8.1
2004,2,0.1,7.3
2004,3,0.1,7.5
2004,4,0.1,11.9
2005,1,0.1,8.7
2005,2,0.3,5.0


## Data 3

In [50]:
def parse_millisecond_timestamp(ts):
    """Interpret ``ts`` as milliseconds elapsed since the Unix epoch.

    ``ts`` is handed to ``pd.to_datetime`` with ``unit="ms"`` and the
    converted value is returned as-is; no time zone is attached here.
    """
    as_datetime = pd.to_datetime(ts, unit="ms")
    return as_datetime

# Load the tab-separated news file, which has no header row (column names are
# supplied here) and uses the first column as the index.
# "tstamp" is read as raw numbers and converted afterwards with
# parse_millisecond_timestamp, because read_csv's ``date_parser`` argument is
# deprecated in pandas 2.0 and later.
df3 = pd.read_csv(
    "gropby data/news.csv",
    sep="\t",
    header=None,
    index_col=0,
    names=["title", "url", "outlet", "category", "cluster", "host", "tstamp"],
    dtype={
        "outlet": "category",
        "category": "category",
        "cluster": "category",
        "host": "category",
    },
).assign(
    # Overwrites the existing column in place, so the column order is unchanged.
    tstamp=lambda raw: parse_millisecond_timestamp(raw["tstamp"])
)

In [51]:
# Preview the first rows of the news frame.
df3.head()

Unnamed: 0,title,url,outlet,category,cluster,host,tstamp
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,2014-03-10 16:52:50.698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,2014-03-10 16:52:51.207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,2014-03-10 16:52:51.550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,2014-03-10 16:52:51.793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,2014-03-10 16:52:52.027


In [53]:
# For each outlet, count the titles containing the literal substring "Fed"
# and keep the ten outlets with the highest counts.
# The match is computed once for the whole column (vectorized) and then summed
# per outlet, instead of calling a Python lambda for every group.
# regex=False treats "Fed" as plain text; na=False counts missing titles as
# non-matches; astype("int64") keeps an integer result on every pandas version.
# Note: this is a substring match, so words such as "Federal" also count.
f = (
    df3["title"]
    .str.contains("Fed", regex=False, na=False)
    .groupby(df3["outlet"], sort=False)
    .sum()
    .astype("int64")
    .nlargest(10)
)
f

outlet
Reuters                         161
NASDAQ                          103
Businessweek                     93
Investing.com                    66
Wall Street Journal \(blog\)     61
MarketWatch                      56
Moneynews                        55
Bloomberg                        53
GlobalPost                       51
Economic Times                   44
Name: title, dtype: int64