In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import random, string
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
sns.set(context='paper', style='whitegrid', color_codes=True, font_scale=1.8)
colorcycle = [(0.498, 0.788, 0.498),
              (0.745, 0.682, 0.831),
              (0.992, 0.753, 0.525),
              (0.220, 0.424, 0.690),
              (0.749, 0.357, 0.090),
              (1.000, 1.000, 0.600),
              (0.941, 0.008, 0.498),
              (0.400, 0.400, 0.400)]
sns.set_palette(colorcycle)
mpl.rcParams['figure.max_open_warning'] = 65
mpl.rcParams['figure.figsize'] = [12, 7]

from speclib import misc, plotting, loaders

%matplotlib inline 

# Resample

http://pandas.pydata.org/pandas-docs/stable/timeseries.html#resampling

In [2]:
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng[:8]

DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
               '2012-01-01 00:00:02', '2012-01-01 00:00:03',
               '2012-01-01 00:00:04', '2012-01-01 00:00:05',
               '2012-01-01 00:00:06', '2012-01-01 00:00:07'],
              dtype='datetime64[ns]', freq='S')

In [3]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) 

In [4]:
ts.resample('5Min').sum() 

2012-01-01    25243
Freq: 5T, dtype: int64

In [5]:
ts.resample('5Min').mean() 

2012-01-01    252.43
Freq: 5T, dtype: float64

In [6]:
ts.resample('20S').mean()

2012-01-01 00:00:00    237.80
2012-01-01 00:00:20    314.20
2012-01-01 00:00:40    226.00
2012-01-01 00:01:00    237.60
2012-01-01 00:01:20    246.55
Freq: 20S, dtype: float64

In [7]:
rng2 = np.random.permutation(pd.date_range('1/1/2012', periods=1000, freq='S'))[:350]
ts2 = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [8]:
ts2.resample('2Min').sum() 

2012-01-01    25383
Freq: 2T, dtype: int64

In [9]:
ts2.resample('2Min').count() 

2012-01-01    100
Freq: 2T, dtype: int64

In [10]:
ts.resample("5Min", closed='right').mean() 

2011-12-31 23:55:00     74.000000
2012-01-01 00:00:00    254.232323
Freq: 5T, dtype: float64

In [11]:
ts.resample("5Min", closed='left').mean()

2012-01-01    252.43
Freq: 5T, dtype: float64

In [12]:
ser_1 = pd.Series([random.choice(string.ascii_uppercase) for _ in range(len(rng2))]) 
ser_2 = pd.Series(np.random.randint(0, 10, len(rng2)))
ser_3 = pd.Series([random.choice(string.ascii_uppercase) for _ in range(len(rng2))]) 
df2 = pd.DataFrame(np.array([ser_1, ser_2, ser_3]).T, index=rng2, columns=['user', 'value', 'adjacent'])

In [13]:
df2.head(12)

Unnamed: 0,user,value,adjacent
2012-01-01 00:05:47,F,9,C
2012-01-01 00:09:20,G,1,Y
2012-01-01 00:11:26,X,2,Z
2012-01-01 00:10:13,G,9,Z
2012-01-01 00:01:40,Q,5,I
2012-01-01 00:12:46,Y,4,S
2012-01-01 00:04:03,T,3,R
2012-01-01 00:08:50,A,3,D
2012-01-01 00:10:47,D,5,X
2012-01-01 00:09:01,S,6,M


In [14]:
%timeit df2[['user', 'adjacent']].groupby([pd.Grouper(freq="6Min"), 'user']).count() 
%timeit pd.DataFrame(df2.groupby([pd.Grouper(freq="6Min"), 'user']).adjacent.count())

6.1 ms ± 796 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.03 ms ± 390 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
df2.groupby(['user', pd.Grouper(freq="6Min")]).adjacent.agg(["count", "sum"]) 

Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2012-01-01 00:00:00,5,ASKBR
A,2012-01-01 00:06:00,8,DVHJOJOH
A,2012-01-01 00:12:00,3,THZ
B,2012-01-01 00:00:00,5,APOIZ
B,2012-01-01 00:06:00,7,BELSICL
B,2012-01-01 00:12:00,3,XZV
C,2012-01-01 00:00:00,4,AKAS
C,2012-01-01 00:06:00,4,DLAT
C,2012-01-01 00:12:00,2,IK
D,2012-01-01 00:00:00,5,DJDBX


In [16]:
df2.groupby(['user', df2.index.weekday_name]).adjacent.agg(["count", "sum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,Sunday,16,DVHAJSOJTKOHBHRZ
B,Sunday,15,BAPEOXZILSIVCLZ
C,Sunday,10,ADKLAATSIK
D,Sunday,16,XRYBDFJDIBLXAREI
E,Sunday,17,THCAFQQARSGDGSIBS
F,Sunday,15,CHTFSPNLFIGIZFZ
G,Sunday,12,YZQINUIWKDOI
H,Sunday,11,QKWVZGMFCLA
I,Sunday,7,VAJPOOC
J,Sunday,10,HRGLWVOLOG


In [17]:
tmp = pd.DataFrame(pd.date_range('1/1/2012', '1/7/2012', freq='D'))
tmp['weekname'] = tmp[0].dt.weekday_name
tmp['weekcode'] = tmp[0].dt.weekday
tmp.sort_values('weekcode')

Unnamed: 0,0,weekname,weekcode
1,2012-01-02,Monday,0
2,2012-01-03,Tuesday,1
3,2012-01-04,Wednesday,2
4,2012-01-05,Thursday,3
5,2012-01-06,Friday,4
6,2012-01-07,Saturday,5
0,2012-01-01,Sunday,6


In [18]:
rng3 = np.random.permutation(pd.date_range('1/1/2012', periods=100_000, freq='Min'))[:15000]
ts3 = pd.Series(np.random.randint(0, 500, len(rng3)), index=rng3)
ser_1 = pd.Series([random.choice(string.ascii_letters) for _ in range(len(rng3))]) 
ser_2 = pd.Series(np.random.randint(0, 10, len(rng3)))
ser_3 = pd.Series([random.choice(string.ascii_letters) for _ in range(len(rng3))]) 
df3 = pd.DataFrame(np.array([ser_1, ser_2, ser_3]).T, index=rng3, columns=['user', 'value', 'adjacent'])
df3.head(8)

Unnamed: 0,user,value,adjacent
2012-02-06 01:15:00,G,5,Z
2012-02-16 19:58:00,D,9,O
2012-02-08 15:39:00,p,6,c
2012-01-09 05:00:00,o,5,X
2012-01-20 01:33:00,M,2,q
2012-03-01 20:24:00,k,8,s
2012-02-23 20:37:00,b,3,k
2012-01-28 14:07:00,p,0,U


In [19]:
df3['before_workday'] = pd.Series(df3.index.weekday, index=df3.index).isin({0, 1, 2, 3, 6})

df3.sample(7) 

Unnamed: 0,user,value,adjacent,before_workday
2012-02-17 21:57:00,U,2,Y,False
2012-03-03 05:58:00,J,5,o,False
2012-02-05 14:42:00,Z,5,x,True
2012-01-07 16:52:00,j,9,l,False
2012-01-16 11:51:00,U,5,n,True
2012-02-06 05:30:00,h,6,G,True
2012-01-26 06:57:00,C,0,T,True


In [20]:
df3['free_time'] = (16 < df3.index.hour) | (df3.index.hour < 7)

df3.head(12)

Unnamed: 0,user,value,adjacent,before_workday,free_time
2012-02-06 01:15:00,G,5,Z,True,True
2012-02-16 19:58:00,D,9,O,True,True
2012-02-08 15:39:00,p,6,c,True,False
2012-01-09 05:00:00,o,5,X,True,True
2012-01-20 01:33:00,M,2,q,False,True
2012-03-01 20:24:00,k,8,s,True,True
2012-02-23 20:37:00,b,3,k,True,True
2012-01-28 14:07:00,p,0,U,False,False
2012-01-30 17:21:00,x,0,r,True,True
2012-02-26 20:17:00,T,0,t,True,True


In [21]:
df4 = df3[df3.before_workday & df3.free_time][['user', 'value', 'adjacent']]
df4.head(12)

Unnamed: 0,user,value,adjacent
2012-02-06 01:15:00,G,5,Z
2012-02-16 19:58:00,D,9,O
2012-01-09 05:00:00,o,5,X
2012-03-01 20:24:00,k,8,s
2012-02-23 20:37:00,b,3,k
2012-01-30 17:21:00,x,0,r
2012-02-26 20:17:00,T,0,t
2012-01-02 20:44:00,D,5,u
2012-02-05 22:57:00,B,3,q
2012-01-30 22:31:00,z,8,C


In [22]:
df4.groupby(['user', df4.index.weekday]).adjacent.agg(["count", "sum"]) 

Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,31,sQkPcwvWmTrPIIQjIRZHFoYCfLfoGcn
A,1,23,bgnQPNkHJmajYVZauUUpUjQ
A,2,17,UtOnlaWIvlgeceuIH
A,3,25,STofYwDbnObGQHJFasXHsfERq
A,6,33,YurDFsOFGQEmOzDQyzNErIZtiAzPYzTvh
B,0,31,MEYwBouwvacUcjBjGokvkkkjxqLlHBE
B,1,23,FGItCVqhVuHPPudbMMciSUy
B,2,33,obcqlnuSqTiGQBagpQsKNjJASsbtoDFkQ
B,3,19,qoQXqqiUBOvdgDAVUxa
B,6,27,qGGrFKcDwNAbWLVQsKSAEomPBmq


In [23]:
tmp = pd.Series([1, 2, 3, np.NaN])
tmp

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [24]:
tmp.count() 

3

In [25]:
tmp.size

4

In [26]:
minutes = df2.index.minute 

In [27]:
%timeit (df2.index.minute > 3) & (df2.index.minute < 8)
%timeit (minutes > 3) & (minutes < 8)

322 µs ± 8.55 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
87.3 µs ± 251 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
df5 = df3.reset_index().set_index('user').rename(columns={'index': 'time'})

In [29]:
df5

Unnamed: 0_level_0,time,value,adjacent,before_workday,free_time
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
G,2012-02-06 01:15:00,5,Z,True,True
D,2012-02-16 19:58:00,9,O,True,True
p,2012-02-08 15:39:00,6,c,True,False
o,2012-01-09 05:00:00,5,X,True,True
M,2012-01-20 01:33:00,2,q,False,True
k,2012-03-01 20:24:00,8,s,True,True
b,2012-02-23 20:37:00,3,k,True,True
p,2012-01-28 14:07:00,0,U,False,False
x,2012-01-30 17:21:00,0,r,True,True
T,2012-02-26 20:17:00,0,t,True,True
