In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
sns.set(context='paper', style='whitegrid', color_codes=True, font_scale=1.8)
colorcycle = [(0.498, 0.788, 0.498),
              (0.745, 0.682, 0.831),
              (0.992, 0.753, 0.525),
              (0.220, 0.424, 0.690),
              (0.749, 0.357, 0.090),
              (1.000, 1.000, 0.600),
              (0.941, 0.008, 0.498),
              (0.400, 0.400, 0.400)]
sns.set_palette(colorcycle)
mpl.rcParams['figure.max_open_warning'] = 65
mpl.rcParams['figure.figsize'] = [12, 7]

from speclib import misc, plotting, loaders

%matplotlib inline 

In [5]:
df = pd.read_msgpack('../../allan_data/bluetooth_light_no_nan.msgpack') 

In [6]:
df.head() 

Unnamed: 0_level_0,rssi,scanned_user,user
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-05 10:51:22,-65,u0069,u0182
2013-12-05 10:51:22,-80,u0331,u0182
2013-12-05 10:51:22,-68,u0702,u0182
2013-12-05 10:51:22,-62,u0535,u0182
2013-12-05 10:51:22,-73,u0732,u0182


In [7]:
df['hour'] = df.index.hour
print("Done computing hour")
df['weekday'] = df.index.weekday
print("Done computing weekday")
df['before_workday'] = df.weekday.isin({0, 1, 2, 3, 6})  # is it monday, tuesday, wendnesday, thursday or sunday?
print("Done computing before_workday")
df['free_time'] = (19 < df.hour) | (df.hour < 7)
print("Done computing free_time")


Done computing hour
Done computing weekday
Done computing before_workday
Done computing free_time


In [8]:
df.shape

(37949304, 7)

In [9]:
df.head() 

Unnamed: 0_level_0,rssi,scanned_user,user,hour,weekday,before_workday,free_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-12-05 10:51:22,-65,u0069,u0182,10,3,True,False
2013-12-05 10:51:22,-80,u0331,u0182,10,3,True,False
2013-12-05 10:51:22,-68,u0702,u0182,10,3,True,False
2013-12-05 10:51:22,-62,u0535,u0182,10,3,True,False
2013-12-05 10:51:22,-73,u0732,u0182,10,3,True,False


In [10]:
dfti = df[df.user == 'u0182'].index

In [11]:
dftiu = dfti.unique()
dftiu = dftiu.sort_values() 

In [12]:
dftiu.dtype

dtype('<M8[ns]')

In [13]:
index_delta = list()
for i in range(len(dftiu) - 1):
    index_delta.append(dftiu[i+1] - dftiu[i])
index_delta = pd.Series(index_delta)

In [14]:
index_delta.describe(include='all')

count                      6427
mean     0 days 01:44:26.905865
std      0 days 14:06:33.015288
min             0 days 00:01:04
25%             0 days 00:05:00
50%             0 days 00:05:02
75%             0 days 00:05:14
max            20 days 05:13:17
dtype: object

Krav: Folk skal være sammen i mindst 4 timer før det tæller som hænge ud sammen.

Sample try:

```
dfs = df.sample(4500)
dfs = dfs[dfs.before_workday & dfs.free_time] 

dfs.groupby(['user', dfs.index.weekday_name]).rssi.count()
```

In [16]:
# dfs = df.sample(20)
dfs = df[df.before_workday & df.free_time] 
# dfs['user_id'] = dfs.scanned_user.replace(np.NaN, dfs.bt_mac)
# dfs['scanned_user'] = dfs.scanned_user.replace(np.NaN, 'unknown')

In [17]:
dfs = dfs.reset_index().set_index('user')

In [18]:
dfs.head() 

Unnamed: 0_level_0,timestamp,rssi,scanned_user,hour,weekday,before_workday,free_time
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u0182,2014-02-06 06:43:05,-93,u0238,6,3,True,True
u0182,2014-02-06 06:58:10,-94,u0381,6,3,True,True
u0182,2014-02-06 06:58:10,-82,u0363,6,3,True,True
u0182,2014-02-13 06:44:40,-87,u0920,6,3,True,True
u0182,2014-02-13 06:54:40,-93,u0574,6,3,True,True


In [42]:
tmp = dfs.iloc[:4000]

Check that timestamps and timedaltas can be used for binning/slicing

In [43]:
tmp.timestamp.iloc[0]

Timestamp('2014-02-06 06:43:05')

In [44]:
tmp.timestamp.iloc[0] + pd.Timedelta(4, unit='h') 

Timestamp('2014-02-06 10:43:05')

In [45]:
(
    ( tmp.timestamp.iloc[0] <= tmp.timestamp ) &
    ( tmp.timestamp <= (tmp.timestamp.iloc[0] + pd.Timedelta(4, unit='h')) )
).head(10) 

user
u0182     True
u0182     True
u0182     True
u0182    False
u0182    False
u0182    False
u0182    False
u0182    False
u0182    False
u0182    False
Name: timestamp, dtype: bool

In [46]:
df.head() 

Unnamed: 0_level_0,rssi,scanned_user,user,hour,weekday,before_workday,free_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-12-05 10:51:22,-65,u0069,u0182,10,3,True,False
2013-12-05 10:51:22,-80,u0331,u0182,10,3,True,False
2013-12-05 10:51:22,-68,u0702,u0182,10,3,True,False
2013-12-05 10:51:22,-62,u0535,u0182,10,3,True,False
2013-12-05 10:51:22,-73,u0732,u0182,10,3,True,False


In [107]:
def concatenater(args, frac=0.85):
    vc = args.value_counts()
    return set(vc[vc >= frac*vc.max()].index)

def mostly_present_counter(args):
    return len(concatenater(args))
    #con_len = len(concatenater(args))
    # return int(con_len) if con_len else None 

In [108]:
tmp2.iloc[18, 0]

set()

In [109]:
tmp2 = df.iloc[:1000][['user', 'scanned_user']].resample('4h', closed='left').agg(concatenater)
tmp2

Unnamed: 0_level_0,user,scanned_user
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-12-05 08:00:00,{u0182},{u0083}
2013-12-05 12:00:00,{u0182},{u0020}
2013-12-05 16:00:00,{},{}
2013-12-05 20:00:00,{},{}
2013-12-06 00:00:00,{},{}
2013-12-06 04:00:00,{},{}
2013-12-06 08:00:00,{},{}
2013-12-06 12:00:00,{},{}
2013-12-06 16:00:00,{},{}
2013-12-06 20:00:00,{},{}


In [85]:
dfs2 = df[df.free_time & df.before_workday].drop(['before_workday'], ).reset_index() 

In [86]:
dfs2 = dfs2.set_index(['user', 'timestamp'])

In [88]:
dfs2.head(12) 

Unnamed: 0_level_0,Unnamed: 1_level_0,rssi,scanned_user,hour,weekday,before_workday,free_time
user,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u0182,2014-02-06 06:43:05,-93,u0238,6,3,True,True
u0182,2014-02-06 06:58:10,-94,u0381,6,3,True,True
u0182,2014-02-06 06:58:10,-82,u0363,6,3,True,True
u0182,2014-02-13 06:44:40,-87,u0920,6,3,True,True
u0182,2014-02-13 06:54:40,-93,u0574,6,3,True,True
u0182,2014-02-13 06:54:40,-92,u0097,6,3,True,True
u0182,2014-02-13 06:54:40,-85,u1040,6,3,True,True
u0182,2014-02-13 06:54:40,-91,u0432,6,3,True,True
u0182,2014-02-13 06:59:40,-90,u0098,6,3,True,True
u0182,2014-02-13 06:59:40,-64,u0433,6,3,True,True


In [94]:
tmp3 = dfs2.loc['u0182'].iloc[:1000]['scanned_user'].resample('4h', closed='left').agg(concatenater)
tmp3.head(12)

timestamp
2014-02-06 04:00:00    {u0238, u0363, u0381}
2014-02-06 08:00:00                       {}
2014-02-06 12:00:00                       {}
2014-02-06 16:00:00                       {}
2014-02-06 20:00:00                       {}
2014-02-07 00:00:00                       {}
2014-02-07 04:00:00                       {}
2014-02-07 08:00:00                       {}
2014-02-07 12:00:00                       {}
2014-02-07 16:00:00                       {}
2014-02-07 20:00:00                       {}
2014-02-08 00:00:00                       {}
Freq: 4H, Name: scanned_user, dtype: object

In [97]:
df2 = df[df.before_workday & df.free_time]

In [101]:
tmp4 = df2.iloc[:8000].groupby('user')['scanned_user'].resample('4h', closed='left').agg(concatenater)

In [103]:
pd.DataFrame(tmp4)

Unnamed: 0_level_0,Unnamed: 1_level_0,scanned_user
user,timestamp,Unnamed: 2_level_1
u0182,2014-02-06 04:00:00,"{u0238, u0363, u0381}"
u0182,2014-02-06 08:00:00,{}
u0182,2014-02-06 12:00:00,{}
u0182,2014-02-06 16:00:00,{}
u0182,2014-02-06 20:00:00,{}
u0182,2014-02-07 00:00:00,{}
u0182,2014-02-07 04:00:00,{}
u0182,2014-02-07 08:00:00,{}
u0182,2014-02-07 12:00:00,{}
u0182,2014-02-07 16:00:00,{}
