In [1]:
import sys 
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Seaborn theme: 'paper' context on a white grid, colour codes enabled, fonts scaled by 1.8.
sns.set(context='paper', style='whitegrid', color_codes=True, font_scale=1.8)

# Eight RGB triples (components in [0, 1]) installed as the default colour palette.
colorcycle = [
    (0.498, 0.788, 0.498),
    (0.745, 0.682, 0.831),
    (0.992, 0.753, 0.525),
    (0.220, 0.424, 0.690),
    (0.749, 0.357, 0.090),
    (1.000, 1.000, 0.600),
    (0.941, 0.008, 0.498),
    (0.400, 0.400, 0.400),
]
sns.set_palette(colorcycle)

# Matplotlib defaults: open-figure warning threshold of 65 and a [12, 7] default figure size.
mpl.rcParams.update({
    'figure.max_open_warning': 65,
    'figure.figsize': [12, 7],
})

from speclib import misc, plotting, loaders

%matplotlib inline 

In [2]:
# Load the scan records from a msgpack dump; the path is relative to the notebook's working directory.
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0 — confirm the
# pinned pandas version, or re-export the data to another format before upgrading.
df = pd.read_msgpack('../../allan_data/bluetooth_light_no_nan.msgpack') 

In [3]:
# Preview the first rows of the loaded frame.
df.head() 

Unnamed: 0_level_0,rssi,scanned_user,user
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-05 10:51:22,-65,u0069,u0182
2013-12-05 10:51:22,-80,u0331,u0182
2013-12-05 10:51:22,-68,u0702,u0182
2013-12-05 10:51:22,-62,u0535,u0182
2013-12-05 10:51:22,-73,u0732,u0182


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(37949304, 3)

In [5]:
# Derive calendar features from the DatetimeIndex and store them as new columns on df.
df['hour'] = df.index.hour
print("Done computing hour")
df['weekday'] = df.index.weekday
print("Done computing weekday")

# True for Sunday (6) and Monday-Thursday (0-3), i.e. days that are followed by a workday.
before_workday = df['weekday'].isin([6, 0, 1, 2, 3])
print("Done computing before_workday")

# True for hours strictly after 19 (20-23) or strictly before 7 (0-6).
free_time = (df['hour'] > 19) | (df['hour'] < 7)
print("Done computing free_time")


Done computing hour
Done computing weekday
Done computing before_workday
Done computing free_time


In [6]:
# Timestamps (index values) of every row recorded by user 'u0182'.
dfti = df.loc[df['user'] == 'u0182'].index

In [7]:
# De-duplicate the timestamps and put them in ascending order.
dftiu = dfti.unique().sort_values()

In [8]:
# Check the dtype of the unique, sorted timestamps.
dftiu.dtype

dtype('<M8[ns]')

In [9]:
# Gaps between consecutive unique timestamps, computed in a single vectorised call rather
# than a Python-level loop over the index. np.diff on the datetime64 values yields
# timedelta64 differences; an index with fewer than two entries gives an empty Series.
index_delta = pd.Series(np.diff(dftiu.values))

In [10]:
# Summary statistics of the gaps between consecutive timestamps.
index_delta.describe(include='all')

count                      6427
mean     0 days 01:44:26.905865
std      0 days 14:06:33.015288
min             0 days 00:01:04
25%             0 days 00:05:00
50%             0 days 00:05:02
75%             0 days 00:05:14
max            20 days 05:13:17
dtype: object

Krav: Folk skal være sammen i mindst 2 timer før det tæller som hænge ud sammen, og deres signaler skal observeres i mindst 70 % af tiden før de tæller som at være sammen.

Sample try:

```
dfs = df.sample(4500)
dfs = dfs[dfs.before_workday & dfs.free_time] 

dfs.groupby(['user', dfs.index.weekday_name]).rssi.count()
```

In [12]:
# Keep only the rows where both boolean masks (before_workday and free_time) are True.
# dfs = df.sample(20)
dfs = df[before_workday & free_time] 
# dfs['user_id'] = dfs.scanned_user.replace(np.NaN, dfs.bt_mac)
# dfs['scanned_user'] = dfs.scanned_user.replace(np.NaN, 'unknown')

In [13]:
# Small slice (first 4000 rows) of the filtered frame for quick experiments.
tmp = dfs.iloc[:4000]

Check that timestamps and timedeltas can be used for binning/slicing

In [17]:
# First timestamp in the slice.
tmp.index[0]

Timestamp('2014-02-06 06:43:05')

In [18]:
# Adding a 4-hour Timedelta to a Timestamp yields a shifted Timestamp.
tmp.index[0] + pd.Timedelta(4, unit='h') 

Timestamp('2014-02-06 10:43:05')

In [21]:
# Boolean mask of rows whose timestamp lies within 4 hours (inclusive) of the first
# timestamp; only the first ten entries are shown.
(
    ( tmp.index[0] <= tmp.index ) &
    ( tmp.index <= (tmp.index[0] + pd.Timedelta(4, unit='h')) )
)[:10] 

array([ True,  True,  True, False, False, False, False, False, False, False], dtype=bool)

In [22]:
# Preview the first rows of the filtered frame.
dfs.head() 

Unnamed: 0_level_0,rssi,scanned_user,user,hour,weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-02-06 06:43:05,-93,u0238,u0182,6,3
2014-02-06 06:58:10,-94,u0381,u0182,6,3
2014-02-06 06:58:10,-82,u0363,u0182,6,3
2014-02-13 06:44:40,-87,u0920,u0182,6,3
2014-02-13 06:54:40,-93,u0574,u0182,6,3


In [54]:
def concatenater(args, frac=0.70):
    """Return the set of values in ``args`` that occur almost as often as the most common one.

    Parameters
    ----------
    args : pandas.Series
        Values to tally (anything providing ``value_counts()``).
    frac : float, default 0.70
        A value is kept when its count is at least ``frac`` times the highest count.

    Returns
    -------
    set
        The retained distinct values; empty when ``args`` contains no countable values.
    """
    counts = args.value_counts()
    cutoff = frac * counts.max()
    keep = counts >= cutoff
    return set(counts.index[keep.values])

def mostly_present_counter(args):
    """Return how many distinct values ``concatenater`` keeps for ``args`` (default ``frac``)."""
    kept = concatenater(args)
    return len(kept)

Resampling `df` works, but it's not ideal since the data is not organized on a per-user basis

In [55]:
# Bin the first 1000 rows into left-closed 2-hour windows and reduce each of the two
# columns with concatenater.
tmp2 = (
    df.iloc[:1000][['user', 'scanned_user']]
    .resample('2h', closed='left')
    .agg(concatenater)
)
tmp2.head(7)

Unnamed: 0_level_0,user,scanned_user
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-12-05 10:00:00,{u0182},{u0083}
2013-12-05 12:00:00,{u0182},"{u0020, u0098, u0083}"
2013-12-05 14:00:00,{u0182},{u0020}
2013-12-05 16:00:00,{},{}
2013-12-05 18:00:00,{},{}
2013-12-05 20:00:00,{},{}
2013-12-05 22:00:00,{},{}


A solution where the data is grouped on a per-user basis, and is thus useful for multiprocessing

```
dfs2 = dfs2.set_index(['user', 'timestamp'])

dfs2.head(12) 

tmp3 = dfs2.loc['u0182'].iloc[:1000]['scanned_user'].resample('4h', closed='left').agg(concatenater)
tmp3.head(12)
```

In [56]:
# For the first 3000 filtered rows: group by scanning user, bin each user's rows into
# left-closed 4-hour windows, and reduce scanned_user with concatenater.
tmp3 = (
    dfs.iloc[:3000]
    .groupby('user')[['scanned_user']]
    .resample('4h', closed='left')
    .agg(concatenater)
)

tmp3.head(12)

Unnamed: 0_level_0,Unnamed: 1_level_0,scanned_user
user,timestamp,Unnamed: 2_level_1
u0182,2014-02-06 04:00:00,"{u0238, u0363, u0381}"
u0182,2014-02-06 08:00:00,{}
u0182,2014-02-06 12:00:00,{}
u0182,2014-02-06 16:00:00,{}
u0182,2014-02-06 20:00:00,{}
u0182,2014-02-07 00:00:00,{}
u0182,2014-02-07 04:00:00,{}
u0182,2014-02-07 08:00:00,{}
u0182,2014-02-07 12:00:00,{}
u0182,2014-02-07 16:00:00,{}


In [57]:
# Per-user aggregation of the first 3000 filtered rows: left-closed 4-hour windows,
# with scanned_user reduced by concatenater.
grouped = (
    dfs.iloc[:3000]
    .groupby('user')[['scanned_user']]
    .resample('4h', closed='left')
    .agg(concatenater)
)

grouped.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,scanned_user
user,timestamp,Unnamed: 2_level_1
u0182,2014-02-06 04:00:00,"{u0238, u0363, u0381}"
u0182,2014-02-06 08:00:00,{}
u0182,2014-02-06 12:00:00,{}
u0182,2014-02-06 16:00:00,{}
u0182,2014-02-06 20:00:00,{}
u0182,2014-02-07 00:00:00,{}
u0182,2014-02-07 04:00:00,{}
u0182,2014-02-07 08:00:00,{}
u0182,2014-02-07 12:00:00,{}
u0182,2014-02-07 16:00:00,{}


In [58]:
# Turn windows whose result is an empty set into missing values so they can be dropped later.
# An explicit truthiness check is used instead of Series.replace(set(), ...), whose handling of
# a set-valued `to_replace` is unclear, and np.nan replaces the np.NaN alias that NumPy 2.0
# removed. Values that are already NaN are truthy and pass through, so re-running is harmless.
grouped['scanned_user'] = grouped['scanned_user'].map(
    lambda found: found if found else np.nan
)

In [59]:
# Preview the result after empty sets were replaced by missing values.
grouped.head() 

Unnamed: 0_level_0,Unnamed: 1_level_0,scanned_user
user,timestamp,Unnamed: 2_level_1
u0182,2014-02-06 04:00:00,"{u0238, u0363, u0381}"
u0182,2014-02-06 08:00:00,
u0182,2014-02-06 12:00:00,
u0182,2014-02-06 16:00:00,
u0182,2014-02-06 20:00:00,


In [49]:
# Persist the grouped result to disk as a pickle.
# NOTE(review): this cell's execution count (49) is lower than its neighbours' (59, 60), so the
# saved file may reflect an earlier state of `grouped` — TODO re-run the notebook in order.
grouped.to_pickle('../../allan_data/users_known_nonan_bt_grouped.pkl')

In [60]:
# Number of (user, window) rows that still hold a value after dropping missing entries.
grouped.dropna().shape

(110, 1)