In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import re
import itertools
sns.set(context='paper', style='whitegrid', color_codes=True, font_scale=1.8)
colorcycle = [(0.498, 0.788, 0.498),
              (0.745, 0.682, 0.831),
              (0.992, 0.753, 0.525),
              (0.220, 0.424, 0.690),
              (0.749, 0.357, 0.090),
              (1.000, 1.000, 0.600),
              (0.941, 0.008, 0.498),
              (0.400, 0.400, 0.400)]
sns.set_palette(colorcycle)
mpl.rcParams['figure.max_open_warning'] = 65
mpl.rcParams['figure.figsize'] = [12, 7]

from speclib import misc, loaders

pd.set_option('display.max_rows', 55)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)


In [2]:
import pickle

In [3]:
userlist = loaders.getUserList()
ua = loaders.Useralias() 
user = userlist[0]

In [4]:
with open('/lscr_paper/allan/phonenumbers.p', 'rb') as fid:
    phonebook = pickle.load(fid)

phonebook = {k: ua.userdct[v] for (k, v) in phonebook.items() if v in ua.userdct}

## Check bluetooth

* Must be at least 200 hours of interaction

In [23]:
def user_bluetooth_timespan(user, ua):
    ub = loaders.loadUserBluetooth(user, ua)
    if ub is None:
        return pd.Timedelta('0H')
    ub = ub.dropna() 
    bt_timespan = ub.index.max() - ub.index.min()
    return bt_timespan

In [14]:
user_bluetooth_timespan(userlist[0], ua)

[Timedelta('466 days 04:10:04')]

## Check sms

* At least 90 sms data for 90 days
* At least 950 messages

In [15]:
def user_sms_timespan_and_count(user, ua):
    us = loaders.loadUser2(user, dataFilter=('sms',))
    us = pd.DataFrame(us['sms'])
    us = us.rename(columns={'address': 'number'}) 
    us['user'] = us.user.replace(ua.userdct, inplace=None)
    us['timestamp'] = pd.to_datetime(us.timestamp, unit='s', infer_datetime_format=True) 
    us['number'] = us.number.replace(phonebook, inplace=None)
    us = us[us.number.str.len() == 5]  # drop phones not in study

    return [np.ptp(us.timestamp), us.shape[0] ]


In [16]:
user_sms_timespan_and_count(userlist[0], ua)

[Timedelta('411 days 08:01:30'), 149]

## Check calls

* At least 90 call data for 90 days
* At least 180 calls

In [17]:
def user_call_timespan_and_count(user, ua):
    uc = loaders.loadUser2(user, dataFilter=('call',))
    uc = pd.DataFrame(uc['call'])
    
    uc['user'] = uc.user.replace(ua.userdct, inplace=None)
    uc['timestamp'] = pd.to_datetime(uc.timestamp, unit='s', infer_datetime_format=True) 
    uc['number'] = uc.number.replace(phonebook, inplace=None)
    uc = uc[uc.number.str.len() == 5] 
    return [np.ptp(uc.timestamp), uc.shape[0]] 

In [18]:
user_call_timespan_and_count(userlist[0], ua)

[Timedelta('411 days 16:28:54'), 20]

In [36]:
def get_user_data(user, ua)
    try:
        dct = {'user': user}
        dct['useralias'] = ua[user]
        dct['bt_timespan'] = user_bluetooth_timespan(user, ua)
        dct['sms_timespan'], dct['sms_count'] = user_sms_timespan_and_count(user, ua)
        dct['call_timespan'], dct['call_count'] = user_call_timespan_and_count(user, ua)
        return pd.Series(dct)
    except Exception as err:
        print(f"Failed for {ua[user]}")
        print(err)
        return pd.Series({'bt_timespan': None, 'call_count': None, 'call_timespan': None, 'sms_count': None,
                          'sms_timespan': None, 'user': None, 'useralias': None})

In [37]:
df = pd.DataFrame([get_user_data(user, ua) for user in userlist[:2]])

In [38]:
df = df.set_index('useralias')
df

Unnamed: 0_level_0,bt_timespan,call_count,call_timespan,sms_count,sms_timespan,user
useralias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
u0182,466 days 04:10:04,20,411 days 16:28:54,149,411 days 08:01:30,28b76d7b7879d364321f164df5169f
u1015,574 days 17:23:57,68,570 days 19:13:33,276,569 days 03:43:58,f41c375b5c87d3593b963b49a3f9a3


# Do the real computation

The proof-of-concept works

In [56]:
N = len(userlist)

In [59]:
lst = list()
for i, user in enumerate(userlist):
    if i%35 == 0:
        print(f"Processing user {i+1}/{N}")
    ser = get_user_data(user, ua)
    lst.append(ser)

Processing user 1/1059
Failed for u0346
'DataFrame' object has no attribute 'user'
Failed for u0316
'DataFrame' object has no attribute 'user'
Failed for u0986
'DataFrame' object has no attribute 'user'
Failed for u0793
'DataFrame' object has no attribute 'user'
Failed for u0328
'DataFrame' object has no attribute 'user'
Processing user 36/1059
Failed for u0254
'DataFrame' object has no attribute 'user'
Failed for u0162
'DataFrame' object has no attribute 'user'
Failed for u0002
'DataFrame' object has no attribute 'user'
Failed for u0438
'DataFrame' object has no attribute 'user'
Failed for u0402
'DataFrame' object has no attribute 'user'
Failed for u1044
'DataFrame' object has no attribute 'user'
Failed for u0313
'DataFrame' object has no attribute 'user'
Failed for u0243
'DataFrame' object has no attribute 'user'
Processing user 71/1059
Failed for u0227
'DataFrame' object has no attribute 'user'
Failed for u0379
'DataFrame' object has no attribute 'user'
Failed for u0765
'DataFrame' 

Processing user 631/1059
Failed for u0587
'DataFrame' object has no attribute 'user'
Failed for u0191
'DataFrame' object has no attribute 'user'
Failed for u0689
'DataFrame' object has no attribute 'user'
Failed for u0062
'DataFrame' object has no attribute 'user'
Failed for u0487
'DataFrame' object has no attribute 'user'
Failed for u0365
'DataFrame' object has no attribute 'user'
Failed for u0824
'DataFrame' object has no attribute 'user'
Processing user 666/1059
Failed for u0756
'DataFrame' object has no attribute 'user'
Failed for u0517
'DataFrame' object has no attribute 'user'
Failed for u0930
'DataFrame' object has no attribute 'user'
Failed for u0469
'DataFrame' object has no attribute 'user'
Failed for u0036
'DataFrame' object has no attribute 'user'
Failed for u0223
'DataFrame' object has no attribute 'user'
Failed for u0992
'DataFrame' object has no attribute 'user'
Processing user 701/1059
Failed for u0359
'DataFrame' object has no attribute 'user'
Failed for u0054
'DataFra

In [68]:
df = pd.DataFrame(lst)

  union = _union_indexes(indexes)
  result = result.union(other)


In [74]:
df = df.drop([0, 1, 2, 3, 4, 5, 6], axis=1)

In [76]:
df = df.dropna() 

In [80]:
df = df.set_index('useralias')

In [81]:
df.head()

Unnamed: 0_level_0,bt_timespan,call_count,call_timespan,sms_count,sms_timespan,user
useralias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
u0182,466 days 04:10:04,20.0,411 days 16:28:54,149.0,411 days 08:01:30,28b76d7b7879d364321f164df5169f
u1015,574 days 17:23:57,68.0,570 days 19:13:33,276.0,569 days 03:43:58,f41c375b5c87d3593b963b49a3f9a3
u0778,508 days 22:29:51,4.0,353 days 13:59:21,24.0,729 days 18:24:48,b600d2468831e0b3480a69af903dfc
u0933,514 days 02:33:14,145.0,498 days 00:14:03,200.0,498 days 05:41:58,dbbf1adb3264464838b938429d674b
u0256,305 days 07:02:31,20.0,283 days 06:59:02,121.0,292 days 01:19:01,3b6887b57d97d1ff1e1a674abfa5e3


In [82]:
dfc = df.copy()

In [84]:
dfc['sms_timespan'] = dfc.sms_timespan >= pd.Timedelta('90D') 

In [85]:
dfc['call_timespan'] = dfc.call_timespan >= pd.Timedelta('90D') 

In [87]:
dfc['call_count'] = dfc.call_count >= 170

In [92]:
dfc['sms_count'] = dfc.sms_count >= 950

In [95]:
dfc['bt_timespan'] = dfc.bt_timespan > pd.Timedelta('200H')

In [96]:
dfc.head() 

Unnamed: 0_level_0,bt_timespan,call_count,call_timespan,sms_count,sms_timespan,user
useralias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
u0182,True,False,True,False,True,28b76d7b7879d364321f164df5169f
u1015,True,False,True,False,True,f41c375b5c87d3593b963b49a3f9a3
u0778,True,False,True,False,True,b600d2468831e0b3480a69af903dfc
u0933,True,False,True,False,True,dbbf1adb3264464838b938429d674b
u0256,True,False,True,False,True,3b6887b57d97d1ff1e1a674abfa5e3


In [100]:
dfc.drop('user', axis=1).all(axis=1).sum() 

61