In [None]:
# import warnings
# warnings.filterwarnings('ignore') 

import os
import sys
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use("bmh")
%matplotlib inline
import missingno as msno
import itertools
from IPython.display import display
from scipy.optimize import leastsq

from speclib.userActivityFunctions import getComdataMean
from speclib.loaders import loadUserPhonenumberDict, getUserList, Useralias, loadUserParallel, dict2DataFrame, users2DataFrame
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot

Prepare to load users

In [None]:
# Build one (username, alias, communication types) spec per user for the loader.
userLst = getUserList()
useralias = Useralias()
comTypes = ('call', 'sms')
userSpec = [(name, useralias[name], comTypes) for name in userLst]

# Show the first ten specs as a sanity check.
for spec in userSpec[:10]:
    print(spec)

In [None]:
# Load the data described by userSpec (call and sms for every user).
# NOTE(review): n=15 presumably sets the number of parallel workers — confirm in speclib.loaders.
users = loadUserParallel(userSpec, n=15) 

Turn the loaded user data into a DataFrame

In [None]:
# Convert the loaded user data into a single DataFrame.
# NOTE(review): processes=15 presumably sets the worker-process count — confirm in speclib.loaders.
df = users2DataFrame(users, useralias, processes=15) 
# Drop the name bound to the raw loader output; only df is used from here on.
del users

In [None]:
# Show the first and last rows of the combined DataFrame.
display(df.head(), df.tail()) 

Apparently there are messages with no receiving number…

In [None]:
# Pick out the SMS rows of user u0645 whose body equals one specific hash value.
smsU0645 = df.loc["u0645", "sms"]
idx = smsU0645.body == "cc0bf55fbc000c9ffa5ca348a1724744ae704ae0"

smsU0645[idx]

# Some plotting 

Plot the activity of all users' calls and SMS messages side by side. Each category sums to 1.

In [None]:
# Fraction of all calls / all SMS falling on each weekday (each series sums to 1).
callWeek = df.xs('call', level=1).weekday.value_counts()
callWeek = callWeek / callWeek.sum()
smsWeek = df.xs('sms', level=1).weekday.value_counts()
smsWeek = smsWeek / smsWeek.sum()

weekdayNames = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

# Draw the two series as side-by-side bars.
fig, ax = plt.subplots(figsize=(16, 6))
smsBars = {'y': smsWeek.sort_index(), 'label': 'SMS'}
callBars = {'y': callWeek.sort_index(), 'label': 'Call'}
barSBS(ax, smsBars, callBars)
ax.set_ylabel("Fractional normalized activity")
# Ticks are offset by 0.35 from the integer positions 0..6.
ax.set_xticks(np.arange(7) + 0.35)
ax.set_xticklabels(weekdayNames, rotation=45);
ax.set_xlim((-0.05, 6.8));

Index into a sublevel of the MultiIndex like so:

In [None]:
# Cross-section of df where index level 1 equals 'sms'.
sms = df.xs('sms', level=1)
# Show the first and last rows of that cross-section.
display(sms.head(), sms.tail()) 

In [None]:
# Fraction of all calls / all SMS falling in each hour of the day (each series sums to 1).
hourCounts = {comtype: df.xs(comtype, level=1).hour.value_counts()
              for comtype in ('call', 'sms')}
call = hourCounts['call'] / hourCounts['call'].sum()
sms = hourCounts['sms'] / hourCounts['sms'].sum()

In [None]:
# Side-by-side bars of the hourly SMS and call activity fractions.
fig, ax = plt.subplots(figsize=(16, 6))
d0 = {'y': sms.sort_index(), 'label': 'SMS'}
d1 = {'y': call.sort_index(), 'label': 'Call'}
barSBS(ax, d0, d1)
ax.set_ylabel("Fractional normalized activity")
ax.set_xlabel("Hour of day")
# np.arange already yields integers; the removed `np.int` alias (NumPy >= 1.24
# raises AttributeError on it) is not needed.
ax.set_xticks(np.arange(0, 24) + 0.35)
ax.set_xticklabels(["%d" % i for i in range(24)])
ax.set_xlim((-0.15, 23.89));

Count the number of times each user calls or texts each unique phone number

In [None]:
# Per-user contact-frequency table, cached on disk because the computation is slow.
comFreqCache = 'comFreq.pandasPickle'
if os.path.isfile(comFreqCache):
    # NOTE(review): unpickling executes arbitrary code; only load a cache file this notebook wrote.
    comFreq = pd.read_pickle(comFreqCache)  # Computation is slow
else:
    userIndex = df.index.get_level_values('user').unique()
    comFreq = pd.DataFrame(index=userIndex, columns=('sms', 'call'))
    comFreq.columns.name = 'comtype'
    for user in userIndex:
        # Only the communication types this user actually has; other cells stay NaN.
        for comtype in df.loc[user].index.unique():
            # Array with the number of events per distinct phone number (most frequent first).
            comFreq.loc[user, comtype] = df.loc[user, comtype].number.value_counts().values
    # Number of distinct phone numbers; 0 when the user has no events of that type (NaN cell).
    counterLambda = lambda x: 0 if np.any(pd.isnull(x)) else x.size
    comFreq['smsUnique'] = comFreq.sms.apply(counterLambda)
    comFreq['callUnique'] = comFreq.call.apply(counterLambda)
    # Persist the result so the cache branch above is actually taken on the next run.
    comFreq.to_pickle(comFreqCache)

In [None]:
# Show the first rows of the per-user contact-frequency table.
comFreq.head() 

Plot the distribution

In [None]:
# One faint line per user showing their per-number call counts, plus the mean signal on top.
fig, ax = plt.subplots(figsize=(16,6))
nMaxEvents = 250
meanColor = '#d64413'
for user in comFreq.index:
    userCalls = comFreq.loc[user].call
    try:
        ax.semilogy(userCalls[:nMaxEvents], 'k-', alpha=0.04)
    except TypeError:
        # Length-1 entries end up as floats, which cannot be sliced; plot them as they are.
        ax.semilogy(userCalls, 'k-', alpha=0.04)
ax.grid(which='minor')
callMean = getComdataMean(comFreq, 'call', 'callUnique')
ax.semilogy(callMean[:nMaxEvents], color=meanColor, label='Mean signal')
ax.set_xlabel('Call #')
ax.set_ylabel("Number of calls to number")
ax.set_title("Communication from calls")
ax.legend()

In [None]:
# One faint line per user showing their per-number SMS counts, plus the mean signal on top.
fig, ax = plt.subplots(figsize=(16,6))
nMaxEvents = 250
meanColor = '#d64413'
for user in comFreq.index:
    userSms = comFreq.loc[user].sms
    try:
        ax.semilogy(userSms[:nMaxEvents], 'k-', alpha=0.04)
    except TypeError:
        # Length-1 entries end up as floats, which cannot be sliced; plot them as they are.
        ax.semilogy(userSms, 'k-', alpha=0.04)
ax.grid(which='minor')
smsMeanSignal = getComdataMean(comFreq, 'sms', 'smsUnique')
ax.semilogy(smsMeanSignal[:nMaxEvents], color=meanColor, label="Mean signal")
ax.set_xlabel('SMS #')
ax.set_ylabel("Number of SMS' to number")
ax.set_title("Communication from SMS'")
ax.legend()

Attempt a fit of the data, as I suspect it could be fitted well with a power law.

While the fit did converge, I'm not convinced that it's better than my own guess… are the results weighted towards the lower end, or something like that? Or did my algorithm just converge on a local minimum?

In [None]:
# Fit a power law y = a * x**b to the mean SMS signal and compare it with a hand-picked guess.
cut = 4  # number of leading points dropped before fitting
smsMean = getComdataMean(comFreq, 'sms', 'smsUnique')
smsMean = smsMean[cut:]
x = np.arange(len(smsMean), dtype=np.double)
y = np.asarray(smsMean, dtype=np.double)

# x**b is infinite at x == 0 for b < 0, and non-finite data would poison the
# optimiser, so only points with x > 0 and finite y take part in the fit.
valid = (x > 0) & np.isfinite(y)
xFit, yFit = x[valid], y[valid]


def errfunc(p, x, y):
    """Return the residuals between the data y and the power law p[0] * x**p[1].

    leastsq squares and sums the returned vector itself, so this must be the
    plain (signed) difference, not an already squared or rooted quantity.
    """
    return y - p[0] * x**p[1]


fit, _ = leastsq(errfunc, (85, -0.7), args=(xFit, yFit))
fig, ax = plt.subplots()
ax.semilogy(x, smsMean, label='SMS data')
# The power-law curves are drawn on x > 0 only, where they are finite.
ax.semilogy(xFit, fit[0]*xFit**fit[1], label=r'Fit: $%.2f x^{%.2f}$' % tuple(fit))
ax.semilogy(xFit, 330*xFit**-1.05, label=r"My guess: $330.0 x^{-1.05}$")
ax.legend()
ax.grid(which='minor')

# Clean the data

Check the min and max dates… seems some users didn't set the time on their phones
 $a^2$

In [None]:
# Show the earliest and latest timestamp in the whole data set.
display(df.timestamp.min(), df.timestamp.max() ) 

Print the aliases of all users who have events before 2013 or after 2015, along with their min and max dates

In [None]:
# List every user whose first event is before 2013 or whose last event is after 2015.
for user in df.index.get_level_values('user').unique():
    userTimes = df.loc[user].timestamp
    firstEvent, lastEvent = userTimes.min(), userTimes.max()
    if firstEvent.year < 2013 or lastEvent.year > 2015:
        print(user, firstEvent, lastEvent, sep="\t"*2)

Add a "year" column to the DataFrame, and plot a bar chart over the years

In [None]:
# Calendar year of every event, extracted with the vectorized datetime accessor
# instead of a per-row Python lambda. pd.to_datetime leaves an existing
# datetime64 column unchanged and converts an object column of datetimes.
df['year'] = pd.to_datetime(df.timestamp).dt.year

Seems like there was some activity as early as 2011. 1970 probably stems from reset phones counting from Unix time 0, and will be removed along with events which "occurred" in 1980 and 2021.

In [None]:
# Number of events per year, as a log-scaled bar chart and as a table.
yearCounts = df.year.value_counts().sort_index()
ax = yearCounts.plot.bar()
ax.set_yscale('log')
countsOnBarPlot(ax)
ax.set_ylabel("Number of communication events")
ax.set_xlabel("Year")
display(yearCounts)

Remove the faulty rows, and ensure that the number of removed rows corresponds to the number of rows matched by the mask

In [None]:
# Drop events dated before 2011 or after 2016, then check the row bookkeeping.
rowsBefore = len(df)
mask = (df.year < 2011) | (df.year > 2016)
df = df.loc[~mask]
rowsRemoved = rowsBefore - len(df)
# True when exactly the masked rows were removed.
rowsRemoved == mask.sum()

# Time-bin users' activity

In [None]:
# User ids u0001 … u0009, used as a small test subset.
ul = ["u{:04d}".format(number) for number in range(1, 10)]
ul

In [None]:
# Restrict df to the users listed in ul.
dfs = df.loc[ul]  
# Show which users are present in the subset.
dfs.index.get_level_values('user').unique() 

In [None]:
# Show the first rows of the user subset.
dfs.head() 

In [None]:
# IPython `?` lookup: shows the docstring of dfs.resample.
# NOTE(review): looks like exploratory leftover — consider removing before sharing the notebook.
dfs.resample?

Make a matrix with a row for each user and a column for every hour, ranging from `df.timestamp.min()` to `df.timestamp.max()`.

Next, make a function which calculates the column index based on the datetime... maybe cast the `datetime` to an `int` (seconds) and do some integer-division arithmetic like `(timeInt - offset) // 3600`, which gives the number of whole hours since `offset`.

If necessary, combine bins at nighttime afterwards.