In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import numpy as np
import bottleneck as bn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import decomposition
%matplotlib inline

import missingno as msno
from collections import defaultdict

from IPython.display import display

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot

Load user SMS and call data…

In [None]:
# Build one spec tuple per user: (user, alias of that user, communication types to load).
ua = Useralias()
userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]
userData = loadUserParallel(userSpec)
# Combine the per-user data into a single DataFrame.
# NOTE(review): later cells address df through a 'user' index level followed by a
# communication-type level ('sms'/'call') — confirm against users2DataFrame.
df = users2DataFrame(userData, ua)
del userData  # the raw per-user data is not referenced again once df exists
# Later cells use phonebook to test whether a contacted number is known and to
# look up the user alias belonging to that number.
phonebook = loadUserPhonenumberDict(ua)

# Cleaning data

Make a subset of the data containing only communications within the Social Fabric project. From this data, select a subset containing the most active users, preferably ones who communicate with each other… a clique percolation algorithm could be used for this, but that won't be the initial approach.

A measure of the activity could simply be $$a = \sum_{i}\left(\frac{\mathrm{user_{sms}}_i}{\sum_j \mathrm{user_{sms}}_j} + \frac{\mathrm{user_{call}}_i}{\sum_j \mathrm{user_{call}}_j}\right)$$
but this could yield a huge $a$ for a very active, yet weakly connected user, so a weighting with the number of contacted people should be introduced.

Since a conversation using SMS (usually) registers as several events for both users, whereas a conversation carried out over a call registers as one event, a weighting should be introduced.
The easy solution is to divide each adjacency matrix by the sum of all its entries, meaning that the elements of each matrix add up to one.
Yet another approach would be to clean the SMS data in the following way:

1. Investigate the distribution of time between a SMS and a reply to it.
2. Use the distribution to determine a typical reply time.
3. Remove entries in the SMS data which weren't replied to within some number, say 3, times the average reply time.

Cleaning the SMS data as proposed above, should also prompt for a similar cleaning of the call data.
An obvious way would be to remove unanswered calls, although the SMS dataset should also be checked for an "answer".

## Doing the data munging

Remove rows for which the contacted number is not present in `phonebook` (userhash to phonehash translation table).

Also add a column which contains the useralias (`u0001`, `u0345` and so on) for the contacted user.

In [None]:
# Keep only events whose contacted number is a key of phonebook, i.e. communication
# with a known user. isin/map replace the per-element Python lambdas with the
# vectorized pandas equivalents.
# NOTE(review): assumes phonebook is a plain dict-like mapping (iterating it yields
# its keys) — confirm against loadUserPhonenumberDict.
# .copy() makes the subset an independent frame before a column is added to it.
df = df[df.number.isin(list(phonebook))].copy()
# Translate each contacted number to the alias of the contacted user.
df['contactedUser'] = df.number.map(phonebook)

In [None]:
# Preview the first rows of df, which now carries the contactedUser column.
df.head()

In [None]:
# Cross-sections of df on its second index level: every user's SMS events and
# every user's call events, respectively.
# NOTE(review): neither smsdf nor calldf is referenced again in the visible cells —
# confirm they are still needed.
smsdf = df.loc[pd.IndexSlice[:, 'sms'], :]
calldf = df.loc[pd.IndexSlice[:, 'call'], :]

In [None]:
userIndex = df.index.get_level_values('user').unique()


def _normalizedAdjacency(comtype):
    """Return the normalized adjacency matrix for one communication type.

    Rows are initiating users ('userInit'), columns are contacted users
    ('userRec'). Each cell holds the number of events of type `comtype` from
    the row user to the column user, divided by the total number of events in
    the matrix. Pairs without any event stay NaN.
    """
    adjacency = pd.DataFrame(columns=userIndex, index=userIndex)
    for initiator in userIndex:
        # Skip users that have no events of this communication type at all.
        if comtype not in df.loc[initiator].index:
            continue
        contactCounts = df.loc[initiator, comtype].contactedUser.value_counts()
        for receiver, count in zip(contactCounts.index, contactCounts.values):
            adjacency.loc[initiator, receiver] = count
    # Normalize so that all entries of the matrix add up to one.
    adjacency /= adjacency.sum().sum()
    adjacency.columns.name = 'userRec'
    adjacency.index.name = 'userInit'
    return adjacency


adCall = _normalizedAdjacency('call')
adSms = _normalizedAdjacency('sms')

In [None]:
# Activity of an initiating user = the sum of that user's ROW in the normalized
# adjacency matrix (rows are initiators, columns are contacted users).
# DataFrame.apply works column-wise by default, so the previous
# `apply(lambda row: row.sum())` summed what each user *received* instead;
# sum(axis=1) gives the intended per-initiator total.
# An already existing 'activity' column is left out of the sum, so re-running
# this cell yields the same values.
adCall['activity'] = adCall.drop('activity', axis=1, errors='ignore').sum(axis=1)
adSms['activity'] = adSms.drop('activity', axis=1, errors='ignore').sum(axis=1)

In [None]:
# Preview the call adjacency matrix together with its activity column.
adCall.head()

In [None]:
# Number of users to keep when selecting the most active ones in the cells below.
nMostActive = 10

In [None]:
def getMostActive(activity):
    """Rank positions by activity, least active first.

    Parameters
    ----------
    activity : object exposing `.values` (e.g. a pandas Series)
        Activity scores; missing values (NaN) are ranked as 0.

    Returns
    -------
    numpy.ndarray
        Integer positions that sort the scores in ascending order, so the
        most active entries are found at the END of the array. The input is
        not modified.
    """
    scores = activity.values.astype(np.double)
    return np.argsort(np.where(np.isnan(scores), 0, scores))

In [None]:
# getMostActive sorts ascending, so the last nMostActive positions are the most
# active users; reversing them puts the most active user first.
topCallPositions = getMostActive(adCall.activity)[-nMostActive:][::-1]
topSmsPositions = getMostActive(adSms.activity)[-nMostActive:][::-1]
cdf = adCall.iloc[topCallPositions, :]
sdf = adSms.iloc[topSmsPositions, :]
cdf

In [None]:
# Show the rows of the SMS adjacency matrix for the users selected from the SMS data.
sdf

Only 2 out of 10 users are present among the most active in both the SMS and call datasets

In [None]:
# Users that appear in both the call-based and the SMS-based selection.
cdf.index.intersection(sdf.index)

Try choosing users from the sum of SMS and call activity, thus choosing the same users in both datasets.
Combine the two datasets into one.

In [None]:
# Rank users once, on the summed SMS and call activity, and use the same
# positions for both matrices so both selections contain the same users.
idx = getMostActive(adSms.activity + adCall.activity)
topPositions = idx[-nMostActive:][::-1]  # most active user first
cdf = adCall.iloc[topPositions, :]
sdf = adSms.iloc[topPositions, :]
display(cdf, sdf)

Verify that the same users are used in both dataframes

In [None]:
# Labels present in cdf's index but missing from sdf's index; an empty result
# means every user selected in cdf is also selected in sdf.
cdf.index.difference(sdf.index)

In [None]:
# Combine SMS and call weights into one adjacency matrix.
# A plain `sdf + cdf` yields NaN wherever EITHER operand is NaN, which would
# erase every pair that communicated through only one channel. With
# fill_value=0 a missing value counts as 0 when the other frame has a value;
# a cell stays NaN only when both frames lack it.
adf = sdf.add(cdf, fill_value=0)
adf

Plotting the resulting adjacency matrix... it's very sparse

In [None]:
fig, ax = plt.subplots()
# Drop the activity column by NAME. Its position depends on how the columns were
# ordered/aligned when adf was built (it is appended last to adCall/adSms), so
# a positional slice such as [:, 1:] can remove a user column instead.
adfData = adf.drop('activity', axis=1).values.astype(np.double)
pc = ax.pcolorfast(adfData, cmap=mpl.cm.rainbow)
fig.colorbar(pc)

Removing users which were not contacted, and plotting the new reduced adjacency matrix

In [None]:
fig, ax = plt.subplots()
# Drop the activity column by name; its position among the columns is not guaranteed.
adfNoActivity = adf.drop('activity', axis=1)
# Keep only contacted users, i.e. columns holding at least one non-missing weight.
# Testing the column SUM for null is not reliable: pandas >= 0.22 returns 0 (not
# NaN) for the sum of an all-NaN column, so nothing would be filtered out.
mask = adfNoActivity.notnull().any(axis=0)
masked = adfNoActivity.loc[:, mask]
toPlot = masked.values.astype(np.double)
# Mask the remaining NaN cells so pcolor leaves them blank.
toPlot = np.ma.masked_array(toPlot, mask=np.isnan(toPlot))
pc = ax.pcolor(toPlot, cmap=mpl.cm.plasma)
fig.colorbar(pc)
# Centre one tick on each row and label it with the initiating user.
ax.set_yticks(np.arange(1, masked.shape[0]+1) - 0.5)
ax.set_yticklabels(masked.index)
ax.grid(False)
ax.set_xticklabels([])
ax.set_xlabel('Contacted users')
ax.set_ylabel('Initiating users')

The chosen users are

In [None]:
# Print the label of every selected user (the row labels of adf), one per line.
for user in adf.index:
    print(user)

## Constructing the time series

A time series of each user's activity, binned by quarter day, is constructed.

In [None]:
# Restrict the event data to the selected users (the row labels of adf).
dfa = df.loc[adf.index.tolist()]

In [None]:
# Preview the events of one selected user. The user is taken from dfa itself
# instead of being hard-coded, so the cell does not raise a KeyError when the
# selection of users changes.
dfa.loc[dfa.index.get_level_values('user')[0]].head()

Timebinning is done in the following way:

1. Subtract the minimum timestamp from all times, thus starting communication at time 0.
2. Do integer division with 6*3600 (6 hours worth of seconds) to obtain timebin.

In [None]:
# Shift all times so the earliest event is at 0, then floor-divide by the
# length of a quarter day (6 hours * 3600 seconds) to get the bin number.
dfa['timebin'] = dfa.timeint.sub(dfa.timeint.min()).floordiv(6 * 3600)

In [None]:
# Preview dfa, which now includes the timebin column.
dfa.head()

In [None]:
# Busiest timebins of one selected user. The user is taken from dfa itself
# instead of being hard-coded, so the cell does not raise a KeyError when the
# selection of users changes.
dfa.loc[dfa.index.get_level_values('user')[0]].timebin.value_counts().head()

In [None]:
userIndex = dfa.index.get_level_values('user').unique()
# Every timebin from the first to the last one INCLUSIVE, so that empty bins are
# part of the series. np.arange excludes its stop value, hence the + 1; without
# it the events in the final timebin are silently dropped.
timebins = np.arange(dfa.timebin.min(), dfa.timebin.max() + 1)
# Count the events per timebin for each user and assemble the frame in one go
# instead of growing it row by row.
eventCounts = {user: dfa.loc[user].timebin.value_counts() for user in userIndex}
dfh = (pd.DataFrame(eventCounts)
       .T                                           # rows: users, columns: timebins
       .reindex(index=userIndex, columns=timebins)  # add the empty timebins
       .fillna(0.0))                                # no events in a bin -> 0.0
dfh

Play around with `np.reshape` to ensure that I'm reshaping correctly.

In [None]:
# Small demo: a 4x5 array and the same data flattened row by row into one dimension.
arr = np.arange(20).reshape(4, 5)
display(arr, arr.reshape(-1))

* Remember to normalize?
* Sure about transpose?
* Read up on PCA
* Use decomposition.SparsePCA instead?
* Talk to Joachim about PCA input shape

In [None]:
# Flatten the transposed time-series matrix into a single row of shape
# (1, number of elements in dfh).
# NOTE(review): toPca is not used by the PCA cell that follows, which fits on
# dfh.values.T directly — confirm which input is intended.
toPca = dfh.values.T.reshape((1, dfh.values.size))

In [None]:
# Fit a PCA with all components on the transposed matrix: each timebin (a column
# of dfh) is one sample and each user (a row of dfh) is one feature.
pca = decomposition.PCA().fit(dfh.values.T)
# Fraction of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)