In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import bottleneck as bn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.approximation as nxa
import igraph as ig
# http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
from networkx.drawing.nx_agraph import graphviz_layout
from sklearn import decomposition
%matplotlib inline

# Notebook-wide matplotlib defaults (font sizes and default figure size),
# applied in a single call; each key is validated by rcParams when it is set.
mpl.rcParams.update({
    'font.size': 13.0,
    'legend.fontsize': 13.0,
    'axes.labelsize': 12.0,
    'axes.titlesize': 15.0,
    'figure.figsize': [16.0, 7.0],
    'figure.titlesize': 'large',
    'xtick.labelsize': 13.0,
    'ytick.labelsize': 13.0,
})

import missingno as msno
from collections import defaultdict, Counter

from IPython.display import display

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot, plotNeatoGraph
from speclib.graph import networkx2igraph, igraph2networkx


%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,sklearn,missingno,networkx,igraph

Load user SMS and call data…

In [None]:
ua = Useralias()

# One (user, alias, data types) triple per user; both SMS and call records
# are requested for everybody returned by getUserList().
userSpec = [(usr, ua[usr], ('sms', 'call')) for usr in getUserList()]

# The loaded per-user data is passed straight into the frame constructor, so
# no temporary reference to it is left behind in the notebook namespace.
df = users2DataFrame(loadUserParallel(userSpec), ua)

# Lookup used below to translate a contacted number into a user alias.
phonebook = loadUserPhonenumberDict(ua)

# Cleaning data

Make a subset of the data containing only communications within the Social Fabric project. From this data, select a subset containing the most active users, preferably users who communicate with each other… a clique percolation algorithm could be used for this, but that won't be the initial approach.

A measure of the activity could simply be $$a = \sum_{\text{i}}\frac{\mathrm{user_{sms}}_i}{\sum_i \mathrm{user_{sms}}_i} + \frac{\mathrm{user_{call}}_i}{\sum_i \mathrm{user_{call}}_i}$$
but this could yield a huge $a$ for a very active, yet weakly connected user, so a weighting with the number of contacted people should be introduced.

Since a conversation using SMS (usually) registers as several events for both users, whereas a conversation carried out over a call registers as one event, a weighting should be introduced.
The easy solution is to divide each adjacency matrix by the sum of all its entries, so that the elements of each matrix add up to one.
Yet another approach would be to clean the SMS data in the following way:

1. Investigate the distribution of time between a SMS and a reply to it.
2. Use the distribution to determine a typical reply time.
3. Remove entries in the SMS data which weren't replied to within some number, say 3, times the average reply time.

Cleaning the SMS data as proposed above, should also prompt for a similar cleaning of the call data.
An obvious way would be to remove unanswered calls, although the SMS dataset should also be checked for an "answer".

## Doing the data munging

Remove rows for which the contacted number is not present in `phonebook` (userhash to phonehash translation table).

Also add a column which contains the user alias (`u0001`, `u0345` and so on) for the contacted user.

In [None]:
# Boolean mask: True for events whose contacted number has an entry in `phonebook`.
isKnownNumber = df.number.map(lambda num: num in phonebook)
df = df[isKnownNumber]

# Translate every remaining number through `phonebook` into the contacted user.
df['contactedUser'] = df.number.map(lambda num: phonebook[num])
df.head()

Construct DataFrames for call and SMS data, where the index is the user initiating contact, and the columns are the users targeted by said contact.

In [None]:
def countUniqueContacts(frame, user, channel):
    """Return the number of distinct contacted users for `user` on `channel`.

    `frame` is indexed so that ``frame.loc[user, channel]`` selects the
    events of one user for one communication type ('call' or 'sms').
    A user without any events of that type raises KeyError on lookup and
    is counted as having zero contacts.
    """
    try:
        return frame.loc[user, channel].contactedUser.unique().size
    except KeyError:
        return 0


# One {user: count} mapping per communication channel.
uniqueCounts = {'call': dict(), 'sms': dict()}
for user in df.index.get_level_values('user').unique():
    for channel, counts in uniqueCounts.items():
        counts[user] = countUniqueContacts(df, user, channel)

# Rows are users, columns are channels; the column order is fixed explicitly.
userUniqueComm = pd.DataFrame(uniqueCounts, columns=['call', 'sms'])
userUniqueComm['total'] = userUniqueComm.sms + userUniqueComm.call
del uniqueCounts

# DataFrame.sort(columns=...) has been removed from pandas; sort_values(by=...)
# is the replacement. Users with the most unique contacts come first.
userUniqueComm = userUniqueComm.sort_values(by='total', ascending=False)
display(userUniqueComm.head(), userUniqueComm.describe())

In [None]:
# Left panel: all three count columns as lines; right panel: call and SMS
# counts as a stacked area chart.
fig, (ax0, ax1) = plt.subplots(1, 2)
userUniqueComm.plot(kind='line', ax=ax0)
userUniqueComm.loc[:, ['call', 'sms']].plot(kind='area', ax=ax1)
fig.suptitle('Unique users conacted using Calls and SMS')

In [None]:
# Build one adjacency table per channel: rows are the users initiating
# contact, columns are the contacted users, entries are event counts.
userIndex = df.index.get_level_values('user').unique()
adCall = dict()
adSms = dict()

for user in userIndex:
    # Channels present for this user; a missing channel is simply skipped.
    userChannels = df.loc[user].index
    if 'call' in userChannels:
        cnt = df.loc[user, 'call'].contactedUser.value_counts()
        adCall[user] = cnt.to_dict()
    if 'sms' in userChannels:
        cnt = df.loc[user, 'sms'].contactedUser.value_counts()
        adSms[user] = cnt.to_dict()

# pd.DataFrame(dict_of_dicts) puts the OUTER keys (the initiating users) on
# the columns. Transpose so that the initiators end up on the index, which is
# what the axis names below (and the drop step further down) assume.
adCall = pd.DataFrame(adCall).T
adSms = pd.DataFrame(adSms).T
adCall.index.name = 'userInit'
adSms.index.name = 'userInit'
adCall.columns.name = 'userRec'
adSms.columns.name = 'userRec'

# Drop contacted users which aren't present in the index (contact initiating users).
adCall = adCall.loc[:, adCall.columns.isin(adCall.index)]
adSms = adSms.loc[:, adSms.columns.isin(adSms.index)]

Normalize the contributions for each dataset, such that $\sum_{\text{all entries}} = 1$

In [None]:
# Normalisation is currently disabled: both statements below are commented
# out, so this cell leaves adCall and adSms unchanged.
# adCall /= np.nansum(adCall.values)
# adSms /= np.nansum(adSms.values)

Add a column where the activity level for each user is summed up.

In [None]:
def addActivityColumn(adjacency):
    """Return a copy of `adjacency` with an 'activity' column, columns sorted.

    The activity is the column-wise sum (axis=0, NaN skipped) of the count
    columns; when stored as a column it is aligned on the row index. Any
    existing 'activity' column is excluded from the sum, so re-running the
    cell does not add the previous result into the new one.

    The columns are reordered by label together with their data.
    'activity' ends up first provided the other labels sort after it, as
    user aliases such as `u0001` do.
    """
    counts = adjacency.loc[:, adjacency.columns != 'activity']
    withActivity = counts.assign(activity=counts.sum(axis=0, skipna=True))
    # Assigning sorted labels to `.columns` would only rename the columns and
    # leave the data where it was; sort_index moves labels and data together.
    return withActivity.sort_index(axis=1)


adCall = addActivityColumn(adCall)
adSms = addActivityColumn(adSms)

display(adCall.head(), adSms.head())

## Experimenting with clique algorithms

Finding users active in cliques.
To do this, the data is loaded into networkx as a graph.

* Two algorithms are used.
* I use the users returned from the biggest groups from both (14 users).
* I also investigate the overlap between the two algorithms with respect to cliques and users.

In [None]:
# Combine call and SMS counts. A plain `+` yields NaN wherever either frame
# has NaN, which would discard every pair that communicated over only one
# channel. With fill_value=0 a value missing in one frame counts as zero;
# cells missing in both frames remain NaN.
adf = adCall.add(adSms, fill_value=0)

In [None]:
# Preview the first rows of the combined call + SMS table.
adf.head() 

In [None]:
# Binary contact table: drop the 'activity' column and flag every positive
# count with 1 (int8 only to keep the displayed table compact).
adfNoActivity = (adf.loc[:, adf.columns != 'activity'] > 0).astype(np.int8)
display(adfNoActivity.head())

# Adjacency lists keyed by the row labels of `adf`. Each list must hold the
# LABELS of the contacted users: passing the raw 0/1 flags would make
# networkx create nodes named 0 and 1 and attach every user to them.
dct = dict()
for iUsr in sorted(adf.index.unique()):  # Loop through the sorted user list
    comSeries = adfNoActivity.loc[iUsr]  # This user's 0/1 contact flags
    dct[iUsr] = comSeries.index[comSeries > 0].tolist()  # Contacted users only
g = nx.from_dict_of_lists(dct)  # Construct the graph
del dct  # Delete temporary variables
del adfNoActivity

In [None]:
# Quick look at the graph with node labels switched on.
nx.draw(
    g,
    node_size=150,
    node_color='lightblue',
    edge_color='lightgray',
    with_labels=True,
)

In [None]:
# NOTE(review): this rebinds `ig`, which the import cell bound to the igraph
# module (`import igraph as ig`); after this cell the module is no longer
# reachable under that name. Consider a different variable name.
ig = networkx2igraph(g)  # presumably an igraph version of `g` — confirm in speclib.graph

igraphCliques = ig.cliques()
# Tally how many of the returned cliques there are of each size.
Counter(map(len, igraphCliques))

Count clique size for the two algorithms

In [None]:
# Materialise all cliques of `g` and tally them by size. The stray
# `mclq[::-1]` statement was dropped: it built a reversed copy that was
# neither stored nor displayed.
mclq = list(nx.algorithms.enumerate_all_cliques(g))
cntMclq = Counter(len(el) for el in mclq)
cntMclq

In [None]:
# Cliques reported by find_cliques, tallied by size.
fclq = [clique for clique in nx.algorithms.find_cliques(g)]
cntFclq = Counter(map(len, fclq))
cntFclq

In [None]:
# Same tally as above, using the recursive variant of the clique search.
fclqr = [clique for clique in nx.algorithms.clique.find_cliques_recursive(g)]
cntFclqr = Counter(map(len, fclqr))
cntFclqr

In [None]:
# k-clique communities for k=3. The function is reached through the
# `community` subpackage, the same path the last cell of this notebook uses,
# instead of relying on it being re-exported from `nx.algorithms`.
kclq = list(nx.algorithms.community.k_clique_communities(g, k=3))
kclq

Verify graph by plotting it...

In [None]:
# Draw the whole graph without labels on a dedicated 16x9 figure.
fig, ax = plt.subplots(figsize=(16, 9))
drawOptions = dict(node_size=50, node_color='steelblue',
                   edge_color='lightgray', alpha=0.65)
nx.draw(g, ax=ax, **drawOptions)

In [None]:
# (rows, columns) of the combined table; the column count includes 'activity'
# if that column is present.
adf.shape

In [None]:
# Number of missing entries per row, ignoring the 'activity' column.
adf.loc[:, adf.columns != 'activity'].isnull().sum(axis=1)

In [None]:
# Materialise the k-clique communities of `g` for k=3 so they are displayed.
list(nx.algorithms.community.k_clique_communities(g, k=3))