In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import bottleneck as bn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.approximation as nxa
import igraph as ig
# http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
from networkx.drawing.nx_agraph import graphviz_layout
from sklearn import decomposition
%matplotlib inline

import missingno as msno
from collections import defaultdict, Counter

from IPython.display import display, Markdown, HTML

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot, plotNeatoGraph
from speclib.graph import networkx2igraph, igraph2networkx


# Notebook-wide matplotlib defaults (font sizes, figure size and resolution),
# applied to the global rcParams in one call.
mpl.rcParams.update({
    'font.size': 13.0,
    'legend.fontsize': 13.0,
    'axes.labelsize': 12.0,
    'axes.titlesize': 15.0,
    'figure.figsize': [16.0, 7.0],
    'figure.dpi': 300,
    'figure.titlesize': 'large',
    'xtick.labelsize': 13.0,
    'ytick.labelsize': 13.0,
})

%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,sklearn,missingno,networkx,igraph

Load user SMS and call data…

In [None]:
# Build one (user, alias, data types) request per user returned by getUserList().
ua = Useralias()
userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]

# The loaded per-user payload is handed straight to the DataFrame builder, so
# no separate reference to it remains in the kernel afterwards.
df = users2DataFrame(loadUserParallel(userSpec), ua)

# Lookup table used below to translate contacted numbers into user aliases.
phonebook = loadUserPhonenumberDict(ua)

In [None]:
# Preview the first rows of the loaded SMS/call DataFrame.
df.head() 

# Cleaning data

Make a subset of the data only containing communications contained within the Social Frabric project. Of this data, select a subset of the data containing the most active users, preferebly who communicate with each other… a clique percolation algorithm could be used for this, but that won't be the initial approach.

A measure of the activity could simply be $$a = \sum_{\text{i}}\frac{\mathrm{user_{sms}}_i}{\sum_i \mathrm{user_{sms}}_i} + \frac{\mathrm{user_{call}}_i}{\sum_i \mathrm{user_{call}}_i}$$
but this could yield a huge $a$ for a very active, yet weakly connected user, so a weighting with the number of contacted people shoud be introduced.

Since a conversation using SMS regesters as several events for both users (usually), whereas a conversation carried out over a call registes as one event, a weighting should be introduced.
The easy solution is to divide the adjacency matrices with the sum of all the entries, meaning that the sum of all the elements would both add up to one.
Yet another approach would be to clean the SMS data in the following way:

1. Investigate the distribution of time between a SMS and a reply to it.
2. Use the distribution to determining a typical reply time.
3. Remove entries in the SMS data which weren't replied to within some number, say 3, times the average reply time.

Cleaning the SMS data as proposed above, should also prompt for a similar cleaning of the call data.
An obvious way would be to remove unansvered calls, albeit the SMS dataset should also be checked for an "answer".

## Doing the data munging

Remove rows for which the contacted number is not present in `phonebook` (userhash to phonehash translation table).

Also add a column which contaings the useralias (`u0001`, `u0345` and so on) for the contacted user.

In [None]:
# Keep only the rows whose contacted number is a key of `phonebook`.
# .copy() makes the filtered frame independent of the original, so adding a
# column below is not an assignment on a slice (the resulting
# SettingWithCopyWarning was previously hidden by the global warnings filter).
df = df[df.number.isin(list(phonebook))].copy()

# Translate each contacted number into the alias stored in `phonebook`.
df['contactedUser'] = df.number.map(phonebook)
df.head()

### Count number of unique contacts for each user

In [None]:
# Count, for every user, how many distinct users were contacted per channel.
userUniqueCommCall = dict()  # user -> number of distinct users called
userUniqueCommSms = dict()   # user -> number of distinct users texted
for user in df.index.get_level_values('user').unique():  # loop over users
    try:  # 'call' data might be missing for some users (lookup raises KeyError)
        comSer = df.loc[user, 'call'].contactedUser
        userUniqueCommCall[user] = comSer.unique().size
    except KeyError:
        userUniqueCommCall[user] = 0
    try:  # 'sms' data might be missing for some users (lookup raises KeyError)
        comSer = df.loc[user, 'sms'].contactedUser
        userUniqueCommSms[user] = comSer.unique().size
    except KeyError:
        userUniqueCommSms[user] = 0

# One row per user, one column per channel, plus the sum of both channels.
userUniqueComm = pd.DataFrame({'call': pd.Series(userUniqueCommCall),
                               'sms': pd.Series(userUniqueCommSms)})
userUniqueComm['total'] = userUniqueComm.sms + userUniqueComm.call
del userUniqueCommCall
del userUniqueCommSms

# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement. Most connected users come first.
userUniqueComm = userUniqueComm.sort_values(by='total', ascending=False)
display(userUniqueComm.head(), userUniqueComm.describe())

Plot the findings, using two different plot styles

In [None]:
# Left: line plot of call, sms and total counts.
# Right: stacked area plot of call and sms only (total is their sum).
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2)
userUniqueComm.plot.line(ax=ax0)
userUniqueComm.drop('total', axis=1).plot.area(ax=ax1)

# Label both panels so the figure can be read on its own; the rows of
# userUniqueComm are ordered by descending 'total'.
for panel in (ax0, ax1):
    panel.set_xlabel('User (sorted by total unique contacts)')
    panel.set_ylabel('Number of unique contacts')

# Trailing semicolon suppresses the Text(...) repr in the cell output.
fig.suptitle('Unique users contacted using Calls and SMS');

A threshold of 20 unique contacts is used to select the most active users

In [None]:
# Select the users whose total number of unique contacts is strictly above
# the threshold, and list them tab-separated.
totalCommunicationThreshold = 20
chosenUsers = userUniqueComm[userUniqueComm.total.gt(totalCommunicationThreshold)].index
print(*chosenUsers, sep='\t')

### Compute adjacency matrices
Construct DataFrames for adjacency matrices/graphs for call and SMS data, where the index is the user initiating contact, and the columns is the users targeted by said contact.
Selected users is limited to previously chosen active users.

In [None]:
# Per-channel contact counts for the chosen (most active) users.
adCall = dict()  # initiating user -> {contacted user: number of calls}
adSms = dict()   # initiating user -> {contacted user: number of sms}

for user in chosenUsers:
    userChannels = df.loc[user].index  # looked up once, used for both checks
    if 'call' in userChannels:  # user might not have 'call' data
        cnt = df.loc[user, 'call'].contactedUser.value_counts()
        adCall[user] = cnt.to_dict()
    if 'sms' in userChannels:  # user might not have 'sms' data
        cnt = df.loc[user, 'sms'].contactedUser.value_counts()
        adSms[user] = cnt.to_dict()

# pd.DataFrame(dict_of_dicts) places the OUTER keys (the initiating users) on
# the columns. Transpose so that the index holds the initiating users and the
# columns hold the contacted users, which is what the axis names below and the
# later graph construction (iterating over the index) assume.
adCall = pd.DataFrame(adCall).T
adSms = pd.DataFrame(adSms).T
adCall.index.name = 'userInit'
adSms.index.name = 'userInit'
adCall.columns.name = 'userRec'
adSms.columns.name = 'userRec'

# Drop contacted users which aren't present in the index (contact initiating users)
adCall = adCall.drop(list(set(adCall.columns) - set(adCall.index)), axis=1)
adSms = adSms.drop(list(set(adSms.columns) - set(adSms.index)), axis=1)

Normalize the contributions for each dataset, such that $\sum_{\text{all entries}} = 1$

In [None]:
# adCall /= np.nansum(adCall.values)
# adSms /= np.nansum(adSms.values)

Add a column where the activity level for each user is summed up.

In [None]:
# sum(axis=0) gives one total per column label; assigning it as a new column
# aligns those totals on the row index, so a row only receives a value when
# its label also occurs among the columns.
adCall['activity'] = adCall.sum(axis=0, skipna=True)
adSms['activity'] = adSms.sum(axis=0, skipna=True)

# Put the 'activity' column at the start of the DataFrame by reordering the
# columns alphabetically (user aliases such as 'u0001' sort after 'activity').
# Reindexing moves the data together with the labels; assigning the sorted
# labels to `.columns` would only rename the columns and leave the data in
# place, attaching every label to the wrong column.
adCall = adCall[adCall.columns.sort_values()]
adSms = adSms[adSms.columns.sort_values()]

display(adCall.head(), adSms.head())

### Construct graph objects
A NetworkX graph is constructed from the DataFrame with the adjacency-matrix like data.
The call and sms data is combined.

In [None]:
# Combine the call and sms counts. A plain `adCall + adSms` yields NaN for
# every pair that has data in only one of the two frames, which would discard
# users who only called or only texted each other. With fill_value=0 a missing
# count in one frame is treated as zero; pairs missing in both stay NaN.
adf = adCall.add(adSms, fill_value=0)
adf.head()

In [None]:
# Binary version of the combined table: 1 where a positive count is recorded,
# 0 otherwise. The 'activity' column is left out; int8 is for display purposes.
adfNoActivity = adf.loc[:, adf.columns != 'activity'].gt(0).astype(np.int8)
display(adfNoActivity.head())

# Map every user in the index (in sorted order) to the list of column labels
# flagged in that user's row.
dct = {usr: adfNoActivity.columns[adfNoActivity.loc[usr].astype(bool)].tolist()
       for usr in sorted(adf.index.unique())}
g = nx.from_dict_of_lists(dct)  # construct the graph

# Delete temporary variables
del dct
del adfNoActivity

Plot the network

In [None]:
# Draw the full communication graph with the user aliases as node labels.
nx.draw(
    g,
    with_labels=True,
    node_color='lightblue',
    edge_color='lightgray',
    node_size=150,
)

Clearly some nodes aren't connected to the network – their contacts probably didn't meet the "choose any users with 20 or more individual contacts"-criterion.

Nodes with no connections (that is, nodes with degree 0) are removed.

In [None]:
# Collect every node of degree 0 first, then remove them all in one call.
g.remove_nodes_from([node
                     for node, degree in dict(g.degree()).items()
                     if degree == 0])

Verify by plotting the network again

In [None]:
# Redraw the graph after the degree-0 nodes have been removed.
nx.draw(
    g,
    with_labels=True,
    node_color='lightblue',
    edge_color='lightgray',
    node_size=150,
)

This indeed looks like nodes with degree 0 are removed.

## Apply clique algorithms to find most active users

<!--
* Two algorithms is used.
* I use the users returned from the biggest groups from both (14 users).
* I also investigate the number overlap inbetween the two algorithms wrt. cliques and users.
-->

### Verify results from NetworkX using the iGraph library

In [None]:
# NOTE(review): this rebinds `ig`, which the import cell bound to the igraph
# module (`import igraph as ig`); after this cell `ig` is the converted graph.
ig = networkx2igraph(g)

# Compare the two libraries by the distribution of maximal clique sizes.
igraphCliques = ig.maximal_cliques()
igraphCounter = Counter(map(len, igraphCliques))
networkxCounter = Counter(map(len, nx.algorithms.find_cliques(g)))
display(Markdown('Igraph and Networkx yields identical results.'
                 if networkxCounter == igraphCounter
                 else 'Igraph and Networkx yields different results!'))

### Using NetworkX

In [None]:
# One row per maximal clique; shorter cliques are padded with missing values.
clqdf = pd.DataFrame(list(nx.algorithms.find_cliques(g)))

# The clique size is the number of non-missing member entries in the row.
clqdf['cliquesize'] = clqdf.notnull().sum(axis=1)

# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement. Largest cliques come first.
clqdf = clqdf.sort_values(by='cliquesize', ascending=False)
clqdf.head()

In [None]:
# Bar chart of how many cliques exist for each clique size.
fig, ax = plt.subplots()
(clqdf.cliquesize
      .value_counts()
      .plot.bar(ax=ax, rot=0))
ax.set_title("Clique size distribution")
looseAxesLimits(ax, [0.0, 0.0, 0.0, 0.1])
countsOnBarPlot(ax)
ax.set(xlabel='Clique size', ylabel='Number of cliques')

Remove users which are not in a clique with size 6 or larger.

In [None]:
# Keep only the cliques with at least six members.
clqdf = clqdf.loc[clqdf.cliquesize.ge(6)]
clqdf.head()

* Make an array containing all the users in the selected clique(s) only once.
* From that array, generate a new array which includes all the contacts of those users.
* Extract a subgraph for those users.

In [None]:
# Every user that appears in one of the selected cliques, listed once.
cliqueMembers = clqdf.drop('cliquesize', axis=1).values.ravel()
coreUsers = pd.Series(cliqueMembers).dropna().unique()

# Every graph neighbour of a core user, listed once. The per-user neighbour
# tuples differ in length, so they are padded into a table before flattening.
neighbourTable = pd.DataFrame([tuple(nx.neighbors(g, user)) for user in coreUsers])
remoteUsers = pd.Series(neighbourTable.values.ravel()).dropna().unique()

# Print both selections together with their sizes
for heading, users in (('Core users in network ({} users):', coreUsers),
                       ('Core users and their connections ({} users):', remoteUsers)):
    print(heading.format(users.size))
    print(*users, sep='\t', end='\n'*2)

Ensure that the core users are included in the remote users

In [None]:
# Report whether every core user also occurs among the remote users.
display(Markdown(
    'All core users are contained in remote users.'
    if set(coreUsers) <= set(remoteUsers)
    else 'Remember to combine core users and remote users for subgraph extraction!'
))

This is expected, since the core users is a clique, and thus will be included among the links from the other users in the clique.

Make the subgraph, and verify it by plotting it.

In [None]:
# The subset of the graph on which a PCA analysis should be performed on the users
gs = g.subgraph(remoteUsers)

# Draw the subgraph to verify the selection.
nx.draw(
    gs,
    with_labels=True,
    node_color='lightblue',
    edge_color='lightgray',
    node_size=150,
)

In [None]:
# Degree of every node in the subgraph, as a table indexed by user.
# dict(...).items() gives explicit (user, degree) pairs.
gsDegdf = pd.DataFrame(list(dict(gs.degree()).items()), columns=['user', 'degree'])
gsDegdf = gsDegdf.set_index('user')

# DataFrame.sort(columns=...) no longer exists in pandas; sort_values(by=...)
# is its replacement. Best connected users come first.
gsDegdf = gsDegdf.sort_values(by='degree', ascending=False)
display(gsDegdf.head())

# Histogram with one unit-width bin per integer degree in [min, max].
fig, ax = plt.subplots()
ax.hist(gsDegdf.degree,
        range=(gsDegdf.degree.min(), gsDegdf.degree.max()+1),
        bins=gsDegdf.degree.max()+1 - gsDegdf.degree.min())
ax.minorticks_on()
ax.grid(True, which='both')
ax.set_xticks(range(0, gsDegdf.degree.max()+2, 5))
ax.set_xbound(0, gsDegdf.degree.max()+2)
ax.set_xlabel('Connectivity degree')
ax.set_ylabel('Number of users')
# countsOnBarPlot(ax)
ax.set_title('Connectivity vs number of users')

# Do the PCA analysis

Obtain the adjacency matrix for the chosen network.

In [None]:
gsa = nx.adj_matrix(gs) 
fig, ax = plt.subplots() 