In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import bottleneck as bn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
# http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
from networkx.drawing.nx_agraph import graphviz_layout
from sklearn import decomposition
%matplotlib inline

import missingno as msno
from collections import defaultdict, Counter

from IPython.display import display

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot


%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,matplotlib.pyplot,sklearn,missingno,networkx

Allan Leander Rostock Hansen 
last updated: 2017-02-16 

CPython 3.5.2
IPython 5.1.0

numpy 1.12.0
bottleneck 1.2.0
pandas 0.19.1
matplotlib 2.0.0
matplotlib.pyplot 2.0.0
sklearn 0.18.1
missingno 0.3.4
networkx 2.0.dev_20170215140237


Load user SMS and call data…

In [2]:
ua = Useralias()
userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]
userData = loadUserParallel(userSpec) 
df = users2DataFrame(userData, ua)
del userData
phonebook = loadUserPhonenumberDict(ua) 

# Cleaning data

Make a subset of the data only containing communications contained within the Social Frabric project. Of this data, select a subset of the data containing the most active users, preferebly who communicate with each other… a clique percolation algorothm could be used for this, but that won't be the initial approach.

A measure of the activity could simply be $$a = \sum_{\text{i}}\frac{\mathrm{user_{sms}}_i}{\sum_i \mathrm{user_{sms}}_i} + \frac{\mathrm{user_{call}}_i}{\sum_i \mathrm{user_{call}}_i}$$
but this could yield a huge $a$ for a very active, yet weakly connected user, so a weighting with the number of contacted people shoud be introduced.

Since a conversation using SMS regesters as several events for both users (usually), whereas a conversation carried out over a call registes as one event, a weighting should be introduced.
The easy solution is to divide the adjacency matrices with the sum of all the entries, meaning that the sum of all the elements would both add up to one.
Yet another approach would be to clean the SMS data in the following way:

1. Investigate the distribution of time between a SMS and a reply to it.
2. Use the distribution to determining a typical reply time.
3. Remove entries in the SMS data which weren't replied to within some number, say 3, times the average reply time.

Cleaning the SMS data as proposed above, should also prompt for a similar cleaning of the call data.
An obvious way would be to remove unansvered calls, albeit the SMS dataset should also be checked for an "answer".

## Doing the data munging

Remove rows for which the contacted number is not present in `phonebook` (userhash to phonehash translation table).

Also add a column which contaings the useralias (`u0001`, `u0345` and so on) for the contacted user.

In [3]:
df = df[df.number.apply(lambda num: num in phonebook)] 
df['contactedUser'] = df.number.apply(lambda x: phonebook[x])

In [4]:
df.head() 

Unnamed: 0_level_0,Unnamed: 1_level_0,body,duration,hour,number,timeint,timestamp,weekday,contactedUser
user,comtype,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
u0001,call,,0.0,1,a1839538e5d09dd68a576f1ee2c2611ac0c4f451,1390698006,2014-01-26 01:00:06,6,u0493
u0001,call,,0.0,14,a1839538e5d09dd68a576f1ee2c2611ac0c4f451,1391176805,2014-01-31 14:00:05,4,u0493
u0001,call,,29.0,14,55f088fd3f0b8d0497eb8e47dbf501721eb2e927,1391782631,2014-02-07 14:17:11,4,u0117
u0001,call,,22.0,14,a1839538e5d09dd68a576f1ee2c2611ac0c4f451,1391782712,2014-02-07 14:18:32,4,u0493
u0001,call,,3.0,14,55f088fd3f0b8d0497eb8e47dbf501721eb2e927,1394805935,2014-03-14 14:05:35,4,u0117


Split the data into a SMS and a call dataset.

In these new DataFrames, make a list over all contacted users, and the amount of "activity-time" devoted to each contact.

In [5]:
smsdf = df.loc[pd.IndexSlice[:, 'sms'], :] 
calldf = df.loc[pd.IndexSlice[:, 'call'], :] 

In [6]:
userIndex = df.index.get_level_values('user').unique()
adCall = pd.DataFrame(columns=userIndex, index=userIndex) 
adSms = pd.DataFrame(columns=userIndex, index=userIndex) 

for user in df.index.get_level_values('user').unique():
    if 'call' in df.loc[user].index:
        callCount = df.loc[user, 'call'].contactedUser.value_counts()
        for u, c in zip(callCount.index, callCount.values):
            adCall.loc[user, u] = c
    if 'sms' in df.loc[user].index:
        smsCount = df.loc[user, 'sms'].contactedUser.value_counts()
        for u, c in zip(smsCount.index, smsCount.values):
            adSms.loc[user, u] = c
adCall /= adCall.sum().sum() 
adSms /= adSms.sum().sum() 
adCall.columns.name = 'userRec'
adSms.columns.name = 'userRec'
adCall.index.name = 'userInit'
adSms.index.name = 'userInit'

Add a column where the activity level for each user is summed up.

In [7]:
adCall['activity'] = adCall.apply(lambda row: row.sum())
adSms['activity'] = adSms.apply(lambda row: row.sum())

In [8]:
adCall.head() 

userRec,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,u0010,...,u1062,u0056,u0611,u0994,u0623,u0417,u0403,u0949,u0776,activity
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0001,,,,,,,,,,,...,,,,,,,,,,0.000662
u0002,,2.75881e-05,,,,,,,,,...,,,,,,,,,,0.000524
u0003,,,1.37941e-05,,,,,,,,...,,,,,,,,,,0.000166
u0004,,,,,,,,,,,...,,,,,,,,,,0.001793
u0005,,,,,,,,,,,...,,,,,,,,,,


# Back the old measure (total activity)

Find the n (=10) most active users. This should be replaced by a clique algorithm.

In [9]:
nMostActive = 10

In [10]:
def getMostActive(activity):
    ac = activity.values.astype(np.double)
    ac[np.isnan(ac)] = 0
    idx = np.argsort(ac)
    return idx

In [11]:
cdf = adCall.iloc[getMostActive(adCall.activity)[-nMostActive:], :][::-1] 
sdf = adSms.iloc[getMostActive(adSms.activity)[-nMostActive:], :][::-1]
display(cdf.head(), sdf.head())

userRec,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,u0010,...,u1062,u0056,u0611,u0994,u0623,u0417,u0403,u0949,u0776,activity
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0250,,,,,,,,,,,...,,,,,,,,,,0.030168
u0544,,,,,,,,,,,...,,,,,,,,,,0.029174
u0568,,,,,,,,,,,...,,,,,,,,,,0.01727
u0813,,,,,,,,,,,...,,,,,,,,,,0.014249
u0465,,,,,,,,,,,...,,,,,,,,,,0.012566


userRec,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,u0010,...,u0611,u0994,u0077,u0623,u0417,u0403,u0357,u0776,u0336,activity
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0799,,,,,,,,,,,...,,,,,,,,,,0.059889
u0250,,,,,,,,,,,...,,,,,,,,,,0.043574
u0591,,,,,,,,,,,...,,,,,,,,,,0.031641
u0961,,,,,,,,,,,...,,,,,,,,,,0.031634
u0857,,,,,,,,,,,...,,,,,,,,,,0.023424


Only 2 out of 10 users are present among the most active in both the SMS and call datasets

In [12]:
cdf.index.intersection(sdf.index)

Index(['u0250', 'u0857'], dtype='object', name='userInit')

Try choosing users from the sum of SMS and call activity, thus choosing the same users in both datasets.
Combine the two datasets into one.

In [13]:
idx = getMostActive(adSms.activity + adCall.activity)
cdf = adCall.iloc[idx[-nMostActive:], :][::-1] 
sdf = adSms.iloc[idx[-nMostActive:], :][::-1]
display(cdf, sdf)

userRec,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,u0010,...,u1062,u0056,u0611,u0994,u0623,u0417,u0403,u0949,u0776,activity
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0250,,,,,,,,,,,...,,,,,,,,,,0.030168
u0799,,,,,,,,,,,...,,,,,,,,,,0.002993
u0544,,,,,,,,,,,...,,,,,,,,,,0.029174
u0961,,,,,,,,,,,...,,,,,,,,,,0.002483
u0591,,,,,,,,,,,...,,,,,,,,,,0.002373
u0857,,,,,,,,,,,...,,,,,,,,,,0.008856
u0930,,,,,,,,,,,...,,,,,,,,,,0.003669
u0568,,,,,,,,,,,...,,,,,,,,,,0.01727
u0393,,,,,,,,,,,...,,,,,,,,,,0.007918
u0794,,,,,,,,,,,...,,,,,,,,,,0.004262


userRec,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,u0010,...,u0611,u0994,u0077,u0623,u0417,u0403,u0357,u0776,u0336,activity
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0250,,,,,,,,,,,...,,,,,,,,,,0.043574
u0799,,,,,,,,,,,...,,,,,,,,,,0.059889
u0544,,,,,,,,,,,...,,,,,,,,,,0.005029
u0961,,,,,,,,,,,...,,,,,,,,,,0.031634
u0591,,,,,,,,,,,...,,,,,,,,,,0.031641
u0857,,,,,,,,,,,...,,,,,,,,,,0.023424
u0930,,,,,,,,,,,...,,,,,,,,,,0.022407
u0568,,,,,,,,,,,...,,,,,,,,,,0.008106
u0393,,,,,,,,,,,...,,,,,,,,,,0.0135
u0794,,,,,,,,,,,...,,,,,,,,,,0.015468


Verify that the same users is used in both dataframes

In [14]:
cdf.index.difference(sdf.index)

Index([], dtype='object', name='userInit')

Combine the DataFrames `sdf` and `cdf` under the name `adf`

In [15]:
adf = sdf + cdf
adf

userRec,activity,u0001,u0002,u0003,u0004,u0005,u0006,u0007,u0008,u0009,...,u1049,u1052,u1053,u1054,u1055,u1056,u1058,u1059,u1060,u1062
userInit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u0250,0.073742,,,,,,,,,,...,,,,,,,,,,
u0799,0.062882,,,,,,,,,,...,,,,,,,,,,
u0544,0.034204,,,,,,,,,,...,,,,,,,,,,
u0961,0.034117,,,,,,,,,,...,,,,,,,,,,
u0591,0.034014,,,,,,,,,,...,,,,,,,,,,
u0857,0.03228,,,,,,,,,,...,,,,,,,,,,
u0930,0.026076,,,,,,,,,,...,,,,,,,,,,
u0568,0.025377,,,,,,,,,,...,,,,,,,,,,
u0393,0.021418,,,,,,,,,,...,,,,,,,,,,
u0794,0.01973,,,,,,,,,,...,,,,,,,,,,


Plotting the resulting adjacency matrix... it's very sparse

## Use output from clique algorithms for user selection

In [16]:
cdf = adCall.loc[clqCnt.keys()]
sdf = adSms.loc[clqCnt.keys()]
adf = cdf + sdf
adf

NameError: name 'clqCnt' is not defined

In [None]:
fig, ax = plt.subplots()
adfData = adf.values.astype(np.double)[:, 1:]  # remove activity column
pc = ax.pcolorfast(adfData, cmap=mpl.cm.rainbow)
fig.colorbar(pc) 

Removing users which were not contacted, and plotting the new reduced adjacency matrix

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
adfNoActivity = adf[adf.columns[1:]] 
mask = adfNoActivity.sum(axis=0).notnull()
masked = adfNoActivity[mask[mask].index]
toPlot = masked.values.astype(np.double)
toPlot = np.ma.masked_array(toPlot, mask=np.isnan(toPlot))
pc = ax.pcolor(toPlot, cmap=mpl.cm.plasma)
fig.colorbar(pc) 
ax.set_yticks(np.arange(1, masked.shape[0]+1) - 0.5)
ax.set_yticklabels(masked.index)
ax.grid(False)
ax.set_xticklabels([])
ax.set_xlabel('Contacted users')
ax.set_ylabel('Initiating users')

The chosen users are

In [None]:
for user in adf.index:
    print(user)

## Construction the time series

A time series for the users activity, binned for each quarter day are constructed.

In [None]:
dfa = df.loc[list(adf.index.values)]

In [None]:
dfa.loc['u0250'].head() 

Timebinning is done in the following way:

<br>
<font color="red">Make sure to start the timedeltas from a sensible time, like midnight!</font>
<br><br>

1. Substract the minimum value for the timebin from all times, this starting comminication at time 0.
2. Do integer division with 6*3600 (6 hours worth of seconds) to obtain timebin.

In [None]:
dfa['timebin'] = (dfa.timeint - dfa.timeint.min())//(6*3600)

In [None]:
dfa.head() 

In [None]:
dfa.loc['u0250'].timebin.value_counts().head() 

#  <font color="red"> Supervision by Joachim </font>

1. Find a clique of about 10 people
2. Make an adjacency matrix for those people which includes their contacts
3. Cut network matrices into 6 hour intervals (or other interval)
4. Turn interval-matrices into column-vectors
5. "Stack" column vectors to a matrix
6. Run PCA on matrix

Also…
* Make sure to understand, and be able to explain, clique/clustering algorithms and PCA algorithm
* Make derivation of PCA algorithm
* Brush up on index notation, because getting supervision by Joachim would otherwise be quite hard a times
* <font color="red"> *Send Joachim an email with results, possible derivations for PCA, and possible questions to show engagement and progress*</font>

In [None]:
userIndex = dfa.index.get_level_values('user').unique()
# dfh = pd.DataFrame(columns=(dfa.timebin.unique()))  # empty timebins not included
dfh = pd.DataFrame(columns=(np.arange(dfa.timebin.min(), dfa.timebin.max())))  # empty timebins included
for user in userIndex:
    dfh.loc[user] = dfa.loc[user].timebin.value_counts()
dfh.replace(np.NaN, 0.0, inplace=True)  # Replace NaN's with 0.0
dfh

Play around with `np.reshape` to ensure that I''m reshaping correctly.

In [None]:
arr = np.arange(20).reshape((4, 5))
display(arr, arr.reshape((arr.size)))

* Remember to normalize?
* Sure about transpose?
* Read up on PCA
* Use decomposition.SparcePCA instead?
* Talk to Joachim about PCA input shape

In [None]:
toPca = dfh.values.T.reshape((1, -1)).T

In [None]:
pca = decomposition.PCA() 
pca.fit(dfh.values.T)
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum(), sep='\n\nSum: ') 

In [None]:
fig, ax = plt.subplots()
ax.plot(pca.components_[0])
print(pca.components_[0].mean()) 

In [None]:
dfh.mean(axis=1).plot() 

In [None]:
fig, ax = plt.subplots()
xax = np.arange(pca.explained_variance_ratio_.size) + 1
ax.plot(xax, pca.explained_variance_ratio_, 'o--')
ax.set_xticks(xax); 

In [None]:
fig, ax = plt.subplots(figsize=(9, 9))
pc = ax.matshow(pca.components_, cmap=mpl.cm.RdYlGn, vmin=-1, vmax=1)  # Spectral_r, bwr, RdYlBu_r, PiYG
ax.grid(False)
fig.colorbar(pc) 

# Play around with plotting using NetworkX-package

In [None]:
adfDict = defaultdict(list)
for init in adfNoActivity.index:
    for recv in adfNoActivity.columns:
        if not pd.isnull(adfNoActivity.loc[init, recv]):
            # Don't use (userAlias, activity-fraction)-tuples as user indicators for now
            # adfDict[init].append((recv, adfNoActivity.loc[init, recv]))
            adfDict[init].append(recv)

In [None]:
grp = nx.from_dict_of_lists(adfDict)

Make a simple plot of the graph

In [None]:
nx.draw(grp, node_size=70, node_color='steelblue', edge_color='lightblue') 

In [None]:
dct = defaultdict(lambda: 0)
for n0 in grp.nodes_iter():
    for n1 in grp.nodes_iter():
        if n0 != n1:
            dct[n1] += nx.algorithms.node_connectivity(grp, n0, n1)

In [None]:
mostConnected = sorted(dct.items(), key=lambda x:x[1])[-10:]
mostConnected

In [None]:
mostConnectedLabels = {el[0]: "{} ({})".format(*el) for el in mostConnected}
mostConnectedLabels

## Experimenting with clustering algorithms

In [None]:
kcom = nx.k_components(grp)

In [None]:
len(kcom[1])

Count clique size for the two algorithms

In [None]:
mclq = list(nx.algorithms.enumerate_all_cliques(grp))
mclq[::-1] 
cntMclq = Counter((len(el) for el in mclq))
cntMclq

In [None]:
fclq = list(nx.algorithms.find_cliques(grp))
cntFclq = Counter(len(x) for x in fclq) 
cntFclq

Turn the algorithms into sets, compute the union and count how many times the usera are present in each one

In [None]:
setMclq = {tuple(el) for el in mclq if len(el) == max(cntMclq)}
setFclq = {tuple(el) for el in fclq if len(el) == max(cntFclq)}

In [None]:
clqCnt = Counter(el for tp in (setMclq.union(setFclq)) for el in tp)
clqCnt

The intersection between the detected cliques are low (1/4)

In [None]:
setMclq.intersection(setFclq) 

Draw with labels for the 10 most connected users

In [None]:
for k in 1/np.sqrt(2**np.arange(1, 10)):
    fig, ax = plt.subplots(figsize=(12, 8))
    pos = nx.spring_layout(grp, k=k)
    nx.draw_networkx_nodes(grp, pos, node_size=70, node_color='steelblue', ax=ax)
    nx.draw_networkx_edges(grp, pos, edge_color='lightgray') 
    # nx.draw_networkx_labels(grp, pos, labels=toLabel, font_color='darkorange', font_size=15, font_weight='bold')
    nx.draw_networkx_labels(grp, pos, labels=mostConnectedLabels,
                            font_color='mediumaquamarine', font_size=15, font_weight='bold')
    ax.set_axis_bgcolor('white')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_title("k = %.3f" % k)

In [None]:
def getColor(node):
    """Get colors from the vitidis color scheme"""
    cols = np.array([mpl.colorbar.cm.viridis(i) for i in range(256)]) 
    valMax, valMin = max(dct.values()), min(dct.values())
    nodeWeight = (dct[node] - valMin)/(valMax - valMin)
    colIdx = 256*nodeWeight - 1
    return cols[colIdx, :] 

In [None]:
cols

In [None]:
fig, ax = plt.subplots()
ccmap = [getColor(node) for node in grp.nodes_iter()]
nx.draw(grp, pos, node_color=ccmap, node_size=80, ax=ax, alpha=0.55, edge_color='slategray') 

In [None]:
fig, ax = plt.subplots(figsize=(16, 9))
pos = graphviz_layout(grp, prog='neato')
nx.draw_networkx_nodes(grp, pos, node_size=70, node_color='steelblue', ax=ax)
nx.draw_networkx_edges(grp, pos, edge_color='slategray') 
# nx.draw_networkx_labels(grp, pos, labels=toLabel, font_color='darkorange', font_size=15, font_weight='bold')
nx.draw_networkx_labels(grp, pos, labels=mostConnectedLabels,
                        font_color='mediumaquamarine', font_size=15, font_weight='bold')
ax.set_axis_bgcolor('white')
ax.set_xticklabels([])
ax.set_yticklabels([])

# Experimenting with clustering algorithms

In [None]:
kcom = nx.k_components(grp)

In [None]:
len(kcom[1])

Count clique size for the two algorithms

In [None]:
mclq = list(nx.algorithms.enumerate_all_cliques(grp))
mclq[::-1] 
cntMclq = Counter((len(el) for el in mclq))
cntMclq

In [None]:
fclq = list(nx.algorithms.find_cliques(grp))
cntFclq = Counter(len(x) for x in fclq) 
cntFclq

Turn the algorithms into sets, compute the union and count how many times the usera are present in each one

In [None]:
setMclq = {tuple(el) for el in mclq if len(el) == max(cntMclq)}
setFclq = {tuple(el) for el in fclq if len(el) == max(cntFclq)}

In [None]:
Counter(el for tp in (setMclq.union(setFclq)) for el in tp)

The intersection between the detected cliques are low (1/4)

In [17]:
setMclq.intersection(setFclq) 

NameError: name 'setMclq' is not defined

In [18]:
%who

Counter	 Useralias	 adCall	 adSms	 adf	 barSBS	 bn	 c	 callCount	 
calldf	 cdf	 countsOnBarPlot	 decomposition	 defaultdict	 df	 dict2DataFrame	 display	 getMostActive	 
getUserList	 graphviz_layout	 idx	 loadUserParallel	 loadUserPhonenumberDict	 looseAxesLimits	 mpl	 msno	 nMostActive	 
np	 nx	 os	 pd	 phonebook	 plt	 sdf	 smsCount	 smsdf	 
