In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import itertools

import multiprocessing

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import bottleneck as bn
import pandas as pd
from sklearn import decomposition
import networkx as nx
# import networkx.algorithms.approximation as nxa
import igraph as ig
# # http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
# from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid", {"axes.facecolor": ".95"})
import missingno as msno
%matplotlib inline

from collections import defaultdict, Counter
from IPython.display import display as disp

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot, plotNeatoGraph, nxQuickDraw, barFractionPlot
from speclib.graph import networkx2igraph, igraph2networkx, userDF2nxGraph, userDF2activityDataframe
from speclib.misc import nanEqual, timedelta2unit, standardizeData, pcaFit
from speclib.userActivityFunctions import mutualContact, userDf2timebinDf, userDf2timebinAdjMat, userDf2CliqueDf

# Notebook-wide matplotlib defaults: font sizes, figure size and resolution.
mpl.rcParams.update({
    'font.size': 13.0,
    'legend.fontsize': 13.0,
    'axes.labelsize': 12.0,
    'axes.titlesize': 15.0,
    'figure.figsize': [16.0, 7.0],
    'figure.dpi': 200,
    'figure.titlesize': 'large',
    'xtick.labelsize': 13.0,
    'ytick.labelsize': 13.0,
})

# Load the watermark extension and print author, date, Python version and the
# versions of the listed packages, so the environment behind the outputs is recorded.
%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,sklearn,missingno
%watermark  -p networkx,igraph,seaborn

# Load and clean data

In [None]:
import pickle

# Use the cached frame and user aliases when the HDF5 cache exists; otherwise
# rebuild both from the raw sms/call data of every user.
if os.path.isfile('phone_df.h5'):
    df = pd.read_hdf('phone_df.h5', 'df')
    # NOTE(review): pickle.load can execute arbitrary code -- only load a
    # 'useralias.pk' that was produced by a trusted run of this project.
    with open('useralias.pk', 'rb') as fid:
        ua = pickle.load(fid)
else:
    # NOTE(review): this branch does not write 'phone_df.h5' / 'useralias.pk';
    # the cache files are presumably produced elsewhere -- confirm.
    ua = Useralias()
    userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]
    userData = loadUserParallel(userSpec)
    df = users2DataFrame(userData, ua)
    del userData  # the raw per-user data is not needed once the frame exists
phonebook = loadUserPhonenumberDict(ua)
df.head()

Remove calls and messages to numbers that are not in the phonebook.

In [None]:
# Keep only the rows whose number is present in the phonebook.
knownNumber = df.number.isin(phonebook)
df = df[knownNumber]

Add _contactedUser_ column and remove the _number_ column.

In [None]:
# Look every number up in the phonebook, store the result as `contactedUser`
# and discard the raw `number` column.
contactedUser = df.number.apply(lambda number: phonebook[number])
df = df.assign(contactedUser=contactedUser).drop('number', axis='columns')

In [None]:
# Preview the frame now that `number` has been replaced by `contactedUser`.
df.head() 

## Check for obvious outliers

In [None]:
# Number of records per calendar year, in chronological order.
recordsPerYear = df.timestamp.dt.year.value_counts().sort_index()
ax = recordsPerYear.plot.bar()
countsOnBarPlot(ax)

Remove data preceding 2013.

In [None]:
# Records timestamped before this year are discarded.
firstValidYear = 2013
df = df.loc[df.timestamp.dt.year >= firstValidYear]

## Remove entries where users contact themselves

In [None]:
# Flatten the index so `user` can be compared with `contactedUser`, drop the
# self-contacts, then index by (user, comtype) again while keeping both as columns.
flatDf = df.reset_index()
notSelfContact = flatDf.user != flatDf.contactedUser
df = flatDf[notSelfContact].set_index(['user', 'comtype'], drop=False)
del flatDf, notSelfContact

# Turn data into a Networkx graph

In [None]:
# Build the graph from the cleaned communication frame.
# NOTE(review): the assertion in the next cell expects one node per distinct
# user / contactedUser; edge semantics are defined in speclib.graph -- confirm there.
g = userDF2nxGraph(df)

Ensure that the graph contains the correct number of nodes

In [None]:
# Every distinct user, whether contacting or contacted, must be a node of the graph.
contactingUsers = df.index.get_level_values('user').tolist()
contactedUsers = df.contactedUser.tolist()
expectedNodes = set(contactingUsers).union(contactedUsers)
assert g.number_of_nodes() == len(expectedNodes)

## Analyse the graph

In [None]:
# Count how many nodes have each degree; degrees that no node has get a zero bar.
degreeCounts = Counter(degree for _, degree in g.degree())
degrees = list(range(max(degreeCounts) + 1))
nodeCounts = [degreeCounts[degree] for degree in degrees]

fig, ax = plt.subplots()
ax.bar(degrees, nodeCounts)
ax.set_xlabel('Node degree')
ax.set_ylabel('Counts')
countsOnBarPlot(ax)

# Community detection

Find communities built from 5-cliques, where two 5-cliques belong to the same community when they are adjacent, i.e. share 4 (= _k_ - 1) nodes.
<font color="red">Check up on this!</font>

[From Documentation](http://networkx.readthedocs.io/en/latest/reference/generated/networkx.algorithms.community.kclique.k_clique_communities.html#networkx.algorithms.community.kclique.k_clique_communities):

> Find _k_-clique communities in graph using the percolation method.
> 
> A _k_-clique community is the union of all cliques of size _k_ that can be reached through adjacent (sharing _k_-1 nodes) _k_-cliques.

In [None]:
# k-clique communities with k = 5, one row per community, largest community first.
communities = nx.algorithms.community.k_clique_communities(g, 5)
communitiesBySize = sorted(communities, key=len, reverse=True)
kcDf = pd.DataFrame(communitiesBySize)
kcDf.columns.name = 'users'
kcDf.index.name = 'communityNumber'
disp(kcDf.head())

# Draw the subgraph induced by the members of the largest community (row 0).
largestCommunity = kcDf.iloc[0]
gsc = g.subgraph(largestCommunity)
nxQuickDraw(gsc)

# Clique detection

Could be used for analysis of a larger network

In [None]:
# One row per clique found in the graph; shorter cliques are padded with missing
# values, so the number of non-missing entries in a row is the clique size.
cliqueDf = pd.DataFrame(nx.clique.find_cliques_recursive(g))
cliqueDf = (cliqueDf
            .assign(cliqueSize=cliqueDf.count(axis=1))
            .sort_values('cliqueSize', ascending=False))

# How many cliques exist of each size.
cliqueSizeCounts = cliqueDf.cliqueSize.value_counts().sort_index()
ax = cliqueSizeCounts.plot.bar(rot=0)
ax.set_xlabel('Clique size')
ax.set_ylabel('Counts')
countsOnBarPlot(ax)

## Choose a clique with 5 users and make a subgraph 

In [None]:
# Take the second listed clique of size 5 and keep only its member columns.
fiveCliques = cliqueDf[cliqueDf.cliqueSize == 5].drop('cliqueSize', axis=1)
chosenUserLst = fiveCliques.iloc[1].dropna().tolist()
print("Chosen users:", *chosenUserLst, sep='\n')

# Subgraph induced by the chosen users, drawn with node labels.
gs = g.subgraph(chosenUserLst)
nxQuickDraw(gs, plotSettings={'with_labels': True})

## Create timebinning for chosen users

Fraction of the communication contributed by each user

In [None]:
# Row-wise sum of the activity frame built from the rows of the chosen users.
# NOTE(review): presumably one total per user -- confirm against userDF2activityDataframe.
userDF2activityDataframe(df.loc[chosenUserLst]).sum(axis=1)

In [None]:
# First plot: activity of the chosen users, built from all of their rows.
allActivity = userDF2activityDataframe(df.loc[chosenUserLst]).sum(axis=1)
ax, userOrder = barFractionPlot(allActivity)
ax.set_title('Communication with everybody')

# Second plot: the same summary on the clique sub-frame, reusing the user
# order returned by the first plot so the two figures line up.
cliqueOnlyDf = userDf2CliqueDf(df, chosenUserLst)
cliqueSubActDf = userDF2activityDataframe(cliqueOnlyDf).sum(axis=1)
ax, userOrder = barFractionPlot(cliqueSubActDf, userOrder=userOrder)
ax.set_title('Communication within the clique')

In [None]:
# Time-binned matrix for the chosen clique; the bin argument carries the same
# name and value that the later per-clique PCA cell uses.
binsCalendarDay = 6
cliqueSubDf = userDf2CliqueDf(df, chosenUserLst)
toPcaRaw = userDf2timebinAdjMat(cliqueSubDf, binsCalendarDay, chosenUserLst)

# Show the raw matrix as a colour mesh with a colour bar.
fig, ax = plt.subplots()
mesh = ax.pcolorfast(toPcaRaw)
fig.colorbar(mesh)

In [None]:
# Fit a PCA on the time-binned matrix of the chosen clique and plot how much
# variance each component explains.
pca = pcaFit(toPcaRaw)

fig, ax = plt.subplots()
ax.plot(pca.explained_variance_ratio_)
ax.set_xlabel('Eigenvalue #')
# Label the y axis as well so the figure can be read on its own.
ax.set_ylabel('Explained variance ratio')

In [None]:
# Clique table: one row per clique, `cliqueSize` = number of non-missing members,
# largest cliques first.
cliqueDf = pd.DataFrame(nx.clique.find_cliques_recursive(g))
cliqueDf = (cliqueDf
            .assign(cliqueSize=cliqueDf.count(axis=1))
            .sort_values('cliqueSize', ascending=False))

binsCalendarDay = 6
cliqueSizeLst = [size for size in cliqueDf.cliqueSize.unique() if size > 2]

# clique size -> list holding one explained-variance-ratio array per clique of that size
cliquePcaDct = {cs: [] for cs in cliqueSizeLst}
for cs, evrLst in cliquePcaDct.items():
    # The first `cs` columns hold the members of a clique of size `cs`.
    sameSizeCliques = cliqueDf.loc[cliqueDf.cliqueSize == cs].iloc[:, :cs]
    for clique in sameSizeCliques.values.tolist():
        cliqueSubDf = userDf2CliqueDf(df, clique)
        toPcaRaw = userDf2timebinAdjMat(cliqueSubDf, binsCalendarDay, clique)
        pca = pcaFit(toPcaRaw)
        evrLst.append(pca.explained_variance_ratio_)

In [None]:
import palettable

# Per clique size: mean explained variance ratio per component, with a band
# spanning the minimum and maximum over all cliques of that size.
fig, ax = plt.subplots()
colors = palettable.colorbrewer.qualitative.Dark2_5_r.mpl_colors
for i, cs in enumerate(sorted(cliquePcaDct.keys())):
    csDf = pd.DataFrame(cliquePcaDct[cs])
    upper, mean, lower = csDf.max(axis=0), csDf.mean(axis=0), csDf.min(axis=0)
    # The palette is finite; wrap around rather than raising IndexError when
    # there are more clique sizes than colours.
    color = colors[i % len(colors)]
    # Each curve is shifted up by its rank `i` so the clique sizes do not overlap.
    ax.plot(mean+i, '-o', color=color, label='clique size %d' % cs)
    # ax.errorbar(range(len(mean)), mean+i, uplims=upper, lolims=lower, color=colors[i])
    ax.fill_between(range(len(mean)), upper+i, lower+i, color=color, alpha=0.4)
ax.set_xlabel('Eigenvalue #')
ax.set_ylabel('Explained variance ratio (offset by clique-size rank)')
ax.legend(loc='lower right', fancybox=True, framealpha=0.8)
# ax.set_yscale('log')

In [None]:
import pickle

# Write the per-clique-size explained variance ratios to disk.
with open('clique_pca_evr_dump.pickle', 'wb') as dumpFile:
    pickle.dump(cliquePcaDct, dumpFile)

In [None]:
# def getPcaExplainedVarianceRatio(cliqueSubDf, chosenUserLst, binsCalendarDay):
#         toPcaRaw = userDf2timebinAdjMat(cliqueSubDf, binsCalendarDay, chosenUserLst)
#         pca = pcaFit(toPcaRaw)
#         return (len(chosenUserLst), pca.explained_variance_ratio_.copy()) 
# 
# def getPcaExplainedVarianceRatio_handler(x):
#     return getPcaExplainedVarianceRatio(*x) 
# 
# def foo(inp):
#     toPca, cs = inp
#     pca = decomposition.PCA()
#     pca.fit(standardizeData(toPca))
#     return (cs, pca.explained_variance_ratio_)
# 
# try:
#     pool = multiprocessing.Pool(processes=2)
# 
#     callArgList = list()
#     binsCalendarDay = 6
#     cliqueSizeLst = [x for x in cliqueDf.cliqueSize.unique() if x > 6]
#     for cs in cliqueSizeLst:
#         for chosenUserArr in cliqueDf[cliqueDf.cliqueSize == cs].iloc[:, :cs].values:
#             # Throw away unneeded columns
#             cliqueSubDf = userDf2CliqueDf(df[['user', 'comtype', 'weekday', 'timestamp', 'contactedUser']], chosenUserLst)  
#             chosenUserLst = chosenUserArr.tolist()
#             toPcaRaw = userDf2timebinAdjMat(cliqueSubDf, binsCalendarDay, chosenUserLst)
#             callArgList.append((toPcaRaw, cs)) 
# 
#     call = pool.map(foo, callArgList)
# 
# finally:
#     pool.close() 