In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import itertools

import numpy as np
import bottleneck as bn
import pandas as pd
from sklearn import decomposition
import networkx as nx
# import networkx.algorithms.approximation as nxa
import igraph as ig
# # http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
# from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid", {"axes.facecolor": ".95"})
import missingno as msno
import palettable
%matplotlib inline

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=mpl.cbook.mplDeprecation)
warnings.simplefilter("ignore", category=UserWarning)


from collections import defaultdict, Counter
from IPython.display import display as disp
from IPython.display import display_markdown
def mdisp(s):
    """Render the string `s` as raw markdown in the notebook output."""
    return display_markdown(s, raw=True)

from speclib.loaders import *
from speclib.plotting import *
from speclib.graph import *
from speclib.misc import *
from speclib.userActivityFunctions import *

# Notebook-wide matplotlib defaults: font sizes, figure size and resolution.
mpl.rcParams.update({
    'font.size': 13.0,
    'legend.fontsize': 13.0,
    'axes.labelsize': 12.0,
    'axes.titlesize': 15.0,
    'figure.figsize': [16.0, 7.0],
    'figure.dpi': 200,
    'figure.titlesize': 'large',
    'xtick.labelsize': 13.0,
    'ytick.labelsize': 13.0,
})

%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,sklearn,missingno
%watermark  -p networkx,igraph,seaborn,palettable

In [None]:
%load_ext autoreload

In [None]:
%autoreload -2

# Load and clean data

In [None]:
# Load the communication records and the Useralias object.
# The cache consists of two files; use it only when both are present, so a
# half-written cache falls back to a full load instead of crashing on open().
if os.path.isfile('phone_df.h5') and os.path.isfile('useralias.pk'):
    # pd.read_hdf is the public entry point for the HDF5 reader.
    df = pd.read_hdf('phone_df.h5', 'df')
    import pickle
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load a 'useralias.pk' that was produced locally by this project.
    with open('useralias.pk', 'rb') as fid:
        ua = pickle.load(fid)
else:
    ua = Useralias()
    # One spec per user, restricted to the 'sms' and 'call' data types.
    userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]
    userData = loadUserParallel(userSpec)
    df = users2DataFrame(userData, ua)
    # The intermediate user data is not needed once the frame exists.
    del userData
phonebook = loadUserPhonenumberDict(ua)
df.head()

Remove communication with numbers that are not in the phonebook.

In [None]:
# Keep only the rows whose number is a key of the phonebook.
df = df.loc[df['number'].isin(phonebook)]

Add _contactedUser_ column and remove the _number_ column.

In [None]:
# Look up the phonebook entry for every number, then discard the raw number.
# assign() builds a new frame rather than writing a column into the
# boolean-filtered frame produced by the phonebook filter, which avoids
# pandas' SettingWithCopyWarning.
df = (
    df.assign(contactedUser=df.number.apply(lambda x: phonebook[x]))
      .drop('number', axis=1)
)

In [None]:
# Preview the frame now that `number` has been replaced by `contactedUser`.
df.head()

## Check for obvious outliers

In [None]:
# Number of events per calendar year, in chronological order.
ax = (
    df.timestamp.dt.year
      .value_counts()
      .sort_index(ascending=True)
      .plot.bar()
)
countsOnBarPlot(ax)

Remove data preceding 2013.

In [None]:
# Drop every event time-stamped before 2013.
df = df.loc[df['timestamp'].dt.year >= 2013]

## Remove entries with users contacting themselves

In [None]:
# Flatten the index so `user` can be compared with `contactedUser`, keep the
# rows where they differ, and restore the (user, comtype) index. drop=False
# keeps `user` and `comtype` available as ordinary columns as well.
df = (
    df.reset_index()
      .loc[lambda flat: flat.user != flat.contactedUser]
      .set_index(['user', 'comtype'], drop=False)
)

# Turn data into a Networkx graph

In [None]:
# Build the graph from the cleaned frame.
# NOTE(review): presumably one node per user and one edge per communicating
# pair; confirm against speclib's userDF2nxGraph.
g = userDF2nxGraph(df)

Ensure that the graph contains the correct number of nodes

In [None]:
# The node count must equal the number of distinct users that appear either
# in the `user` index level or in the `contactedUser` column.
assert g.number_of_nodes() == len(
    set(df.index.get_level_values('user').tolist())
    | set(df.contactedUser.tolist())
)

## Analyse the graph

In [None]:
fig, ax = plt.subplots()
# g.degree() yields (node, degree) pairs; tally how many nodes have each degree.
cnt = Counter(degree for _, degree in g.degree())
# Cover every degree from 0 up to the maximum so the x axis has no gaps
# (a missing degree counts as 0 in the Counter).
x = tuple(range(max(cnt) + 1))
y = tuple(cnt[degree] for degree in x)
ax.bar(x, y)
ax.set_xlabel('Node degree')
ax.set_ylabel('Counts')
countsOnBarPlot(ax)

# Community detection

Find communities built from 5-cliques, where two cliques belong to the same community if they can be reached through adjacent cliques sharing 4 nodes.

[From Documentation](http://networkx.readthedocs.io/en/latest/reference/generated/networkx.algorithms.community.kclique.k_clique_communities.html#networkx.algorithms.community.kclique.k_clique_communities):

> Find $k$-clique communities in graph using the percolation method.
> 
> A $k$-clique community is the union of all cliques of size $k$ that can be reached through adjacent (sharing $k-1$ nodes) $k$-cliques.

In [None]:
# One row per 5-clique community, largest community first.
kcDf = pd.DataFrame(
    sorted(
        nx.algorithms.community.k_clique_communities(g, 5),
        key=len,
        reverse=True,
    )
)
kcDf.columns.name = 'users'
kcDf.index.name = 'communityNumber'
disp(kcDf.head())

# Draw the subgraph spanned by the members of the first (largest) community.
gsc = g.subgraph(kcDf.iloc[0])
nxQuickDraw(gsc)

# Community size = number of non-null cells in the row.
kcDf['communitySize'] = kcDf.count(axis=1)

# Clique detection

Could be used for analysis of a larger network

In [None]:
# One row per clique, with its size (number of non-null members) appended,
# ordered from the largest clique to the smallest.
cliqueDf = (
    pd.DataFrame(nx.clique.find_cliques_recursive(g))
      .assign(cliqueSize=lambda cliques: cliques.count(axis=1))
      .sort_values('cliqueSize', ascending=False)
      .reset_index(drop=True)
)

# Bar plot of how many cliques exist for each clique size.
ax = (
    cliqueDf.cliqueSize
            .value_counts()
            .sort_index()
            .plot.bar(rot=0)
)
ax.set_xlabel('Clique size')
ax.set_ylabel('Counts')
countsOnBarPlot(ax)

# Keep only cliques with more than two members.
cliqueDf = cliqueDf.loc[cliqueDf.cliqueSize > 2]

## Choose a clique with 5 users and make a subgraph 

In [None]:
# Preview of the event frame.
# NOTE(review): df has not been modified since the self-contact filter, so
# this preview looks redundant.
df.head()

In [None]:
# Take the second 5-clique (row position 1) and list its members; dropna()
# removes the padding cells of the clique table.
chosenUserLst = (
    cliqueDf.loc[cliqueDf.cliqueSize == 5]
            .drop('cliqueSize', axis=1)
            .iloc[1]
            .dropna()
            .tolist()
)
print("Chosen users:", *chosenUserLst, sep='\n')

# Draw the subgraph spanned by the chosen users, with node labels.
gs = g.subgraph(chosenUserLst)
nxQuickDraw(gs, plotSettings={'with_labels': True})

In [None]:
# Preview of the event frame before it is cropped in time.
# NOTE(review): identical to the preceding preview; df is unchanged in between.
df.head()

Crop the data to exclude the period with low activity.

In [None]:
# Restrict the events to the window of interest, count them per calendar
# date, and plot the counts in chronological order.
ax = (
    df[(df.timestamp > '2013-08-10') & (df.timestamp < '2013-09-20')]
      .timestamp.dt.date
      .value_counts()
      .sort_index()
      .plot.bar()
)
ax.set_xlabel('Date')
ax.set_ylabel('Communication events')

It seems that the usage spiked around 21/08/2013.

In [None]:
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2)

# Left panel: timestamp distribution of the full data set.
df.timestamp.hist(bins=180, ax=ax0, xrot=45)
ax0.set_title('Before date cropping')

# Discard everything up to 2013-08-21, then show the same histogram for the
# cropped data in the right panel.
df = df.loc[df.timestamp > '2013-08-21']
df.timestamp.hist(bins=180, ax=ax1, xrot=45)
ax1.set_title('After date cropping')

### Determine start time offset for the binning 

Find the first occurring communication…

In [None]:
# Timestamp of the earliest event remaining after the date cropping.
t0 = df.timestamp.min()
t0

…and choose the corresponding day...

In [None]:
# Midnight at the start of the day on which the first event occurred:
# t0.date() drops the time of day, and pd.Timestamp turns it back into a timestamp.
t0d = pd.Timestamp(t0.date())
t0d

Since the timeint is in seconds, but Pandas keeps its records in nanoseconds, the integer representation of the date needs to be divided by $10^9$.

To check that this is indeed true, compare the integer-cast value of `t0` with the timeint of the corresponding row:

In [None]:
# Compare t0 (converted from nanoseconds to whole seconds) with the timeint of
# the earliest row.
# * Integer floor division keeps the nanosecond count exact; dividing by the
#   float 1e9 first converts it to float64.
# * The row is selected by position: the (user, comtype) index labels are not
#   unique, so a label lookup through idxmin() returns every row carrying that
#   label, and the first of those is not guaranteed to be the earliest event.
np.int64(t0.value // 10**9) == df.timeint.iloc[np.argmin(df.timestamp.values)]

Since it was true, use the following for the bin time start

In [None]:
# Convert both timestamps from nanoseconds to whole seconds.
# Integer floor division is exact, whereas dividing by the float 1e9 first
# converts the 64-bit nanosecond count to float64, which cannot represent
# every such value.
t0d = np.int64(t0d.value // 10**9)
t0 = np.int64(t0.value // 10**9)
t0d

Binning is simply performed by integer division with a suitable bin width.
I choose 8 hours:

In [None]:
# Bin width: 8 hours expressed in seconds.
bw8h = 60**2*8
# Bin number = whole bin widths elapsed since midnight of the first day (t0d).
# assign() returns a new frame instead of writing a column into the
# date-filtered frame, which avoids pandas' SettingWithCopyWarning.
df = df.assign(tbin=(df.timeint - t0d) // bw8h)
df.head()

In [None]:
# The first five cliques that have exactly five members.
ccdf = cliqueDf.loc[cliqueDf.cliqueSize == 5].head(5)

In [None]:
# Display the selected five-member cliques.
ccdf

In [None]:
# Run the PCA for the selected cliques, using the `tbin` column for the time bins.
# The result is a mapping that is later iterated as (community, pca) pairs.
# NOTE(review): presumably one PCA is fitted per row of ccdf; confirm against
# speclib's communityDf2Pca.
dct = communityDf2Pca(df, ccdf, 'tbin')

In [None]:
# Inspect the first (community, pca) entry of the result.
key, pca = list(dct.items())[0]
# Shown: the community key, the PCA object, the shape of its component matrix,
# and the first three columns of that matrix.
# NOTE(review): these columns were originally described as the "three most
# dominant components"; that holds only if components are stored column-wise.
disp(key, pca, pca.components_.shape, pca.components_[:, :3])

In [None]:
# evr = pca.explained_variance_ratio_
# n = (np.cumsum(evr) <= evr.sum()*0.98).sum()  # number of vectors need for 98 % explanation of variance (?)
# firstN = np.abs(pca.components_[:, :n])
# 
# fig, ax = plt.subplots()
# pc = ax.pcolorfast(pca.components_[:, :n], cmap='RdBu_r', vmin=-np.abs(firstN).max(), vmax=np.abs(firstN).max())
# fig.colorbar(pc) 
# 
# firstN += firstN.min()
# firstN[firstN < 1e-6] = 0.0
# firstN /= firstN.sum()
# firstN *= n
# graphLst = [nx.from_numpy_matrix(upperTril2adjMat(firstN[:, i])) for i in range(n)] 
# 
# layout = nx.drawing.layout.circular_layout(graphLst[0])
# 
# # weighFunc = lambda w: (2*w)**(1.5) + 1
# weighFunc = lambda w: 5*w + 0.5
# for i in range(n):
#     fig, ax = plt.subplots(figsize=(10, 6))
#     edgeLabels = {edge: '{:.0f}'.format(np.round(1000*weight)) for (edge, weight) in
#                   nx.get_edge_attributes(graphLst[i], 'weight').items()}
#     drawWeightedGraph(graphLst[i], ax=ax, layout=layout, normailzeWeights=False, weightFunc=weighFunc,
#                       nodeLabels=True, edgeLabels=edgeLabels)
#     fig.suptitle(f'Vector {i+1}/{n}') 
# 
# fig, axi = plt.subplots(2, 1)
# for ax, n, lbl in zip(axi, [1, 6], ['', ' (smoothed)']):
#     ax.plot(np.convolve(np.ones(n)/n, pca.norm_mean, 'same'), label='norm_mean' + lbl, color='#20a365')
#     ax.plot(np.convolve(np.ones(n)/n, pca.norm_std, 'same'), label='norm_std' + lbl, color='#ea8a3f')
#     ax.legend(loc='best')
# 
# fig, axi = plt.subplots(2, 1)
# for ax, n, lbl in zip(axi, [1, 6], ['', ' (smoothed)']):
#     ax.plot(np.convolve(np.ones(n)/n, pca.norm_mean, 'same'), label='norm_mean' + lbl, color='#20a365')
#     ax.plot(np.convolve(np.ones(n)/n, pca.norm_std, 'same'), label='norm_std' + lbl, color='#ea8a3f')
#     ax.legend(loc='best')


In [None]:
# One markdown header followed by the PCA plots for every community.
for com, pca in dct.items():
    header = '### Community:' + ', '.join(com)
    mdisp(header)
    pcaPlot = PcaPlotter(pca)
    pcaPlot.plotHeatmap()
    pcaPlot.plotStandardization()
    # plotGraphs() is consumed to the end so that all of its figures are created.
    for _graphPlot in pcaPlot.plotGraphs():
        pass
    # Force the figures to be drawn before the next iteration prints its header.
    plt.show()

# TODO:

* Labels on PCA-graphs
* Number of significant vectors in pcolor-plot
* Detect "interesting" communities