In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import itertools

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import bottleneck as bn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import networkx.algorithms.approximation as nxa
import igraph as ig
# http://stackoverflow.com/questions/35279733/what-could-cause-networkx-pygraphviz-to-work-fine-alone-but-not-together
from networkx.drawing.nx_agraph import graphviz_layout
from sklearn import decomposition
import seaborn as sns
sns.set_style("darkgrid", {"axes.facecolor": ".95"})
%matplotlib inline

import missingno as msno
from collections import defaultdict, Counter

from IPython.display import display, Markdown, HTML

from speclib.loaders import (loadUserPhonenumberDict, getUserList, Useralias,
                             loadUserParallel, dict2DataFrame, users2DataFrame)
from speclib.plotting import looseAxesLimits, barSBS, countsOnBarPlot, plotNeatoGraph, nxQuickDraw
from speclib.graph import networkx2igraph, igraph2networkx, userDF2nxGraph, userDF2activityDataframe
from speclib.misc import nanEqual, timedelta2unit, standardizeData


# Notebook-wide matplotlib defaults: font sizes, figure size and resolution.
mpl.rcParams.update({
    'font.size': 13.0,
    'legend.fontsize': 13.0,
    'axes.labelsize': 12.0,
    'axes.titlesize': 15.0,
    'figure.figsize': [16.0, 7.0],
    'figure.dpi': 200,
    'figure.titlesize': 'large',
    'xtick.labelsize': 13.0,
    'ytick.labelsize': 13.0,
})

%load_ext watermark
%watermark -a "Allan Leander Rostock Hansen" -u -d -v -p numpy,bottleneck,pandas,matplotlib,sklearn,missingno,networkx,igraph

# Loading and Cleaning the Data

## Loading the Data

Load user SMS and call data.
Load binary data structures stored on disk if they exist and otherwise load the data from the original files.

In [None]:
import pickle

# Cached binary copies of the parsed data. The fast path reads both files,
# so both must be present before it is taken.
dataCachePath = 'phone_df.h5'
aliasCachePath = 'useralias.pk'

if os.path.isfile(dataCachePath) and os.path.isfile(aliasCachePath):
    df = pd.read_hdf(dataCachePath, 'df')
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load caches that this project wrote itself.
    with open(aliasCachePath, 'rb') as fid:
        ua = pickle.load(fid)
else:
    # Slow path: parse the SMS and call data of every user from the original files.
    ua = Useralias()
    userSpec = [(user, ua[user], ('sms', 'call')) for user in getUserList()]
    userData = loadUserParallel(userSpec)
    df = users2DataFrame(userData, ua)
    del userData
phonebook = loadUserPhonenumberDict(ua)

In [None]:
# Show the first rows of the loaded frame.
df.head() 

## Cleaning data

Make a subset of the data only containing communications contained within the Social Fabric project. Of this data, select a subset containing the most active users, preferably users who communicate with each other… a clique percolation algorithm could be used for this, but that won't be the initial approach.

A measure of the activity could simply be
$$ a = \sum_{\text{i}}\frac{\mathrm{user_{sms}}_i}{\sum_i \mathrm{user_{sms}}_i} + \frac{\mathrm{user_{call}}_i}{\sum_i \mathrm{user_{call}}_i} $$

but this could yield a huge $a$ for a very active, yet weakly connected user, so a weighting with the number of contacted people should be introduced.

Another approach is to naively sum up all events between users.

Since a conversation using SMS (usually) registers as several events for both users, whereas a conversation carried out over a call registers as one event, a weighting should be introduced.
The easy solution is to divide the adjacency matrices with the sum of all the entries, meaning that the sum of all the elements would both add up to one.
Yet another approach would be to clean the SMS data in the following way:

1. Investigate the distribution of time between a SMS and a reply to it.
2. Use the distribution to determining a typical reply time.
3. Remove entries in the SMS data which weren't replied to within some number, say 3, times the average reply time.

Cleaning the SMS data as proposed above, should also prompt for a similar cleaning of the call data.
An obvious way would be to remove unanswered calls, albeit the SMS dataset should also be checked for an "answer".

### Doing the data munging

Remove rows for which the contacted number is not present in `phonebook` (userhash to phonehash translation table).

Also add a column which contains the useralias (`u0001`, `u0345` and so on) for the contacted user.

In [None]:
# Keep only events whose contacted number is present in `phonebook`.
inPhonebook = df.number.apply(lambda num: num in phonebook)
# .copy() detaches the filtered frame, so the column assignment below writes to
# an independent frame instead of to a slice of the unfiltered one.
df = df[inPhonebook].copy()
# Translate each contacted number through `phonebook`.
df['contactedUser'] = df.number.apply(lambda num: phonebook[num])
df.head()

Count number of unique contacts for each user.
This is what's called the degree distribution in graph theory.

In [None]:
def _countUniqueContacts(user, comType):
    """Return the number of distinct `contactedUser` values for `user` and `comType`.

    Returns 0 when the (user, comType) combination is missing from `df`
    (the lookup raises KeyError in that case).
    """
    try:
        return df.loc[user, comType].contactedUser.unique().size
    except KeyError:
        return 0


allUsers = df.index.get_level_values('user').unique()

# One row per user: number of distinct contacts by call, by sms, and their sum.
userUniqueComm = pd.DataFrame(
    pd.Series({user: _countUniqueContacts(user, 'call') for user in allUsers}),
    columns=('call',))
userUniqueComm['sms'] = pd.Series({user: _countUniqueContacts(user, 'sms') for user in allUsers})
userUniqueComm['total'] = userUniqueComm.sms + userUniqueComm.call

# DataFrame.sort(columns=...) no longer exists in pandas; sort_values is the replacement.
userUniqueComm = userUniqueComm.sort_values(by='total', ascending=False)
display(userUniqueComm.head(), userUniqueComm.describe())

Plot the findings, using two different plot styles, using both normal and log scaling.

In [None]:
fig = plt.figure(figsize=(14, 8))
gridShape = (12, 2)
ax0 = plt.subplot2grid(gridShape, (0, 0), rowspan=4)
ax1 = plt.subplot2grid(gridShape, (0, 1), rowspan=4)
# Bottom panel spans both columns and rows 6-11. The spans are kept inside the
# 12x2 grid instead of relying on out-of-range spans being clipped.
ax2 = plt.subplot2grid(gridShape, (6, 0), colspan=2, rowspan=6)

# Per-channel counts without the summed 'total' column.
perChannel = userUniqueComm.drop('total', axis=1)

# Normal scaling
perChannel.plot.hist(bins=int(perChannel.max().max()), stacked=True, ax=ax0)
ax0.set_xlabel('Connectivity degree')
perChannel.plot.area(ax=ax1)
ax1.set_xlabel('Users')
ax1.set_ylabel('Connectivity')
fig.suptitle('Unique users conacted using Calls and SMS')

# Log scaling
userUniqueComm.plot.line(ax=ax2)
ax2.set_yscale('log')
ax2.grid(which='minor')
ax2.set_xlabel('Users')
ax2.set_ylabel('Connectivity')

A threshold of 20 unique contacts is used to select the most active users

In [None]:
# Users with strictly more than this many unique contacts are kept.
totalCommunicationThreshold = 20
isActive = userUniqueComm['total'] > totalCommunicationThreshold
chosenUsers = userUniqueComm.index[isActive]
print(*chosenUsers, sep='\t')

### Compute adjacency matrices
Construct DataFrames for adjacency matrices/graphs for call and SMS data, where the index is the user initiating contact, and the columns is the users targeted by said contact.
Selected users is limited to previously chosen active users.

In [None]:
def _contactCounts(user, comType):
    """Return {contactedUser: number of events} for `user` and `comType`.

    Returns None when the user has no data of that communication type.
    """
    if comType not in df.loc[user].index:
        return None
    return df.loc[user, comType].contactedUser.value_counts().to_dict()


def _adjacencyFrame(comType):
    """Build the adjacency-like DataFrame of the chosen users for `comType`.

    Rows (index, 'userInit') are the users initiating contact, columns
    ('userRec') are the contacted users; entries are event counts.
    """
    counts = {}
    for user in chosenUsers:
        userCounts = _contactCounts(user, comType)
        if userCounts is not None:
            counts[user] = userCounts
    # orient='index' makes the outer keys (initiating users) the rows. The plain
    # DataFrame constructor would put them in the columns, contradicting the
    # 'userInit'/'userRec' labels assigned below.
    adj = pd.DataFrame.from_dict(counts, orient='index')
    adj.index.name = 'userInit'
    adj.columns.name = 'userRec'
    # Drop contacted users which aren't present in the index (contact initiating users)
    return adj.drop(list(set(adj.columns) - set(adj.index)), axis=1)


adCall = _adjacencyFrame('call')
adSms = _adjacencyFrame('sms')

Add a column where the activity level for each user is summed up.

In [None]:
def _withActivityFirst(adj):
    """Return `adj` with an 'activity' column placed first and the user columns sorted.

    The activity values are the per-column sums of the user columns, aligned on
    the frame's index by label (same computation as `adj.sum(axis=0)`).
    Columns are reordered by selection, so data and labels move together;
    assigning sorted labels to `.columns` would relabel without moving the data.
    """
    userCols = sorted(col for col in adj.columns if col != 'activity')
    out = adj[userCols].copy()
    out['activity'] = adj[userCols].sum(axis=0, skipna=True)
    return out[['activity'] + userCols]


adCall = _withActivityFirst(adCall)
adSms = _withActivityFirst(adSms)

display(adCall.head(), adSms.head())

### Construct graph objects
A NetworkX graph is constructed from the DataFrame with the adjacency-matrix like data.
The call and sms data is combined.

In [None]:
# Combine call and SMS counts. A plain `adCall + adSms` yields NaN wherever one
# of the two frames has no entry, which would keep a link only when a pair used
# BOTH channels. fill_value=0 treats a missing count as zero; cells missing in
# both frames stay NaN.
adf = adCall.add(adSms, fill_value=0)
adf.head()

In [None]:
# Drop the activity column and reduce the counts to a binary "has contact"
# indicator (int8 for display purposes).
isUserColumn = adf.columns != 'activity'
adfNoActivity = (adf.loc[:, isUserColumn] > 0).astype(np.int8)
display(adfNoActivity.head())

# Map every contact-initiating user (adf.index) to the list of users they contacted.
dct = {}
for iUsr in sorted(adf.index.unique()):
    contactFlags = adfNoActivity.loc[iUsr].astype(bool)
    dct[iUsr] = contactFlags.index[contactFlags].tolist()
g = nx.from_dict_of_lists(dct)  # construct graph

# Delete temporary variables
del dct, adfNoActivity

Plot the network

In [None]:
# Draw the graph g using the nxQuickDraw helper from speclib.plotting.
nxQuickDraw(g) 

Clearly some nodes aren't connected to the network – their contacts probably didn't meet the "choose any users with more than 20 individual contacts"-criterion.

Nodes with no connections (that is, nodes with degree 0) are removed.

In [None]:
# Collect the degree-0 nodes first, then remove them in one call.
isolatedNodes = [node for node, degree in dict(g.degree()).items() if degree == 0]
g.remove_nodes_from(isolatedNodes)

Verify by plotting the network again

In [None]:
# Draw g again after the degree-0 nodes were removed.
nxQuickDraw(g) 

This indeed looks like nodes with degree 0 are removed.

## Apply clique algorithms to find most active users

<!--
* Two algorithms is used.
* I use the users returned from the biggest groups from both (14 users).
* I also investigate the number overlap inbetween the two algorithms wrt. cliques and users.
-->

### Verify results from NetworkX using the iGraph library

In [None]:
# Use a dedicated name for the converted graph: binding it to `ig` would
# overwrite the `igraph` module alias created by `import igraph as ig`.
igGraph = networkx2igraph(g)

# Compare the clique size distributions reported by both libraries.
igraphCliques = igGraph.maximal_cliques()
igraphCounter = Counter(len(clique) for clique in igraphCliques)
networkxCounter = Counter(len(clique) for clique in nx.algorithms.find_cliques(g))
if networkxCounter == igraphCounter:
    display(Markdown('Igraph and Networkx yields identical results.'))
else:
    display(Markdown('Igraph and Networkx yields different results!'))

### Using NetworkX

In [None]:
# One row per maximal clique; shorter cliques are padded with missing values.
clqdf = pd.DataFrame(list(nx.algorithms.find_cliques(g)))
# Clique size = number of non-missing entries in the row.
clqdf['cliquesize'] = clqdf.notnull().sum(axis=1)
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values is the replacement.
clqdf = clqdf.sort_values(by='cliquesize', ascending=False)
clqdf.head()

### Investigate the cliques

Plot clique size distribution

In [None]:
fig, ax = plt.subplots()
# value_counts() orders by frequency; sort by clique size so the x axis reads
# as a size distribution.
cliqueSizeCounts = clqdf.cliquesize.value_counts().sort_index()
cliqueSizeCounts.plot.bar(ax=ax, rot=0)
ax.set_title("Clique size distribution")
looseAxesLimits(ax, [0.0, 0.0, 0.0, 0.1])
countsOnBarPlot(ax)
ax.set_xlabel('Clique size')
ax.set_ylabel('Number of cliques')

What is the average number of connections vs. clique size?
The DataFrame containing the cliques is as follows.

Calculating the mean connectivity and standard deviation for each clique…

In [None]:
# For every clique: its size plus the mean and standard deviation of the
# degrees (in g) of its member nodes.
lst = []
for rowLabel, cliqueRow in clqdf.iterrows():
    # Address the size column by label rather than by position, and drop the
    # missing-value padding so only real member nodes are looked up.
    cliqueSize = int(cliqueRow['cliquesize'])
    members = cliqueRow.drop('cliquesize').dropna().tolist()
    cliqueConnLst = [degree for node, degree in nx.degree(g, members)]
    lst.append((cliqueSize, np.mean(cliqueConnLst), np.std(cliqueConnLst)))

clqStatDf = pd.DataFrame(lst, columns=['cliqueSize', 'cliqueMean', 'cliqueStd'])
del lst
clqStatDf.head()

… and taking the mean for each group size…

In [None]:
# Average the per-clique mean and std columns within each clique size.
clqStatDfMean = clqStatDf.groupby('cliqueSize').mean()
clqStatDfMean

…which can be visualized as a violin plot

In [None]:
def _overlayMeanStd(ax):
    """Overlay the per-clique-size mean with std error bars on `ax` and add the legend.

    Returns the legend object.
    """
    xPositions = range(clqStatDfMean.shape[0])
    (_, caps, _) = ax.errorbar(xPositions, clqStatDfMean.cliqueMean, clqStatDfMean.cliqueStd,
                               fmt='o', color='orange', elinewidth=2, alpha=0.75, barsabove=True,
                               capsize=5, label=r'$\mu$ and $\sigma$')
    # Put end caps on the error bars
    for cap in caps:
        cap.set_markeredgewidth(2)
    return ax.legend(loc='best')


# Take 1: violins with the individual observations drawn as sticks
fig, ax = plt.subplots()
sns.violinplot(x='cliqueSize', y='cliqueMean', data=clqStatDf, split=True, scale='count',
               inner='stick', color='0.8', ax=ax)
_overlayMeanStd(ax)

# Take 2: empty violins with a jittered strip plot on top
fig, ax = plt.subplots()
sns.violinplot(x="cliqueSize", y="cliqueMean", data=clqStatDf, inner=None, color="0.8",
               scale='count', ax=ax)
sns.stripplot(x="cliqueSize", y="cliqueMean", data=clqStatDf, jitter=0.04, ax=ax,
              color='k', size=3, alpha=0.5)
_overlayMeanStd(ax)

And plotting a histogram of the clique mean across all clique sizes clearly shows a bimodal distribution.

In [None]:
fig, ax0 = plt.subplots()
ax1 = ax0.twinx()

# RGB colours scaled to the 0-1 range: a lighter tone for the grid and a darker
# one for the plot, per axis.
histGridColor = np.array([242, 182, 138]) / 255
histPlotColor = np.array([237, 153, 92]) / 255
densGridColor = np.array([205, 128, 146]) / 255
densPlotColor = np.array([184, 73, 99]) / 255

clqStatDf.cliqueMean.plot.hist(bins=75, ax=ax0, color=histPlotColor, edgecolor='w')
clqStatDf.cliqueMean.plot.density(ax=ax1, color=densPlotColor)

# Colour grid, ticks and y-label of each axis to match the curve drawn on it.
axisStyles = ((ax0, histGridColor, histPlotColor),
              (ax1, densGridColor, densPlotColor))
for axis, gridColor, plotColor in axisStyles:
    axis.grid(axis='y', color=gridColor)
    axis.tick_params(axis='y', colors=plotColor)
    axis.yaxis.label.set_color(plotColor)
ax0.set_xlabel('Connectivity')

### Choose users from the largest clique: 6
Remove users which are not in a clique with size 6 or larger.

In [None]:
# Keep only the cliques with at least this many members.
minCliqueSize = 6
clqdf = clqdf.loc[clqdf['cliquesize'] >= minCliqueSize]
clqdf.head()

* Make an array containing all the users in the selected clique(s) only once.
* From that array, generate a new array which includes all the contacts of those users.
* Extract a subgraph for those users.

In [None]:
# Unique members of the selected cliques (missing-value padding removed).
cliqueMembers = clqdf.drop('cliquesize', axis=1).values.ravel()
coreUsers = pd.Series(cliqueMembers).dropna().unique()

# Unique neighbours in g of all core users, in order of first appearance.
neighbourLst = [neighbour for user in coreUsers for neighbour in nx.neighbors(g, user)]
remoteUsers = pd.Series(neighbourLst).dropna().unique()

# Print the choice and the lengths of the arrays
print('Core users in network ({} users):'.format(coreUsers.size))
print(*coreUsers, sep='\t', end='\n'*2)
print('Core users and their connections ({} users):'.format(remoteUsers.size))
print(*remoteUsers, sep='\t', end='\n'*2)

Ensure that the core users are included in the remote users

In [None]:
# Report whether every core user also appears among the remote users.
coreInRemote = set(coreUsers) <= set(remoteUsers)
message = ('All core users are contained in remote users.' if coreInRemote
           else 'Remember to combine core users and remote users for subgraph extraction!')
display(Markdown(message))

This is expected, since the core users is a clique, and thus will be included among the links from the other users in the clique.

Make the subgraph, and verify it by plotting it.

In [None]:
# The subset of the graph on which a PCA analysis should be performed on the users.
# Graph.subgraph returns a read-only view in networkx 2.x; .copy() gives an
# independent graph that can be modified (a node is removed from gs further down).
gs = g.subgraph(remoteUsers).copy()

nx.draw(gs, with_labels=True, node_color='lightblue', edge_color='lightgray', node_size=150)

In [None]:
# Degree of every node in the subgraph, highest first.
gsDegdf = pd.DataFrame(list(gs.degree()), columns=['user', 'degree'])
# DataFrame.sort(columns=...) no longer exists in pandas; sort_values is the replacement.
gsDegdf = gsDegdf.set_index('user').sort_values(by='degree', ascending=False)
display(gsDegdf.head())

degMin = int(gsDegdf['degree'].min())
degMax = int(gsDegdf['degree'].max())

fig, ax0 = plt.subplots()
# One bin per integer degree value.
ax0.hist(gsDegdf['degree'],
         range=(degMin, degMax + 1),
         bins=degMax + 1 - degMin,
         edgecolor='white')
ax0.minorticks_on()
ax0.grid(True, which='both')
ax0.set_xticks(range(0, degMax + 2, 5))
ax0.set_xbound(0, degMax + 2)
ax0.set_xlabel('Connectivity degree')
ax0.set_ylabel('Number of users')
ax0.set_title('Conectivity vs number of users')

# Do the PCA analysis

Obtain the adjacency matrix for the chosen network and plot it.
Clearly one user is _very_ connected.
Consider removing said user.

In [None]:
# nx.adj_matrix is a deprecated alias (removed in networkx 3); adjacency_matrix
# is the canonical name and returns the same sparse matrix.
gsa = nx.adjacency_matrix(gs)
fig, ax = plt.subplots(figsize=(4, 4), dpi=150)
ax.pcolorfast(gsa.todense())

In [None]:
# (node, degree) pair with the highest degree in the subgraph gs.
tmp = max(gs.degree(), key=lambda nd: nd[1]) 
tmp

In [None]:
# Remove the most connected node found above. This mutates gs in place, so
# re-running this cell alone would try to remove the same node again.
gs.remove_node(tmp[0])

In [None]:
# Adjacency matrix of gs after the most connected node was removed.
# nx.adj_matrix is a deprecated alias (removed in networkx 3); adjacency_matrix
# is the canonical name and returns the same sparse matrix.
gsa = nx.adjacency_matrix(gs)
fig, ax = plt.subplots(figsize=(4, 4), dpi=150)
ax.pcolorfast(gsa.todense())

## Binning activities
### Do the time binning of activities
Now the chosen users' activities are about to be time binned.
Bins will initially be six hours wide, starting from midnight.

In [None]:
# Select the events of the remote users from the full frame.
# NOTE(review): remoteUsers was computed before the most connected node was
# removed from gs, so that user's events are still included here — confirm
# this is intended.
dfa = df.loc[remoteUsers.tolist()]
dfa.head() 

There's some times in the timebin which needs to be removed, since the project didn't exist in 1970!

In [None]:
# Summary statistics of the event timestamps.
dfa.timestamp.describe() 

In [None]:
# Number of events per calendar year, with the counts written on the bars.
dfa['year'] = dfa.timestamp.dt.year
fig, ax = plt.subplots()
dfa.year.value_counts().plot.bar(ax=ax)
countsOnBarPlot(ax)

Let's examine the 2011 and 2012 data points. There's so few that they seem suspicious.

In [None]:
# Which users produced the events dated 2011 or 2012?
dfa[(dfa.year == 2011) | (dfa.year == 2012)].reset_index().user.unique() 

Indeed there's only one user responsible for these points, so it's likely that the user's phone just had some wrong date settings. This data will be removed, and only data from 2013 and onwards will be considered.

In [None]:
# Keep events from 2013 onwards. .copy() detaches the filtered frame so that
# columns added to dfa later are written to an independent frame rather than to
# a slice of the unfiltered one.
dfa = dfa[dfa.year >= 2013].copy()

Find minimum timestamp

In [None]:
# Earliest timestamp in dfa.
dfa.timestamp.min() 

Take the corresponding unix time integer, use the modulo operator for 6-hour intervals, subtract the remainder from the integer time, and cast back to a Timestamp.
This is the starting point for the time binning.

In [None]:
binWidth = 6 * 3600  # 6 hours in seconds
# Round the earliest event time down to a whole multiple of the bin width.
firstEvent = dfa.timeint.min()
startTime = firstEvent - firstEvent % binWidth
startStamp = pd.Timestamp(startTime, unit='s')
display(startStamp, dfa.timestamp.min() - startStamp)

Compute the timebins using integer division and examine the timebins:

In [None]:
# Index of the bin (of width binWidth, counted from startTime) each event falls in.
dfa['timebin'] = (dfa.timeint - startTime) // binWidth

display(dfa.timebin.describe())

fig, ax0 = plt.subplots()
hy, hx = np.histogram(dfa.timebin, bins=400)
hx = hx[1:]  # use the right bin edges as x positions
ax0.bar(hx, hy, width=10, label='Timebins')
ax1 = ax0.twinx()
# Running sum over 8 neighbouring histogram bins.
hy = np.convolve(np.ones(8), hy, 'same')
# 'C1' is the second colour of the property cycle: the colour previously
# obtained by advancing the private ax0._get_lines cycler twice.
ax1.plot(hx, hy, color='C1', label='Timebins (smoothed)')
ax0.set_xlabel('Timebins')
ax0.set_ylabel('Timebins')
ax1.set_ylabel('Timebins (smoothed)')
ax0.grid(axis='x')
ax1.grid(False)

Also visualize using a density kernel plot

In [None]:
# Kernel density estimate of the timebin values.
dfa.timebin.plot.density() 

### Construct timeseries

In [None]:
# Rows: remote users, columns: every timebin between the first and the last one,
# values: number of events of that user in that timebin.
timebinRange = np.arange(dfa.timebin.min(), dfa.timebin.max() + 1)
userCounts = {user: dfa.loc[user].timebin.value_counts() for user in remoteUsers}
# Building from the per-user count Series gives a numeric (float) frame; a frame
# created empty and filled row by row stays object-typed, which np.log10 rejects.
dfts = pd.DataFrame(userCounts).T.reindex(index=remoteUsers, columns=timebinRange)

display(dfts.head())

# Timebins without events count as zero activity (np.NaN no longer exists in NumPy 2).
dfts = dfts.fillna(0.0).astype(float)

fig, (ax0, ax1) = plt.subplots(1, 2)
panels = ((ax0, dfts.values, r'$\log_{10}$ user activity'),
          (ax1, np.cumsum(dfts.values, axis=1), r'$\sum \log_{10}$ user activity'))
for ax, values, title in panels:
    ax.grid(axis='y')
    # Zero counts become -inf under log10; they are left to show the black background.
    ax.set_facecolor('black')
    ax.pcolorfast(np.log10(values), cmap=mpl.cm.binary_r)
    ax.set_xlabel('Timebins')
    ax.set_ylabel('Users')
    ax.set_yticklabels([])
    ax.set_title(title)


Converting the binned data to a format which is usable with the PCA algorithm

Testing that the reshaping does the right thing:

In [None]:
def mat2PcaVec(x):
    """Reshape array `x` into a single row, i.e. shape (1, x.size)."""
    return x.reshape((1, -1))

In [None]:
# Small 3x4 example matrix shown next to its flattened row-vector form.
tmp = np.arange(12).reshape((3, -1))
display(tmp, mat2PcaVec(tmp))

In [None]:
# Standardize every column (timebin) to zero mean and unit standard deviation.
toPca = dfts.values
toPcaMean = toPca.mean(axis=0)
toPcaStd = toPca.std(axis=0)
# Constant columns have zero spread; divide those by 1 instead of 0.
toPcaStd = np.where(toPcaStd == 0, 1.0, toPcaStd)
toPca = np.divide(toPca - toPcaMean, toPcaStd)

In [None]:
fig, (ax0, ax1) = plt.subplots(1, 2)

# Left: how many entries are negative, zero and positive.
flatValues = toPca.ravel()
signValues, signCounts = np.unique(np.sign(flatValues), return_counts=True)
ax0.bar(signValues, signCounts)
ax0.set_xticks([-1, 0, 1])
ax0.set_xlabel('Sign of data entry')
ax0.set_ylabel('Count')

# Right: histogram of the entries on log-log axes.
ax1.hist(flatValues, 100)
ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.grid(which='minor')
ax1.set_xlabel('Value in adjacency matrix')
ax1.set_ylabel('Count')

In [None]:
# Fit a PCA with default settings to toPca and keep the explained variance ratios.
pca = decomposition.PCA().fit(toPca)
explVarRat = pca.explained_variance_ratio_

In [None]:
# Explained variance ratio per principal component.
fig, ax = plt.subplots()
ax.plot(explVarRat, marker='o', linestyle='-')

## Experiment with other grouping/binning options

Combine activity for all hours on a weekly basis

In [None]:
# WH is short for Weekday Hour. Drop the helper columns listed below.
droppedCols = ['timebin', 'year', 'timeint', 'weekday', 'hour']
dfWH = dfa.drop(droppedCols, axis=1)
dfWH.head()

In [None]:
# Group the events by (user, weekday, hour of day). The keys are passed as a
# list: modern pandas interprets a tuple as one single key. The two
# timestamp-derived keys get distinct names so the index levels are unambiguous.
groupKeys = [dfWH.index.get_level_values(0),
             dfWH.timestamp.dt.weekday.rename('weekday'),
             dfWH.timestamp.dt.hour.rename('hour')]
tmp = dfWH.groupby(groupKeys)

# Count the events per group; the remaining count column is renamed to 'events'.
dfWHCnt = tmp.count().drop(['body', 'duration', 'number', 'timestamp'], axis=1)
dfWHCnt = dfWHCnt.rename(columns={'contactedUser': 'events'})
dfWHCnt.index.names = ['user', 'weekday', 'hour']
dfWHCnt.head(10)

In [None]:
hoursPerDay = 24
daysPerWeek = 7
# One row per remote user, one column per (weekday, hour) slot of the week.
whCntMat = np.zeros((remoteUsers.size, daysPerWeek * hoursPerDay))

user2row = {user: row for row, user in enumerate(remoteUsers)}


def user2col(weekday, hour):
    """Return the column index of the (weekday, hour) slot."""
    return hoursPerDay * weekday + hour


# Iterate the 'events' column directly so that a scalar count (not a one-row
# Series) is written into each matrix cell.
for (user, weekday, hour), events in dfWHCnt['events'].items():
    whCntMat[user2row[user], user2col(weekday, hour)] = events

fig, ax = plt.subplots()
ax.pcolorfast(whCntMat)
ax.set_xticks(np.arange(12, whCntMat.shape[1]-5, 12))
ax.set_yticklabels([])
ax.set_xlabel('Bins')
ax.set_ylabel('Users')

In [None]:
# Fit the PCA twice: on the event counts and on the binary "any activity" matrix.
# After this cell `pca` holds the fit on the binary matrix.
pca = decomposition.PCA()
evrFloat = pca.fit(whCntMat).explained_variance_ratio_
evrBool = pca.fit(whCntMat > 0).explained_variance_ratio_

fig, ax = plt.subplots()
ax.plot(evrFloat[:-1], '-o', label='float')  # Last point omitted, practically 0
ax.plot(evrBool[:-1], '-o', label='bool')  # Last point omitted, practically 0
ax.set_xlabel('Vectors')
ax.set_ylabel('Explanied variance ratio')
ax.set_yscale('log')
ax.grid(which='minor')
# The curves carry labels, so show the legend that identifies them.
ax.legend(loc='best')

In [None]:
# Same explained variance ratios on a linear scale, including the last point.
fig, ax = plt.subplots()
ax.plot(evrFloat, '-o', label='float')
ax.plot(evrBool, '-o', label='bool')
ax.set_xlabel('Vectors')
ax.set_ylabel('Explanied variance ratio')
# The curves carry labels, so show the legend that identifies them.
ax.legend(loc='best')

In [None]:
# Does the covariance matrix of the most recent PCA fit contain any NaN?
np.isnan(pca.get_covariance()).any() 

In [None]:
# Show the covariance matrix of the most recent PCA fit with a colour bar.
covariance = pca.get_covariance()
fig, ax = plt.subplots(facecolor='white')
pc = ax.pcolorfast(covariance)
fig.colorbar(pc)

## Build adjacency matrices for each timebin


In [None]:
# Timebins are numbered 0..max inclusive, so there are max + 1 of them.
nTimebins = int(dfa.timebin.max()) + 1
nodeLst = list(remoteUsers)
# One column per timebin; each column is the flattened adjacency matrix of the
# remote users for the events in that timebin.
toPcaMat = np.zeros((len(nodeLst)**2, nTimebins))
for tb in range(nTimebins):
    weekGraph = userDF2nxGraph(dfa[dfa.timebin == tb])
    weekGraph.remove_nodes_from(set(weekGraph.nodes()) - set(remoteUsers))  # remove users not in remoteUsers
    weekGraph.add_nodes_from(remoteUsers)  # make sure every remote user is a node
    # nx.adj_matrix is a deprecated alias (removed in networkx 3).
    adjacency = nx.adjacency_matrix(weekGraph, nodelist=nodeLst)
    # toarray() yields a plain ndarray, so ravel() gives a true 1-D vector.
    toPcaMat[:, tb] = adjacency.toarray().T.ravel()

toPcaMat = standardizeData(toPcaMat)

In [None]:
# Shape of the standardized matrix built above.
toPcaMat.shape

In [None]:
# Fit a PCA with default settings to toPcaMat.
pca = decomposition.PCA()
pca.fit(toPcaMat) 

In [None]:
fig, ax0 = plt.subplots()
ax1 = ax0.twinx()

evr = pca.explained_variance_ratio_
# Show at most the first 300 components; fewer if the PCA produced fewer, so
# that the x and y arrays always have the same length.
n = min(300, evr.size)
xVals = np.arange(n)

# Left axis (blue): cumulative explained variance ratio.
l0 = ax0.plot(xVals, np.cumsum(evr[:n]),
              label=r'$\frac{\sum_{i = 1}^{%d} e_i} {\sum_{i = 1}^{N} e_i}$' % n, color='blue')
# Right axis (green): explained variance ratio of each component.
l1 = ax1.plot(xVals, evr[:n],
              label=r'$\frac{e_i}{\sum_{i=1}^{N} e_i}$', color='green')
# Collect the handles of both axes into a single legend.
ax0.legend([l0[0], l1[0]], [l0[0].get_label(), l1[0].get_label()], fontsize='xx-large', loc='right')
ax0.grid(which='minor')
ax0.set_xlabel('Eigenvalue #')
ax0.tick_params(axis='y', colors='blue')
ax1.tick_params(axis='y', colors='green')
ax0.yaxis.label.set_color('blue')
ax1.yaxis.label.set_color('green')
ax0.grid(axis='y', color='blue', alpha=0.4)
ax1.grid(axis='y', color='green', alpha=0.4)