In [None]:
import os
import sys
sys.path.append(os.path.abspath(".."))
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# mpl.style.use("bmh")
import seaborn as sns
import networkx as nx
import missingno as msno
%matplotlib inline

from speclib.loaders import loadUser, loadUserPhonenumberDict, getUserList

In [None]:
def linkUser(userDct, linkDctLst, phDct):
    """Record which project participants a user has called and texted.

    Every call/sms event whose phone number is found in ``phDct`` increments
    the matching counter ``linkDct[event["user"]][contactedUser]`` in place.

    Args:
        userDct (dict): Indexed with "call" and "sms". Each value is either
            None or a sequence of events; a call event is indexed with
            "number" and "user", an sms event with "address" and "user".
        linkDctLst (sequence): Two defaultdict(Counter) objects, the first
            collecting call links and the second sms links. They are
            updated in place.
        phDct (dict): Dictionary with phoneID -> userID for participants in
            the project.

    Returns:
        list: One entry per communication type, ordered (call, sms). Each
            entry is the fraction of events that went to a participant in
            the project, or None when the event list is None or empty.
    """
    def tallyInProject(events, links, numberKey):
        # No data for this communication type: report that, not a zero count
        if events is None:
            return None
        hits = 0
        for ev in events:
            number, sender = ev[numberKey], ev["user"]
            if number not in phDct:
                continue  # the contacted number is not a project participant
            # links is a defaultdict(Counter), so unseen pairs start at zero
            links[sender][phDct[number]] += 1
            hits += 1
        return hits

    # The phone number is stored under "number" for calls and "address" for sms
    channels = (("call", "number"), ("sms", "address"))
    fractions = []
    for (channel, numberKey), links in zip(channels, linkDctLst):
        events = userDct[channel]
        hits = tallyInProject(events, links, numberKey)
        # A fraction is undefined without data or with an empty event list
        if hits is None or len(events) == 0:
            fractions.append(None)
        else:
            fractions.append(hits / len(events))
    return fractions

In [None]:
# Phone number -> participant lookup, and the users whose data gets loaded
phDct = loadUserPhonenumberDict()
userLst = getUserList()
keyMap = {phoneKey: "u{:04d}".format(idx) for idx, phoneKey in enumerate(phDct.keys())}

# Link counters filled in place by linkUser: outer key is the communicating
# user, inner key the contacted participant.
callDct = defaultdict(Counter)
smsDct = defaultdict(Counter)

# One [call, sms] in-project fraction pair per successfully processed user
fracLst = []
for i, user in enumerate(userLst):
    try:
        userDct = loadUser(user, dataFilter=("call", "sms"))
        commFrac = linkUser(userDct, (callDct, smsDct), phDct)
        fracLst.append(commFrac)
        # Report progress after every 50 processed users
        if not len(fracLst) % 50:
            print("{:.0f} %".format(100 * len(fracLst) / len(userLst)), end="\t")
    except Exception as e:
        # Identify the offending user before letting the error propagate
        print("\n\n\nSomething went wrong  at (user, index):\n", user, i)
        raise e

In [None]:
# Keep only the first five users' call links as a small sample for inspection
cdct = dict(list(callDct.items())[:5])

In [None]:
# Display the sampled call links
cdct

In [None]:
class Useralias(object):
    """Mapping-like helper that hands out sequential aliases on first lookup.

    Looking up an unknown key assigns it the next alias ("u0001", "u0002",
    ...) and remembers it; looking up a known key returns the stored value.
    Values can also be set explicitly through item assignment.
    """

    def __init__(self):
        super().__init__()
        self.i = 0  # number of aliases generated so far
        self.userdct = {}  # key -> alias (or explicitly assigned value)

    def __setitem__(self, key, value):
        """Store ``value`` for ``key`` without touching the alias counter."""
        self.userdct[key] = value

    def __getitem__(self, key):
        """Return the value stored for ``key``, creating a new alias if needed."""
        try:
            return self.userdct[key]
        except KeyError:
            # First time this key is seen: generate and remember the next alias
            self.i += 1
            alias = "u{:04d}".format(self.i)
            self.userdct[key] = alias
            return alias


In [None]:
# Display the sampled call links
# NOTE(review): cdct is already displayed in an earlier cell and is not
# modified in between - confirm whether this repeat is still needed.
cdct

In [None]:
def aliasUsernames(commDct, useralias):
    """Return a copy of a nested link dictionary with every user name aliased.

    Args:
        commDct (dict): Outer keys are users; each value is a dict-like
            object mapping contacted users to a value (e.g. a count).
        useralias: Object indexed with a user name to obtain its alias.

    Returns:
        dict: Plain nested dict with the same values as ``commDct`` but with
            outer and inner keys replaced by ``useralias[key]``.
    """
    aliased = {}
    for sender, contacts in commDct.items():
        # Resolve the outer alias first, then the inner ones in order, so an
        # aliaser that numbers keys on first lookup sees them in this order.
        senderAlias = useralias[sender]
        aliased[senderAlias] = {
            useralias[contact]: value for contact, value in contacts.items()
        }
    return aliased

# A single aliaser is shared so a user gets the same alias in both dictionaries;
# the call links are aliased first, then the sms links.
ua = Useralias()
callDct2, smsDct2 = (aliasUsernames(links, ua) for links in (callDct, smsDct))

In [None]:
# Rows are the communicating users, columns the contacted users; pairs without
# any communication are missing values. apply() works column-wise by default,
# so each column is divided by its own sum.
cdf = pd.DataFrame.from_dict(callDct2, orient="index").apply(lambda col: col / np.sum(col))
sdf = pd.DataFrame.from_dict(smsDct2, orient="index").apply(lambda col: col / np.sum(col))

# Stack both tables and add up the rows that belong to the same user
cmb = pd.concat((cdf, sdf)).groupby(level=0).sum()
cmb

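The data is quite sparse: most pairs of users have no recorded communication, which shows up as missing values in the matrices below.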

In [None]:
# Visualize which entries of the normalized call table are missing
msno.matrix(cdf)

In [None]:
# Visualize which entries of the normalized sms table are missing
msno.matrix(sdf)

In [None]:
# Build a graph from the (un-aliased) call links: every user is connected to
# each participant they called. Call counts are not carried over.
g = nx.from_dict_of_lists(
    {caller: list(contacts.keys()) for caller, contacts in callDct.items()}
)

In [None]:
# Draw the adjacency matrix of the call graph as an image
fig, ax = plt.subplots()
gm = nx.adjacency_matrix(g)
ax.pcolorfast(gm.todense())