In [351]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from datetime import date
from dateutil.relativedelta import relativedelta
import json
import time
%matplotlib inline 

# IMPORTANT:
# Redownload the files from slack and Outlook 

##### DATA (the anonymised dataframe) 

In [352]:
#import main dataframe
data = pd.read_csv('locations.csv', low_memory=False)

In [353]:
#rename City column and drop empty columns
data.rename( columns={'Unnamed: 5':'City'}, inplace=True )
data = data.drop(data.columns[data.columns.str.contains('unnamed',case = False)],axis = 1)

In [354]:
#drop VisitorID and ContentInfo
data = data.drop(columns=['VisitorID', 'ContentInfo'])

In [355]:
#just getting rid of the space after the name
data = data.replace('STH Ayshire St Leonards ', 'STH Ayshire St Leonards')
data.head()

Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType
0,29/10/2018 10:55,5325.0,User,Broom Ground,Stirling,Sam Smith,Internet
1,29/10/2018 10:50,5325.0,User,Broom Ground,Stirling,Elton John,Internet
2,29/10/2018 10:49,5325.0,User,Broom Ground,Stirling,My Music,Category
3,29/10/2018 10:49,5325.0,User,Broom Ground,Stirling,Entertainment,Category
4,29/10/2018 10:48,,User,Belses Gardens - Care Home,Glasgow,BBC Formula 1,Internet


In [356]:
#change time to datetime
data['Time'] = pd.to_datetime(data['Time'], format = "%d/%m/%Y %H:%M", utc=False)

In [357]:
#change ExternalIDs to integers
data.ExternalID = data.ExternalID.fillna(0).astype(int)

In [358]:
#drop rows that are identical
data = data.drop_duplicates(keep='first')

In [359]:
#new dataframe with no null ExternalID vaues
data = data[data.ExternalID != 0]

In [360]:
data = data[(data.City != 'Test user ignore') & (data.City != 'test user')]

##### USERS (the user dataframe) 

In [361]:
#import users dataframe
users = pd.read_csv('PrimaryConditions age sex by customer reference.csv', low_memory=False)

In [362]:
users.shape

(695, 5)

In [363]:
#drop empty column and CleverCogsUserId
users = users.drop(users.columns[users.columns.str.contains('unnamed',case = False)],axis = 1)
users = users.drop(columns=['CleverCogsUserId'])

In [364]:
#delete empty row
users = users.dropna(axis=0, how='all', inplace=False)

In [365]:
#change ExternalID into integers
users['ExternalID'] = users['ExternalID'].apply(np.int64)
users.head()

Unnamed: 0,ExternalID,BirthDate,Gender,Condition
0,4703,04/07/1965,F,Spina Bifida
1,4704,21/06/1940,F,COPD
2,4706,25/02/1931,F,Dementia
3,4707,18/09/1982,F,Brain Injury
4,4708,20/12/1931,F,


In [366]:
#drop duplicates
users = users.drop_duplicates(subset = ['ExternalID'], keep='last')
users.shape

(680, 4)

In [367]:
#change BirthDate to date time
#users['BirthDate'] = pd.to_datetime(users['BirthDate'], format = "%m/%d/%y", dayfirst=False, utc=True)

users['BirthDate'] = pd.to_datetime(users['BirthDate'], errors='coerce', dayfirst=False, yearfirst=False, 
                   format="%d/%m/%Y", utc=False, infer_datetime_format=False, origin='unix')
users = users.set_index(pd.DatetimeIndex(users['BirthDate']))

#Remove erroneous entries
threshold = pd.to_datetime("2010-01-01", format="%Y-%m-%d", utc=False, origin='unix')
users = users[users['BirthDate'] < threshold]

users.head()

Unnamed: 0_level_0,ExternalID,BirthDate,Gender,Condition
BirthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1965-07-04,4703,1965-07-04,F,Spina Bifida
1940-06-21,4704,1940-06-21,F,COPD
1931-02-25,4706,1931-02-25,F,Dementia
1982-09-18,4707,1982-09-18,F,Brain Injury
1931-12-20,4708,1931-12-20,F,


In [368]:
now = pd.to_datetime(date.today())

def getYears(start):
    diff = relativedelta(now, start)
    return diff.years

# Add a new column that contains the age of the user
users["Age"] = users['BirthDate'].map(getYears)

In [369]:
users.head()
users.shape

(672, 5)

# Generate Network

This file will generate a JSON file that includes all LinkTitles as nodes, coloured by clusters of LinkTypes. 

```Node: id, label, numberOfTraffic, timeSpent```

The edges will represent each line of traffic. 

```Edge: id, label, userId, age bucket, startNodeId, endNodeId, date, time, prevDate, prevTime```


In [370]:
# Merge data and user on external id
data.ExternalID = (data.ExternalID.fillna(0)).astype(dtype='int64')
dataUserInner = pd.merge(data, users, on ="ExternalID", how="inner")


# create age buckets

def getAgeBucket(age):
    start = int(age/10)*10
    if age < start+5:
#         return (str(start) + '-' + str(start+5))
        return start
    else:
#         return (str(start+5) + '-' + str(start+10))
        return start+5


dataUserInner["AgeRange"] = dataUserInner['Age'].map(getAgeBucket)

# dataZero = dataUserInner[dataUserInner["ExternalID"] == 0]
# dataZero.head()

dataUserInner.head()


Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType,BirthDate,Gender,Condition,Age,AgeRange
0,2018-10-29 10:41:00,5541,User,STH Ayshire St Leonards,Ayr,YouTube,Internet,1954-02-18,M,Cerebral Palsy,64,60
1,2018-10-29 10:39:00,5541,User,STH Ayshire St Leonards,Ayr,YouTube,Internet,1954-02-18,M,Cerebral Palsy,64,60
2,2018-10-29 10:38:00,5541,User,STH Ayshire St Leonards,Ayr,YouTube,Internet,1954-02-18,M,Cerebral Palsy,64,60
3,2018-10-29 10:33:00,5541,User,STH Ayshire St Leonards,Ayr,YouTube,Internet,1954-02-18,M,Cerebral Palsy,64,60
4,2018-10-29 10:00:00,5541,User,STH Ayshire St Leonards,Ayr,YouTube,Internet,1954-02-18,M,Cerebral Palsy,64,60


In [371]:
# Sort by user to  allow linear processing
dataUserInner.sort_values(['ExternalID', 'Time'], ascending=[True, True], inplace=True)

In [372]:
dataUserInner.head()

Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType,BirthDate,Gender,Condition,Age,AgeRange
71133,2018-01-13 09:45:00,4703,User,Charleston,Dundee (Tayside),Keys To Life,Internet,1965-07-04,F,Spina Bifida,53,50
71134,2018-01-13 09:45:00,4703,User,Charleston,Dundee (Tayside),Health & Wellbeing,Category,1965-07-04,F,Spina Bifida,53,50
71135,2018-01-13 09:45:00,4703,User,Charleston,Dundee (Tayside),Information,Category,1965-07-04,F,Spina Bifida,53,50
71129,2018-01-20 20:29:00,4703,User,Charleston,Dundee (Tayside),Sudoku,Internet,1965-07-04,F,Spina Bifida,53,50
71130,2018-01-20 20:29:00,4703,User,Charleston,Dundee (Tayside),Single Player Games,Category,1965-07-04,F,Spina Bifida,53,50


In [380]:
# Iterate through file and process nodes and edges. Assign each distinct user a new id.

# Node: id, label, traffic, timeSpent, start, type
# Edge: id, label, userId, age bucket, startNodeId, endNodeId, timeDiff, pathId, end


# calculate timespent by adding the timediff to a time diff array at the node. Can then be used to 
# calculate averages or sum
links = {}

nodes = {}
edges = {}

threshold = 2 #hours
currentNodeId = 0
currentEdgeId = 0
currentUserId = 0
currentPathId = 0

prevUser = 0
prevTime = 0


node = {}
edge = {}
for index, row in dataUserInner.iterrows():
    # check if the same user
    if prevTime == 0: 
        prevTime = row.Time
    else: 
        timeDiff = row.Time - prevTime
        #print(timeDiff.total_seconds())
        
        
    if row.ExternalID != prevUser:
        if edge: 
            edge["end"] = True
            currentPathId+=1
        prevUser = row.ExternalID
        currentUserId += 1
    
    # check if time threshold
    elif timeDiff.total_seconds() > (2 * 60 * 60): 
        # start new path
        # check if previous edge needs finishing
        if edge: 
            edge["end"] = True
            currentPathId+=1
    

    # set up new start node and edge
    # check if this link already in node
    nodeId = 0
    nodeTraffic = 0
    if row.LinkTitle in links: 
        # this node already exists and if so update it with the traffic
        nodeId, nodeTraffic = links[row.LinkTitle]
        links[row.LinkTitle] = (nodeId, nodeTraffic+1)
        nodes[nodeId]["traffic"] = nodeTraffic+1
    else: 
        #create new node
        nodeId = currentNodeId
        currentNodeId+=1
        node = {"id":nodeId, "label": row.LinkTitle, "traffic":1, "type": row.LinkType, "timeDiffs": []}
        links[row.LinkTitle] = (nodeId, 1)
        nodes[nodeId] = node
    
    if edge: 
        edge["timeDiff"] = timeDiff.total_seconds()
        edge["endNode"] = nodeId
        edges[currentEdgeId] = edge
        nodes[edge["startNode"]]["timeDiffs"].append(timeDiff.total_seconds())
        currentEdgeId += 1
        edge = {}

    #create edge
    prevTime = row.Time
    edge = {"id": currentEdgeId, "userId":currentUserId, "age": row.AgeRange, 
             "startNode": nodeId, "pathId": currentPathId}
    node = {}


In [381]:
nodesList = []
edgesList = []

for key, value in nodes.items():
    nodesList.append(value)

for key, value in edges.items(): 
    edgesList.append(value)

In [382]:
# difference in seconds (b-a).total_seconds()

# for i in range(0,10):
#     print(nodes[i])
    

# for i in range(0,30):
#     print(edges[i])

#print(edges[500])
    
print("node: "),
print(nodesList[0])

print("edges: "),
print(edgesList[0])
    
print(len(nodes))
print(len(edges))

node: 
{'id': 0, 'label': 'Keys To Life', 'traffic': 22, 'type': 'Internet', 'timeDiffs': [0.0, 60.0, 960.0, 0.0, 60.0, 604740.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 420.0, 60.0, 480.0, 120.0]}
edges: 
{'id': 0, 'userId': 1, 'age': 50, 'startNode': 0, 'pathId': 0, 'timeDiff': 0.0, 'endNode': 1}
478
71731


In [383]:
edgesJson = json.dumps(edgesList)
nodesJson = json.dumps(nodesList)
graph = {"links": edges, "nodes": nodes}

In [384]:
with open('nodes.json', 'w') as outfile:
    json.dump(nodesJson, outfile)
    

In [385]:
with open('edges.json', 'w') as outfile:
    json.dump(edgesJson, outfile)

In [386]:
with open('graph.json', 'w') as outfile:
    json.dump(graph, outfile)