In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from datetime import date
from dateutil.relativedelta import relativedelta
%matplotlib inline 

# IMPORTANT:
# Redownload the files from slack and Outlook 

##### DATA (the anonymised dataframe) 

In [2]:
#import main dataframe
data = pd.read_csv('locations.csv', low_memory=False)

In [3]:
#rename City column and drop empty columns
data.rename( columns={'Unnamed: 5':'City'}, inplace=True )
data = data.drop(data.columns[data.columns.str.contains('unnamed',case = False)],axis = 1)

In [4]:
#drop VisitorID and ContentInfo
data = data.drop(columns=['VisitorID', 'ContentInfo'])

In [5]:
#just getting rid of the space after the name
data = data.replace('STH Ayshire St Leonards ', 'STH Ayshire St Leonards')

In [6]:
#change time to datetime
data['Time'] = pd.to_datetime(data['Time'], format = "%d/%m/%Y %H:%M", utc=False)

In [7]:
#change ExternalIDs to integers
data.ExternalID = data.ExternalID.fillna(0).astype(int)

In [8]:
#drop rows that are identical
data = data.drop_duplicates(keep='first')

In [9]:
#new dataframe with no null ExternalID vaues
dataNoNullUsers = data[data.ExternalID != 0]

In [10]:
data.head()

Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType
0,2018-10-29 10:55:00,5325,User,Broom Ground,Stirling,Sam Smith,Internet
1,2018-10-29 10:50:00,5325,User,Broom Ground,Stirling,Elton John,Internet
2,2018-10-29 10:49:00,5325,User,Broom Ground,Stirling,My Music,Category
3,2018-10-29 10:49:00,5325,User,Broom Ground,Stirling,Entertainment,Category
4,2018-10-29 10:48:00,0,User,Belses Gardens - Care Home,Glasgow,BBC Formula 1,Internet


# Heatmap

In [11]:
!pip install pyecharts
!pip install --upgrade pip

Requirement already up-to-date: pip in /opt/conda/lib/python3.6/site-packages (18.1)


In [12]:
!git clone https://github.com/pyecharts/pyecharts.git
!cd pyecharts
!python setup.py install
!pip install pyecharts-snapshot

fatal: destination path 'pyecharts' already exists and is not an empty directory.
python: can't open file 'setup.py': [Errno 2] No such file or directory


In [13]:
#Clicks per date
import datetime as dt
data['Day']=data['Time'].dt.date
from collections import Counter
daycount=Counter(data.Day).most_common()
daycount=pd.DataFrame(data=daycount,columns=['date','click'])
daycount.head()

Unnamed: 0,date,click
0,2018-09-18,656
1,2018-09-20,638
2,2018-08-31,633
3,2018-05-11,616
4,2018-10-16,600


In [14]:
myJson=daycount.to_json(orient='values',date_format='iso')
import re
myJson=re.sub(r'(\d\d\d\d-\d\d-\d\d).{14}"', r'\1"',myJson)

In [15]:
import datetime
import random
from pyecharts import HeatMap

begin = datetime.date(2018, 1, 1)
end = datetime.date(2018, 12, 31)
click = eval(myJson)
heatmap = HeatMap("Click Distribution", "Number of clicks per day", width=1200)
heatmap.add(
    "",
    click,
    is_calendar_heatmap=True,
    visual_text_color="#000",
    visual_range_text=["", ""],
    visual_range=[0,700],
    calendar_cell_size=["auto", 30],
    is_visualmap=True,
    calendar_date_range="2018",
    visual_orient="horizontal",
    visual_pos="center",
    visual_top="80%",
    is_piecewise=True,
)
heatmap.render()

# Wordcloud

In [16]:
LinkName=''
for row in data.LinkTitle:
    LinkName+=row+' '
from collections import Counter
freword = Counter(LinkName.split()).most_common(60)
Word=pd.DataFrame(data=freword,columns=['Keyword','Frequency'])
Word=Word[~Word['Keyword'].isin(['My'])]
Word=Word[~Word['Keyword'].isin(['Play'])]
Word=Word[~Word['Keyword'].isin(['Player'])]
Word=Word[~Word['Keyword'].isin(['Stations'])]
Word=Word[~Word['Keyword'].isin(['The'])]
Word=Word[~Word['Keyword'].isin(['&'])]
Word=Word[~Word['Keyword'].isin(['2'])]
Word=Word[~Word['Keyword'].isin(['FC'])]
Word=Word[~Word['Keyword'].isin(['Mail'])]
Word=Word[~Word['Keyword'].isin(['Card'])]
Word.head()

Unnamed: 0,Keyword,Frequency
0,Entertainment,18024
2,Games,12767
3,Interests,9194
4,Music,8149
6,BBC,6174


In [17]:
keyword = np.array(Word.Keyword)
keyword=keyword.tolist()
frequency=np.array(Word.Frequency)
frequency=frequency.tolist()

In [18]:
from pyecharts import WordCloud
wordcloud = WordCloud(width=1200, height=600)
wordcloud.add("", keyword, frequency, word_size_range=[20, 120])
wordcloud.render()

In [19]:
#frelink=Counter(data.LinkTitle).most_common(50)
#Title=pd.DataFrame(data=frelink,columns=['Linktitle','Frequency'])
#Title.head()
#title = np.array(Title.Linktitle)
#title=title.tolist()
#frequency=np.array(Title.Frequency)
#frequency=frequency.tolist()
#wordcloud = WordCloud(width=1300, height=620)
#wordcloud.add("", title, frequency, word_size_range=[10, 100])
#wordcloud.render()

# Geomap

In [20]:
city = Counter(data.City).most_common()
city=pd.DataFrame(data=city,columns=['city','users'])
city=city[~city['city'].isin(['test user'])]
city=city[~city['city'].isin(['nan'])]
city=city[~city['city'].isin(['Test user ignore'])]
city=city.dropna(subset=['city'])
city=city.replace(['Dundee (Tayside)','Arbroath (Tayside)'],['Dundee','Arbroath'])
city.to_json(orient='values')

'[["Stirling",29138],["Edinburgh",28420],["Ayr",19161],["Dundee",16866],["Glasgow",14914],["Aberdeen",11973],["Arbroath",73],["Dundee",30]]'

In [21]:
!pip install echarts-cities-pypkg



In [22]:
! pip install echarts-countries-pypkg
! pip install echarts-united-kingdom-pypkg



In [30]:
from pyecharts import Map, Geo
city = [("Stirling",29138),("Edinburgh",28420),("Ayr",19161),("Dundee",16866),("Glasgow",14914),("Aberdeen",11973),("Arbroath",73),("Dundee",30)]
geo = Geo(
    "User Distribution",
    "",
    title_color="#000",
    title_pos="center",
    width=1200,
    height=600,
    #background_color="#404a59",
)
attr, value = geo.cast(city)
geo.add(
    "",
    attr,
    value,
    type="effectScatter",
    is_random=True, 
    effect_scale=5,
    maptype='英国',
    coordinate_region='GB',
    visual_range=[0, 30000],
    visual_text_color="#fff",
    symbol_size=15,
    is_visualmap=True,
)
geo.render()

# Page

In [31]:
from pyecharts import Page
page = Page()

# geo
page.add(geo)

# heat
page.add(heatmap) 

# word
page.add(wordcloud)

page.render() 

##### USERS (the user dataframe) 

In [None]:
#import users dataframe
users = pd.read_csv('PrimaryConditions age sex by customer reference.csv', low_memory=False)

In [None]:
#drop empty column and CleverCogsUserId
users = users.drop(users.columns[users.columns.str.contains('unnamed',case = False)],axis = 1)
users = users.drop(columns=['CleverCogsUserId'])

In [None]:
#delete empty row
users = users.dropna(axis=0, how='all', inplace=False)

In [None]:
#change ExternalID into integers
users['ExternalID'] = users['ExternalID'].apply(np.int64)
users.shape
users.head()

In [None]:
#drop duplicates
users = users.drop_duplicates(subset = ['ExternalID'], keep='last')
users.shape

In [None]:
#change BirthDate to date time
#users['BirthDate'] = pd.to_datetime(users['BirthDate'], format = "%m/%d/%y", dayfirst=False, utc=True)

users['BirthDate'] = pd.to_datetime(users['BirthDate'], errors='coerce', dayfirst=False, yearfirst=False, 
                   format="%d/%m/%y", utc=False, infer_datetime_format=False, origin='unix')
users = users.set_index(pd.DatetimeIndex(users['BirthDate']))

#Remove erroneous entries
threshold = pd.to_datetime("2010-01-01", format="%Y-%m-%d", utc=False, origin='unix')
users = users[users['BirthDate'] < threshold]

users.head()

In [None]:
now = pd.to_datetime(date.today())

def getYears(start):
    diff = relativedelta(now, start)
    return diff.years

# Add a new column that contains the age of the user
users["Age"] = users['BirthDate'].map(getYears)

In [None]:
users.head()

In [None]:
users=users.dropna(subset=['Condition'])
con = Counter(users.Condition).most_common()
condition=pd.DataFrame(data=con,columns=['Condition','Number'])
print(condition)

In [None]:
from pyecharts import Pie

attr = ["Stroke", "Learning Difficulties", "Brain Injury", "Spina Bifida", "Cerebral Palsy", "Spinal Injury","Epilepsy","Multiple Sclerosis","Physical Injury"]
v1 = [3,2,2,2,2,1,1,1,1]
pie = Pie("Condition Distribution", title_pos='center', width=900)
pie.add("", attr, v1, is_label_show=True,radius=[40, 75],legend_orient="vertical", legend_pos="left",)
pie.render()