In [22]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from datetime import date
from dateutil.relativedelta import relativedelta
%matplotlib inline 

# IMPORTANT:
# Redownload the files from slack and Outlook 

##### DATA (the anonymised dataframe) 

In [23]:
#import main dataframe
data = pd.read_csv('locations.csv', low_memory=False)

In [24]:
#rename City column and drop empty columns
data.rename( columns={'Unnamed: 5':'City'}, inplace=True )
data = data.drop(data.columns[data.columns.str.contains('unnamed',case = False)],axis = 1)

In [25]:
#drop VisitorID and ContentInfo
data = data.drop(columns=['VisitorID', 'ContentInfo'])

In [26]:
#just getting rid of the space after the name
data = data.replace('STH Ayshire St Leonards ', 'STH Ayshire St Leonards')
data.head()

Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType
0,10/29/18 10:55,5325.0,User,Broom Ground,Stirling,Sam Smith,Internet
1,10/29/18 10:50,5325.0,User,Broom Ground,Stirling,Elton John,Internet
2,10/29/18 10:49,5325.0,User,Broom Ground,Stirling,My Music,Category
3,10/29/18 10:49,5325.0,User,Broom Ground,Stirling,Entertainment,Category
4,10/29/18 10:48,,User,Belses Gardens - Care Home,Glasgow,BBC Formula 1,Internet


In [27]:
#change time to datetime
data['Time'] = pd.to_datetime(data['Time'], format = "%m/%d/%y %H:%M", utc=False)
data.head()

Unnamed: 0,Time,ExternalID,Role,Building,City,LinkTitle,LinkType
0,2018-10-29 10:55:00,5325.0,User,Broom Ground,Stirling,Sam Smith,Internet
1,2018-10-29 10:50:00,5325.0,User,Broom Ground,Stirling,Elton John,Internet
2,2018-10-29 10:49:00,5325.0,User,Broom Ground,Stirling,My Music,Category
3,2018-10-29 10:49:00,5325.0,User,Broom Ground,Stirling,Entertainment,Category
4,2018-10-29 10:48:00,,User,Belses Gardens - Care Home,Glasgow,BBC Formula 1,Internet


In [28]:
#change ExternalIDs to integers
data.ExternalID = data.ExternalID.fillna(0).astype(int)

In [29]:
#drop rows that are identical
data = data.drop_duplicates(keep='first')

In [30]:
#new dataframe with no null ExternalID vaues
dataNoNullUsers = data[data.ExternalID != 0]

##### USERS (the user dataframe) 

In [31]:
#import users dataframe
users = pd.read_csv('PrimaryConditions age sex by customer reference.csv', low_memory=False)

In [32]:
#drop empty column and CleverCogsUserId
users = users.drop(users.columns[users.columns.str.contains('unnamed',case = False)],axis = 1)
users = users.drop(columns=['CleverCogsUserId'])

In [33]:
#delete empty row
users = users.dropna(axis=0, how='all', inplace=False)

In [34]:
#change ExternalID into integers
users['ExternalID'] = users['ExternalID'].apply(np.int64)
users.shape
users.head()

Unnamed: 0,ExternalID,BirthDate,Gender,Condition
0,4703,7/4/65,F,Spina Bifida
1,4704,6/21/40,F,COPD
2,4706,2/25/31,F,Dementia
3,4707,9/18/82,F,Brain Injury
4,4708,12/20/31,F,


In [35]:
#drop duplicates
users = users.drop_duplicates(subset = ['ExternalID'], keep='last')
users.shape

(680, 4)

In [36]:
#change BirthDate to date time
#users['BirthDate'] = pd.to_datetime(users['BirthDate'], format = "%m/%d/%y", dayfirst=False, utc=True)

users['BirthDate'] = pd.to_datetime(users['BirthDate'], errors='coerce', dayfirst=False, yearfirst=False, 
                   format="%d/%m/%y", utc=False, infer_datetime_format=False, origin='unix')
users = users.set_index(pd.DatetimeIndex(users['BirthDate']))

#Remove erroneous entries
threshold = pd.to_datetime("2010-01-01", format="%Y-%m-%d", utc=False, origin='unix')
users = users[users['BirthDate'] < threshold]

users.head()

Unnamed: 0_level_0,ExternalID,BirthDate,Gender,Condition
BirthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-01-05,5089,1980-01-05,M,
1983-05-12,5096,1983-05-12,F,
1994-03-03,5105,1994-03-03,M,
1969-11-03,5188,1969-11-03,M,Learning Difficulties
1994-08-04,5213,1994-08-04,M,


In [37]:
now = pd.to_datetime(date.today())

def getYears(start):
    diff = relativedelta(now, start)
    return diff.years

# Add a new column that contains the age of the user
users["Age"] = users['BirthDate'].map(getYears)

In [38]:
users.head()

Unnamed: 0_level_0,ExternalID,BirthDate,Gender,Condition,Age
BirthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-01-05,5089,1980-01-05,M,,38
1983-05-12,5096,1983-05-12,F,,35
1994-03-03,5105,1994-03-03,M,,24
1969-11-03,5188,1969-11-03,M,Learning Difficulties,49
1994-08-04,5213,1994-08-04,M,,24
