In [24]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
from datetime import date
from dateutil.relativedelta import relativedelta
%matplotlib inline 

# IMPORTANT:
# Re-download the files from Slack and Outlook

##### DATA (the anonymised dataframe) 

In [25]:
# Load the main dataframe from the locations CSV.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
data = pd.read_csv(
    filepath_or_buffer='locations.csv',
    low_memory=False,
)

In [26]:
# Give the 'Unnamed: 5' column its real name ("City"), then discard every
# remaining column whose label contains "unnamed" (case-insensitive).
# The rename must happen first, otherwise City would be dropped with the rest.
# Reassigning instead of using inplace=True avoids mutating the frame object
# produced by the loading cell and keeps the step chainable.
data = data.rename(columns={'Unnamed: 5': 'City'})
unnamed_columns = data.columns[data.columns.str.contains('unnamed', case=False)]
data = data.drop(columns=unnamed_columns)

In [27]:
# Remove the VisitorID and ContentInfo columns.
data = data.drop(['VisitorID', 'ContentInfo'], axis='columns')

In [29]:
# Normalise one name that appears with a trailing space: any cell in the frame
# equal to the padded spelling is replaced by the unpadded one.
data = data.replace(
    to_replace='STH Ayshire St Leonards ',
    value='STH Ayshire St Leonards',
)

In [30]:
# Parse the Time column ("day/month/year hour:minute" strings) into
# timezone-naive datetimes.
time_format = "%d/%m/%Y %H:%M"
parsed_time = pd.to_datetime(data['Time'], format=time_format, utc=False)
data['Time'] = parsed_time

In [31]:
# Cast ExternalID to integers; missing IDs are first filled with 0 so the
# cast does not fail on NaN.
external_ids = data['ExternalID'].fillna(0)
data['ExternalID'] = external_ids.astype(int)

In [32]:
# Remove fully identical rows; the default keeps the first occurrence of each.
data = data.drop_duplicates()

In [33]:
# New dataframe without the rows whose ExternalID is 0.
# NOTE(review): 0 looks like the placeholder written by fillna(0) when the IDs
# are cast to integers — confirm no real user has ID 0.
has_external_id = data['ExternalID'].ne(0)
dataNoNullUsers = data[has_external_id]

##### USERS (the user dataframe) 

In [34]:
# Load the users dataframe from its CSV export.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
users = pd.read_csv(
    filepath_or_buffer='PrimaryConditions age sex by customer reference.csv',
    low_memory=False,
)

In [35]:
# Discard every column whose label contains "unnamed" (case-insensitive),
# then the CleverCogsUserId column.
unnamed_cols = users.columns[users.columns.str.contains('unnamed', case=False)]
users = users.drop(unnamed_cols, axis=1).drop(columns=['CleverCogsUserId'])

In [36]:
# Remove rows in which every field is missing (row-wise is the default axis).
users = users.dropna(how='all')

In [37]:
# Cast ExternalID to 64-bit integers.
# astype converts the whole column in one vectorised step instead of calling
# np.int64 once per row. It still raises if a missing value is present.
users['ExternalID'] = users['ExternalID'].astype(np.int64)

# Preview the first rows. The earlier bare `users.shape` was never displayed,
# because only a cell's last expression is rendered, so it has been removed.
users.head()

Unnamed: 0,ExternalID,BirthDate,Gender,Condition
0,4703,04/07/1965,F,Spina Bifida
1,4704,21/06/1940,F,COPD
2,4706,25/02/1931,F,Dementia
3,4707,18/09/1982,F,Brain Injury
4,4708,20/12/1931,F,


In [38]:
# Keep a single row per ExternalID; when an ID repeats, the last row wins.
users = users.drop_duplicates('ExternalID', keep='last')
users.shape

(680, 4)

In [40]:
# Parse BirthDate from day-first "dd/mm/YYYY" strings into timezone-naive
# datetimes. Values that do not match the format become NaT (errors='coerce')
# instead of raising.
# The explicit `format` alone drives the parsing, so the redundant
# dayfirst / yearfirst / utc / origin arguments are omitted, as is
# `infer_datetime_format`, which is deprecated and warns on pandas >= 2.0.
users['BirthDate'] = pd.to_datetime(users['BirthDate'], format="%d/%m/%Y", errors='coerce')

# Index the frame by birth date while keeping BirthDate as a regular column.
# NOTE(review): the index and the column now share the name 'BirthDate', which
# pandas reports as ambiguous in label-based calls such as groupby or
# sort_values — confirm later cells actually need this index.
users = users.set_index(pd.DatetimeIndex(users['BirthDate']))

# Remove erroneous entries: keep only birth dates before 2010-01-01.
# Rows whose date failed to parse (NaT) compare as False and are removed too.
threshold = pd.Timestamp("2010-01-01")
users = users[users['BirthDate'] < threshold]

users.head()

Unnamed: 0_level_0,ExternalID,BirthDate,Gender,Condition
BirthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1965-07-04,4703,1965-07-04,F,Spina Bifida
1940-06-21,4704,1940-06-21,F,COPD
1931-02-25,4706,1931-02-25,F,Dementia
1982-09-18,4707,1982-09-18,F,Brain Injury
1931-12-20,4708,1931-12-20,F,


In [41]:
# Today's date as a timezone-naive pandas Timestamp at midnight.
now = pd.Timestamp(date.today())

def getYears(start):
    """Return the number of whole years elapsed between `start` and `now`.

    `now` is the module-level reference timestamp; the result is the `years`
    component of the calendar difference computed by `relativedelta`.
    """
    return relativedelta(now, start).years

# Derive each user's age in whole years from their birth date and store it
# in a new "Age" column.
birth_dates = users['BirthDate']
users["Age"] = birth_dates.apply(getYears)

In [42]:
# Preview the first five rows, now including the Age column.
users.head(n=5)

Unnamed: 0_level_0,ExternalID,BirthDate,Gender,Condition,Age
BirthDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1965-07-04,4703,1965-07-04,F,Spina Bifida,53
1940-06-21,4704,1940-06-21,F,COPD,78
1931-02-25,4706,1931-02-25,F,Dementia,87
1982-09-18,4707,1982-09-18,F,Brain Injury,36
1931-12-20,4708,1931-12-20,F,,86
