In [None]:
import pandas as pd

In [None]:
# Toy dataset: one record per person, written row-wise so each tuple reads as
# (first_name, last_name, location, med_timestamp, distance_walked, distance_units).
data = pd.DataFrame(
    [
        ("Diana", "Milbank-Stapleton", 'NYC', 1495397700, 1.4, 'mi'),
        ("La'Toya", "O'Shaughnessy", 'NYC', 1495721700, 3.7, 'mi'),
        ("Anne Marie", "Brown", 'Chicago', 1496341020, 11.3, 'km'),
        ("Billy-Jean", "Queen", 'Chicago', 1497286980, 5.1, 'mi'),
    ],
    columns=[
        'first_name',
        'last_name',
        'location',
        'med_timestamp',
        'distance_walked',
        'distance_units',
    ],
)

## Remove case and non-alphanumeric character dependencies

In [None]:
# Upper-case both name columns so later comparisons do not depend on letter case.
# The vectorised `.str` accessor propagates missing values (NaN/None) instead of
# raising AttributeError the way a per-element `x.upper()` lambda does.
data['first_name'] = data['first_name'].str.upper()
data['last_name'] = data['last_name'].str.upper()

In [None]:
# instead of making an exhaustive list of characters to remove, require alphanumeric
def alphanum(s):
    """Return `s` with every character that fails `str.isalnum()` removed.

    Character order is preserved; an empty string yields an empty string.
    """
    kept_chars = filter(str.isalnum, s)
    return ''.join(kept_chars)

In [None]:
# Normalise both name columns: upper-case each value, then keep only its
# alphanumeric characters.
data['first_name'] = data['first_name'].map(lambda name: alphanum(name.upper()))
data['last_name'] = data['last_name'].map(lambda name: alphanum(name.upper()))

In [None]:
# Preview the first rows to inspect the cleaned-up name columns.
data.head()

## Convert timestamps to local time

In [None]:
# Lookup from the labels used in the `location` column to the time-zone name
# passed to `tz_convert` when localising timestamps.
location_tzs = dict(
    Chicago='America/Chicago',
    NYC='America/New_York',
)

In [None]:
# Re-display the frame: `med_timestamp` and `location` are the inputs to the
# time-zone conversion that follows.
data.head()

In [None]:
import datetime
def process_row(r, tz_by_location=None):
    """Convert a row's epoch timestamp to a time in the row's own time zone.

    Args:
        r: a row (Series or mapping) providing 'med_timestamp', read as seconds
            since the Unix epoch, and 'location', a key of the lookup table.
        tz_by_location: optional mapping from location label to time-zone name.
            Defaults to the notebook-level `location_tzs`.

    Returns:
        A tz-aware `pd.Timestamp` for the same instant, expressed in the
        location's time zone.

    Raises:
        KeyError: if the row's location is missing from the lookup table.
    """
    if tz_by_location is None:
        tz_by_location = location_tzs
    # Build a UTC-aware timestamp directly from the epoch seconds. This avoids
    # `datetime.utcfromtimestamp`, which is deprecated and returns a naive value
    # that then has to be localised by hand.
    utc_time = pd.Timestamp(r['med_timestamp'], unit='s', tz='UTC')
    return utc_time.tz_convert(tz_by_location[r['location']])

In [None]:
# Run the converter once per row (axis='columns' hands each row to the function)
# and store the localised timestamps in a new column.
data['actual_timestamp'] = data.apply(process_row, axis='columns')

In [None]:
# Inspect the newly added `actual_timestamp` column.
data.head()

## Unit conversions

In [None]:
# say you want all distances in miles, but you have some holdouts
# again we'll process the entire row

In [None]:
# better to have this as a constant somewhere your
# entire code base can share it
CONVERSION_FACTOR_KM_TO_MI = .6213
# NOTE: this definition reuses the name `process_row`, so it replaces the
# timestamp converter defined earlier in the notebook; re-run that earlier
# cell before applying the timestamp conversion again.
def process_row(r):
    """Return the row's walked distance expressed in miles.

    Args:
        r: a row (Series or mapping) providing 'distance_walked' (a number) and
            'distance_units' (a string label, matched case-insensitively and
            ignoring surrounding whitespace).

    Returns:
        The distance unchanged when the unit is 'mi', or multiplied by
        CONVERSION_FACTOR_KM_TO_MI when the unit is 'km'.

    Raises:
        ValueError: if the unit label is neither 'mi' nor 'km'. Passing such a
            value through unchanged would silently label it as miles.
    """
    unit_label = r['distance_units'].strip().lower()
    value = r['distance_walked']
    if unit_label == 'mi':
        return value
    if unit_label == 'km':
        return value * CONVERSION_FACTOR_KM_TO_MI
    raise ValueError(
        "unrecognized distance unit: {!r}".format(r['distance_units'])
    )
# Apply the unit converter to every row and keep the result in its own column,
# leaving the original distance columns untouched.
data['miles_walked'] = data.apply(process_row, axis='columns')

In [None]:
# Inspect the newly added `miles_walked` column.
data.head()

## Look at your data

In [None]:
# think about how you got your data and 
# what mistakes people are likely to make

In [None]:
# hint: bimodal plots are a good tip-off,
# as are values that are outliers by an order of magnitude (too small or too large)

In [None]:
# also, look again at your user interface for places where mistakes are likely
# talk to support folks or clinicians and ask what mistakes they notice