In [1]:
import gzip
import os
import numpy

In [2]:
def checkRange(workout, key, attribute, low_bound, high_bound):
    """Check whether one statistic of a workout series lies in [low_bound, high_bound].

    Every decision is appended to the log file ``./RESULT_<sport>``, where
    ``<sport>`` is ``workout['sport']``.

    Parameters
    ----------
    workout : dict
        One workout record. It must contain a ``'sport'`` entry, which names
        the log file; ``workout[key]`` is handed to numpy reductions.
    key : str
        Name of the series to check (for example ``'speed'``).
    attribute : str
        Statistic to test: ``'avg'``, ``'min'`` or ``'max'``.
    low_bound, high_bound : number
        Inclusive bounds for the statistic.

    Returns
    -------
    bool
        ``True`` when the record should be kept (statistic in range, or
        ``key`` missing from the record). ``False`` when it should be dropped
        (statistic out of range, empty series, or ``workout`` is not a dict).

    Raises
    ------
    KeyError
        If ``attribute`` is not a supported statistic name, or the record has
        no ``'sport'`` entry.
    """
    # Validate the type before touching the record: the log file name is
    # derived from workout['sport'], so a non-dict cannot be logged at all
    # and is simply rejected.
    if not isinstance(workout, dict):
        return False

    # The context manager guarantees the log is closed even when a numpy
    # reduction or the attribute lookup below raises.
    with open('./RESULT_' + workout['sport'], 'ab') as fout:
        # If the key is not in the dictionary, keep the data
        if key not in workout:
            fout.write(bytes('=========Key ' + key + ' not found=========\n', 'ascii'))
            return True

        # Only the requested statistic is computed; an unknown name raises KeyError.
        reducers = {'avg': numpy.mean, 'min': numpy.amin, 'max': numpy.amax}
        reduce_fn = reducers[attribute]
        values = workout[key]

        # numpy.amin / numpy.amax raise on empty input, so an empty series is
        # treated as failing the range check instead of aborting the caller.
        if numpy.size(values) == 0:
            fout.write(bytes('=========Key ' + key + ' is empty=========\n', 'ascii'))
            return False

        # The maximum is always logged, whichever statistic is being tested.
        fout.write(bytes('The max {} is {}\n'.format(key, numpy.amax(values)), 'ascii'))

        # Convert numpy.bool_ to a plain bool for callers.
        result = bool(low_bound <= reduce_fn(values) <= high_bound)

        if result:
            message = '{} {} is in range for [{}, {}]\n'
        else:
            message = '{} {} is not range for [{}, {}]. ABORT!!!!!!!!\n'
        fout.write(bytes(message.format(attribute, key, low_bound, high_bound), 'ascii'))

    return result


In [3]:
def cleanData(thresholds):
    """Filter each sport's workout file down to the records within the given ranges.

    For every sport in ``thresholds`` the lines of ``../data/<sport>.json.gz``
    are read, each line is evaluated into a record, and records that pass
    every ``checkRange`` test are appended to
    ``../cleaning/<sport>_cleaned.json.gz``, one record per line.

    Parameters
    ----------
    thresholds : dict
        Maps sport name -> ``{'<key>.<attribute>': [low, high]}``, for example
        ``{'run': {'speed.max': [4, 60]}}``. Entries whose name does not split
        into exactly two parts on ``'.'`` are reported and skipped.

    Returns
    -------
    None

    Notes
    -----
    The output file is opened in append mode, so running this twice for the
    same sport appends the surviving records a second time.
    """
    # Validity check for type of parameter
    if not isinstance(thresholds, dict):
        print("Thresholds passed in is not a dictionary--Abort")
        return

    for sport, entry in thresholds.items():
        print('Cleanning ' + sport)

        # Collect the (series, statistic, low, high) checks requested for this sport.
        checks = []
        for name, bounds in entry.items():
            parts = name.split('.')

            # Names must look like '<series>.<statistic>'.
            if len(parts) != 2:
                print('The format of input key is invalid')
                continue

            series, attribute = parts
            low, high = bounds[0], bounds[1]
            checks.append((series, attribute, low, high))

            print('For sport {}, the range of {} {} is [{}, {}]'.format(sport, attribute, series, low, high))

        source_path = '../data/' + sport + '.json.gz'
        target_path = '../cleaning/' + sport + '_cleaned.json.gz'

        # Context managers close both archives even if a record fails to parse.
        with gzip.open(source_path, 'r') as zin, gzip.open(target_path, 'ab') as zout:
            for raw in zin:
                text = raw.decode('ascii').strip()

                # Blank lines carry no record and would make eval raise.
                if not text:
                    continue

                # SECURITY NOTE(review): eval executes whatever expression the
                # data file contains. Only run this on trusted files; consider
                # ast.literal_eval if every record is a plain Python literal.
                record = eval(text)

                # all() stops at the first failing check, like an early break.
                if all(checkRange(record, series, attribute, low, high)
                       for series, attribute, low, high in checks):
                    # The line's own newline was stripped above, so exactly one
                    # terminator is written and no blank lines end up between records.
                    zout.write(bytes(text + '\n', 'ascii'))

In [4]:
# Acceptance ranges per sport. Each inner key is '<series>.<statistic>' and
# maps to a [low, high] pair that cleanData hands on to checkRange.
# 'indoor cycling' only constrains the average heart rate, not its max/min.
threshold = {
    'bike (transport)': {
        'speed.max': [4, 76], 'speed.min': [0, 11], 'speed.avg': [10.6, 35.3],
        'heart_rate.max': [73, 234], 'heart_rate.min': [19, 139], 'heart_rate.avg': [63, 187.8],
    },
    'bike': {
        'speed.max': [4, 88], 'speed.min': [0, 17], 'speed.avg': [7.85, 44.59],
        'heart_rate.max': [92, 230], 'heart_rate.min': [31, 139], 'heart_rate.avg': [69.29, 194.89],
    },
    'core stability training': {
        'speed.max': [0, 150], 'speed.min': [0, 30], 'speed.avg': [0, 20],
        'heart_rate.max': [66, 238], 'heart_rate.min': [40, 197], 'heart_rate.avg': [49.4, 210.25],
    },
    'indoor cycling': {
        'speed.max': [0, 93], 'speed.min': [0, 37], 'speed.avg': [0, 59],
        'heart_rate.avg': [88.5, 178.55],
    },
    'mountain bike': {
        'speed.max': [6, 82], 'speed.min': [0, 10], 'speed.avg': [6.5, 33.7],
        'heart_rate.max': [108, 222], 'heart_rate.min': [31, 137], 'heart_rate.avg': [78.7, 190.24],
    },
    'orienteering': {
        'speed.max': [5, 38], 'speed.min': [0, 6], 'speed.avg': [4.29, 18.1],
        'heart_rate.max': [117, 225], 'heart_rate.min': [40, 147], 'heart_rate.avg': [97, 199],
    },
    'run': {
        'speed.max': [4, 60], 'speed.min': [0, 12], 'speed.avg': [3.56, 19.24],
        'heart_rate.max': [104, 230], 'heart_rate.min': [40, 153], 'heart_rate.avg': [92.8, 200.5],
    },
    'walk': {
        'speed.max': [2, 40], 'speed.min': [0, 8], 'speed.avg': [2.58, 9.5],
        'heart_rate.max': [117, 225], 'heart_rate.min': [40, 147], 'heart_rate.avg': [97, 199],
    },
}

# Show the configuration, then clean every listed sport.
print(threshold)
cleanData(threshold)

{'walk': {'heart_rate.avg': [97, 199], 'speed.max': [2, 40], 'heart_rate.min': [40, 147], 'heart_rate.max': [117, 225], 'speed.min': [0, 8], 'speed.avg': [2.58, 9.5]}, 'core stability training': {'heart_rate.avg': [49.4, 210.25], 'speed.max': [0, 150], 'heart_rate.min': [40, 197], 'heart_rate.max': [66, 238], 'speed.min': [0, 30], 'speed.avg': [0, 20]}, 'bike (transport)': {'heart_rate.avg': [63, 187.8], 'speed.max': [4, 76], 'heart_rate.min': [19, 139], 'heart_rate.max': [73, 234], 'speed.min': [0, 11], 'speed.avg': [10.6, 35.3]}, 'indoor cycling': {'heart_rate.avg': [88.5, 178.55], 'speed.min': [0, 37], 'speed.max': [0, 93], 'speed.avg': [0, 59]}, 'orienteering': {'heart_rate.avg': [97, 199], 'speed.max': [5, 38], 'heart_rate.min': [40, 147], 'heart_rate.max': [117, 225], 'speed.min': [0, 6], 'speed.avg': [4.29, 18.1]}, 'bike': {'heart_rate.avg': [69.29, 194.89], 'speed.max': [4, 88], 'heart_rate.min': [31, 139], 'heart_rate.max': [92, 230], 'speed.min': [0, 17], 'speed.avg': [7.85, 

In [6]:
# Clean a single sport: only 'run' is enabled for this pass.
# NOTE(review): cleanData opens its output in append mode, so running this
# after 'run' has already been cleaned appends the surviving records again.
threshold = {
    'run': {
        'speed.max': [4, 60], 'speed.min': [0, 12], 'speed.avg': [3.56, 19.24],
        'heart_rate.max': [104, 230], 'heart_rate.min': [40, 153], 'heart_rate.avg': [92.8, 200.5],
    },
}

# Show the configuration, run the cleaning, and mark completion.
print(threshold)
cleanData(threshold)
print('Finished.')

{'run': {'heart_rate.avg': [92.8, 200.5], 'speed.max': [4, 60], 'heart_rate.min': [40, 153], 'heart_rate.max': [104, 230], 'speed.min': [0, 12], 'speed.avg': [3.56, 19.24]}}
Cleanning run
For sport run, the range of avg heart_rate is [92.8, 200.5]
For sport run, the range of max speed is [4, 60]
For sport run, the range of min heart_rate is [40, 153]
For sport run, the range of max heart_rate is [104, 230]
For sport run, the range of min speed is [0, 12]
For sport run, the range of avg speed is [3.56, 19.24]
Finished.
