In [1]:
import gzip
from haversine import haversine

In [2]:
def cleanDataByTimestamp(numOfTimestamps, threshold):
    """Tell whether a workout has enough timestamps to be kept.

    Args:
        numOfTimestamps: Number of timestamps recorded for the workout.
        threshold: Minimum acceptable count (inclusive).

    Returns:
        True when numOfTimestamps is at least threshold, False otherwise.
    """
    hasEnoughTimestamps = numOfTimestamps >= threshold
    return hasEnoughTimestamps

In [3]:
def formDistanceList(workout):
    """Compute the haversine distance between every pair of consecutive points.

    Args:
        workout: Mapping with parallel 'latitude' and 'longitude' sequences.

    Returns:
        A list holding one distance (requested in miles) per consecutive
        point pair, i.e. len(workout['longitude']) - 1 entries; empty when
        there are fewer than two points.
    """
    # NOTE(review): `miles=True` is the keyword used by older haversine
    # releases; newer ones appear to expect `unit=` instead -- confirm
    # against the installed version before upgrading the package.
    pairCount = len(workout['longitude']) - 1
    return [
        haversine(
            [workout['latitude'][idx], workout['longitude'][idx]],
            [workout['latitude'][idx + 1], workout['longitude'][idx + 1]],
            miles=True,
        )
        for idx in range(pairCount)
    ]

In [4]:
def formTimeList(workout):
    """Compute the gap between every pair of consecutive timestamps.

    Args:
        workout: Mapping whose 'timestamp' entry is a sequence of numbers.

    Returns:
        A list of (next - current) differences, one per consecutive pair;
        empty when there are fewer than two timestamps.
    """
    stamps = workout['timestamp']
    # Pair each timestamp with its successor and take the difference.
    return [later - earlier for earlier, later in zip(stamps, stamps[1:])]

In [5]:
def formSpeedList(distanceIntervals, timeIntervals):
    """Turn per-interval distances and durations into per-interval speeds.

    Each speed is distance / duration * 3600 (per-hour speed, assuming the
    durations are in seconds -- TODO confirm the timestamp unit).

    Args:
        distanceIntervals: Distance covered in each interval.
        timeIntervals: Duration of each interval, indexed in parallel with
            distanceIntervals.

    Returns:
        A list with one speed per distance interval, or an empty list as
        soon as an illegal duration is found. A duration is illegal when it
        is zero (division by zero) or negative (out-of-order timestamps,
        which would yield a negative speed).

    Raises:
        IndexError: If timeIntervals is shorter than distanceIntervals.
    """
    speedList = []
    for i, distance in enumerate(distanceIntervals):
        elapsed = timeIntervals[i]
        # A single bad interval invalidates the whole workout.
        if elapsed <= 0:
            return []
        speedList.append(distance / elapsed * 3600)
    return speedList

In [6]:
# Clean the raw workout dump: drop workouts with too few timestamps or with
# illegal time intervals, recompute the speed series for the rest, and append
# each surviving workout to a per-sport gzip file.
timestampThreshold = 100

# Keep track of status of data
total = 0
badNumOfTimestamp = 0
badTimeInterval = 0
legalData = 0

# Context managers guarantee the log and the input are closed even when a
# record raises part-way through the run.
with open('LOG_addSpeedCleanTimestanpDivideSports', 'w') as log, \
        gzip.open('../endomondoHR.json.gz', 'rb') as zin:
    for l in zin:

        # Increment total number of workouts
        total += 1

        # Evaluate the line
        l = l.decode('ascii')
        # SECURITY: eval() executes whatever code the input line contains.
        # Only run this on a trusted dataset; ast.literal_eval is the safe
        # alternative if every record is a plain Python literal.
        workout = eval(l)

        # Get the sport of the current workout
        currSport = workout['sport']

        # Log the current workout ID
        log.write('Workout #{}: '.format(workout['id']))

        # Check for number of timestamps
        validTimestamp = cleanDataByTimestamp(len(workout['timestamp']), timestampThreshold)

        if not validTimestamp:
            badNumOfTimestamp += 1
            log.write('# of timestamps ({}) is below threshold {}. Sport {}. DISCARD!\n'.format(len(workout['timestamp']), timestampThreshold, currSport))
            continue

        # Generate speed list
        distanceIntervals = formDistanceList(workout)
        timeIntervals = formTimeList(workout)
        speedList = formSpeedList(distanceIntervals, timeIntervals)

        # An empty speed list is how formSpeedList signals an illegal interval
        if len(speedList) == 0:
            badTimeInterval += 1
            log.write('Found illegal timestamp interval(s). Sport {}. DISCARD!\n'.format(currSport))
            continue

        # Rewrite the original speed
        workout['speed'] = speedList

        # Write to subfile according to workout type. The file is opened in
        # append mode, so re-running this cell adds duplicate records unless
        # the output directory is cleared first.
        with gzip.open('../sportsWithCleanedTimestamp/' + currSport + '.json.gz', 'ab') as zout:
            zout.write(bytes(str(workout) + '\n', 'ascii'))

        # Increment statistic
        legalData += 1

        # Log status
        log.write('{} timestamps with threshold {}. Everything good. Sport {}.\n'.format(len(workout['timestamp']), timestampThreshold, currSport))
    

In [7]:
total

253020

In [8]:
legalData

214457

In [9]:
badNumOfTimestamp

11399

In [10]:
badTimeInterval

27164

In [7]:
# Dump calculated vs. provided speeds for one workout to result.txt.
# NOTE(review): providedSpeed, workoutId and usrId are not defined in any
# visible cell, so this cell fails on a fresh kernel (Restart & Run All).
# Define them explicitly before this cell.
print(len(speedList))
print(len(providedSpeed))

# zip() stops at the shorter list, so a length mismatch between the two
# speed series cannot raise IndexError.
pair = [[calculated, provided] for calculated, provided in zip(speedList, providedSpeed)]

# The context manager closes the file even if a write fails.
with open('result.txt', 'w') as zout:
    zout.write('id is {}; userId is {}\n'.format(workoutId, usrId))
    zout.write('[Calculated Speed, Provided Speed], Difference\n\n')
    for currentPair in pair:
        zout.write('{}, {}\n'.format(currentPair, currentPair[0] - currentPair[1]))

500
498
