In [None]:
from __future__ import division
import numpy as np
import ijson
import pickle
import os
import jsonReader

#Need to:
        #0. Scan the file when producing metadata and create a list of data point indices
        #1. Load the list of data point indices
        #2. Randomly permute the list of data point indices
        #3. Split the list of data point indices into training, validation, and test sets
        #4. Provide a method to get the next data point in the list (and either globally save the position of the list or save it implicitly in a generator)
        #5. Provide a method to reset this list position (for a new epoch)
        #6. Rewrite the endoIterator method to respect randomization (copy the random access iterator code from the ptb reader example)

class dataInterpreter(object):
    def __init__(self, fn="endomondoHR_proper.json", attributes=None, allowMissingData=True):
        self.dataFileName=fn#Will eventually replace this with a data folder name
        self.dataFile=open(self.dataFileName, 'r')
        self.MetaDataLoaded=False
        self.dataSchemaLoaded=False
        #self.currentDataPoint=None
        self.dataPointPosition=0
        self.attIgnore=['id','url']#Attributes to ignore when building metadata
        self.metaDataFn=fn[0:len(fn)-5]+"_metaData.p"
        self.allowMissingData=allowMissingData
        #self.valTestSplit=(.1,.1)
        if attributes is not None:
            self.buildDataSchema(attributes)
    
    def buildDataSchema(self, attributes, trainValTestSplit=(.8,.1,.1)):
        self.buildMetaData()
        self.splitForValidation(trainValTestSplit)
        #self.newEpoch()#Reset all indices and counters
        self.attributes=attributes
        dataDimSum=0
        for att in self.attributes:
            dataDimSum=dataDimSum+self.encodingLengths[att]
        self.dataDim=dataDimSum
        self.dataSchemaLoaded=True
    
    def createSequentialGenerator(self): #Define a new data generator
        filename = self.dataFileName
        self.f=open(filename, 'r')
        objects = ijson.items(self.f, 'users.item')
        self.dataObjects=objects
        return self.dataObjects
    
    def dataGenerator(self, dataSetOrder):
        for dp_index in dataSetOrder:
            fileIndices = self.dataPointIndices[dp_index]
            potentialNextDataPoint=jsonReader.getDataPoint(fileIndices, self.dataFile)
            
            """
            if self.allowMissingData==False:
                #Check if the next data point contains all the requested attributes
                for i, att in enumerate(self.attributes):
                    #print(att)
                    try:
                        test=self.currentDataPoint[att]
                    except:
                        print("Skipping data point because it lacks attribute: " + att)
                        #print("Skipping data point because it lacks attribute")
                        return self.getNextDataPoint() #Try the next one instead
            """
            yield potentialNextDataPoint #returns next data point
    
    def randomizeDataOrder(self, dataIndices):
        return np.random.permutation(dataIndices)

    #def getNextDataPoint(self):
    #        jsonReader.getDataPoint(index, dataFile)
    #    return dataPoint

    def getNextDataPointSequential(self):
        try: #If there is a generator already defined
            objects=self.dataObjects
        except: #Otherwise create a new one
            #Creating new generator
            objects=self.createSequentialGenerator()
        nextDataPoint=self.__convert(objects.next())

        return nextDataPoint
    
    #def newEpoch(self):
        # A convenience function for reseting the data loader to start a new epoch
        #self.currentDataPoint = None
    #    self.dataPointPosition = 0  # The position within a data point (within an exercise)       
    
    
    def batchIterator(self, batch_size, trainValidTest):
        #Returns a tensorflow tensor (a numpy array) containing a batch of data
        #Can be used directly for feed or to preprocess for additional efficiency
        
        #Currently does not explicitly separate exercise routines. 
        #Can be augmented with a variable that captures end and begnning of a routine if this helps.
        
        if trainValidTest=='train':
            self.trainingOrder = self.randomizeDataOrder(self.trainingSet)
            dataGen=self.dataGenerator(self.trainingOrder)
        elif trainValidTest=='valid':
            self.validationOrder = self.randomizeDataOrder(self.validationSet)
            dataGen=self.dataGenerator(self.validationOrder)
        elif trainValidTest=='test':
            self.testOrder = self.randomizeDataOrder(self.testSet)
            dataGen=self.dataGenerator(self.testOrder)
        else:
            raise(exception("Invalid dataset type. Must be 'train', 'valid', or 'test'"))
        
        if self.dataSchemaLoaded==False:
            raise(RuntimeError("Need to load a data schema"))
        
        dataBatch = np.zeros((batch_size, self.dataDim))
        #self.dataDim is the total concatenated length of the data at each time point (for all attributes)
                
        #if currentDataPoint is None: #If starting an epoch, grab the first data point
        currentDataPoint=dataGen.next()
        dataPointPosition=0
        currentDataPointLength=self.getDataPointLength(currentDataPoint)
        moreData=True
        while moreData:
            for i in range(batch_size):
                #Need code for getting the current data point and iterating through it until the end of it...
                #if end of data point:
                    #currentPoint = next data point
                dataList = [] #A mutable data structure to allow us to construct the data instance...
                if dataPointPosition==currentDataPointLength: #Check to see if new data point is needed
                    try:
                        currentDataPoint=dataGen.next()
                    except: #If there is no more data, return what you have
                        moreData=False
                        yield dataBatch #May need to pad this??
                    currentDataPointLength=self.getDataPointLength(currentDataPoint)
                    dataPointPosition=0
                for j, att in enumerate(self.attributes):
                    if self.isSequence[att]: #Need to limit the sequence to the end of the batch...
                        #Put the sequence attributes in their proper positions in the tensor array
                        #These are numeric encoding schemes.
                        attData=currentDataPoint[att][dataPointPosition]#Get the next entry in the attribute sequence for the current data point
                    else:
                        #Put the context attributes in their proper positions in the tensor array
                        #These are a one-hot encoding schemes except in the case of "age" and the like
                        if self.isNominal[att]:#Checks whether the data is nominal
                            attData = self.oneHot(currentDataPoint, att) #returns a list
                        else:
                            attData = currentDataPoint #Handles ordinal and numeric data

                    scaledAttData=self.scaleData(attData, att)#Rescales data if needed
                    if self.isList(scaledAttData):
                        dataList.extend(scaledAttData)
                    else:
                        dataList.append(scaledAttData)           
                if len(dataList)==self.dataDim:
                    dataBatch[i,:]=dataList
                else:
                    print("Data list length: " + dataList)
                    print("Data schema length: " + self.dataDim)
                    raise(ValueError("Data is not formatted according to the schema"))

                dataPointPosition=dataPointPosition+1

            yield dataBatch
    
    def endoIterator(self, batch_size, num_steps, trainValidTest):
        
        batchGen = self.batchIterator(batch_size*(num_steps+1), trainValidTest)

        data_len = self.numDataPoints
        batch_len = data_len // batch_size
        epoch_size = (batch_len - 1) // num_steps

        if epoch_size == 0:
            raise ValueError("epoch_size == 0, decrease batch_size or num_steps")

        #For these guys, the labels are simply the next sequence. This is to train the model to reprodue the text.
        #Since I am not really trying to do this, I should generate the labels seperately.
        #However, I might find that training the net this way (to predict the sequence) and then transplanting the weights into the full model might be useful...
        """for i in range(epoch_size):
            batchData=self.nextBatch(batch_size)
            data = np.zeros([batch_size, batch_len, self.dataDim])
            for j in range(batch_size):
                data[j] = batchData[batch_len * j:batch_len * (j + 1)]
            x = data[:, i*num_steps:(i+1)*num_steps]
            y = data[:, i*num_steps+1:(i+1)*num_steps+1]
            yield (x, y)"""
        
        #The code below is not ideal because it trains everything in order whereby each batch is comprised sequentially of the data
        #It should be OK for basic testing, however.
        #and it may miss some transitions (not sure)
        #print( epoch_size)
        for i in range(epoch_size):
            batchData=batchGen.next()
            #print(batchData.shape)
            data = np.zeros([batch_size, num_steps+1, self.dataDim])
            for j in range(batch_size):
                data[j,:,:] = batchData[(num_steps * j):((num_steps * (j + 1))+1),:]
            #print(data.shape)
            #x = data[:, i*num_steps:((i+1)*num_steps),:]
            x = data[:, 0:num_steps, :]
            #y = data[:, i*(num_steps+1):((i+1)*num_steps+1),:]
            y = data[:, 1:(num_steps+1), :]
            yield (x, y)
        
    
    def splitForValidation(self, valTestSplit):
        #Construct seperate data files for the training, test, and validation data
        self.numDataPoints
        trainingSetSize=int(round(self.numDataPoints*valTestSplit[0]))
        validationSetSize=int(round(self.numDataPoints*valTestSplit[1]))
        testSetSize=int(round(self.numDataPoints*valTestSplit[2]))
        randomOrder=self.randomizeDataOrder(self.numDataPoints)
        
        self.trainingSet=randomOrder[0:trainingSetSize]
        self.validationSet=randomOrder[trainingSetSize:trainingSetSize+validationSetSize]
        self.testSet=randomOrder[trainingSetSize+validationSetSize:trainingSetSize+validationSetSize+testSetSize]
        
        #print("training set size:" + str(len(self.trainingSet))
        #print("validation set size:" + str(len(self.validationSet))
        #print("test set size:" + str(len(self.testSet)))

    def scaleData(self, data, att):
        #This function provides optional rescaling of the data for optimal neural network performance. 
        #It can either be run online or offline w/ results stored in a preprocessed data file (more effecient)
        if att=="speed":
            scaledData=data
            return scaledData
        elif att=="heart_rate":
            scaledData=data/250.0 #This will be replaced with an auto-ranging version
            return scaledData
        elif att=="altitude":
            scaledData=float(data)/10000.0 #This will be replaced with an auto-ranging version
            return scaledData
        else:
            return data
        
    def __convert(self, unicData): #Converts the unicode text in a dictionary to ascii
        #Shamelessly lifted from http://stackoverflow.com/questions/13101653/python-convert-complex-dictionary-of-strings-from-unicode-to-ascii
        if isinstance(unicData, dict):
            return {self.__convert(key): self.__convert(value) for key, value in unicData.iteritems()}
        elif isinstance(unicData, list):
            return [self.__convert(element) for element in unicData]
        elif isinstance(unicData, unicode):
            return unicData.encode('utf-8')
        else:
            return unicData
        
    def getDataPointLength(self, dataPoint):
        #Checks a single attribute. If the length of all sequence attributes is not equal, additional code will need to be written...
        return len(dataPoint["heart_rate"])#tries "heart_rate"
    
    def isList(self, attData):
        #checks whether the variable attData is a list and returns true or false
        return isinstance(attData, list)
        #might want to try isSubclass(attData, list) if this doesn't work...
    
    def buildEncoder(self, classLabels):
        #Constructs a dictionary that maps each class label to a list (encoding scheme) where one entry in the list is 1 and the remainder are 0
        encodingLength=classLabels.size
        encoder={}
        for i, label in enumerate(classLabels):
            encoding=[0] * encodingLength
            encoding[i]=1
            encoder[label]=encoding
        return encoder
    
    def getDataLabels(self, data, dataClass):
        #The "data" argument is in the same format as is returned by "getNdatapoints"
        #If there is a use case that involves finding all the possible labels for a given class, a seperate function should be written to save memory usage...
        class_labels = [col[dataClass] for col in data]
        return np.unique(np.array(class_labels))
    
    def writeSummaryFile(self):
        metaDataForWriting=metaDataEndomondo(self.numDataPoints, self.encodingLengths, self.oneHotEncoders, self.isSequence, self.isNominal, self.dataPointIndices)
        with open(self.metaDataFn, "wb") as f:
            pickle.dump(metaDataForWriting, f)

        #pickle.dump(metaDataForWriting, open(self.metaDataFn, "wb"))
        print("Summary file written")
        
    def loadSummaryFile(self):
        try:
            print("Loading metadata")
            with open(self.metaDataFn, "rb") as f:
                metaData = pickle.load(f)
                
            #metaData=pickle.load(open(self.metaDataFn, "rb"))
        except:
            raise(IOError("Metadata file: " + self.metaDataFn + " not in valid pickle format"))
        self.numDataPoints=metaData.numDataPoints
        self.encodingLengths=metaData.encodingLengths
        self.oneHotEncoders=metaData.oneHotEncoders
        #self.dataDim=metaData.dataDim
        self.isSequence=metaData.isSequence
        self.isNominal=metaData.isNominal
        self.dataPointIndices=metaData.dataPointIndices
        print("Metadata loaded")
        
    def buildMetaData(self):
        #Takes a list of attributes and the current datafile and constructs a schema for the data to be input into the RNN.
        if os.path.isfile(self.metaDataFn):#If a summary file exists
            self.loadSummaryFile()#Load that summary file and use it to capture all the necessary info
        else:
            print("Building data schema")
            #Build such a summary file by running through the full dataset and capturing the necessary statistics
            self.isSequence={'altitude':True, 'gender':False, 'heart_rate':True, 'id':False, 'latitude':True, 'longitude':True,
                             'speed':True, 'sport':False, 'timestamp':True, 'url':False, 'userId':False}#Handcoded
            self.isNominal={'altitude':False, 'gender':True, 'heart_rate':False, 'id':True, 'latitude':False, 'longitude':False,
                            'speed':False, 'sport':True, 'timestamp':False, 'url':True, 'userId':True}#Handcoded
            allDataClasses=['altitude', 'gender', 'heart_rate', 'id', 'latitude', 'longitude',
       'speed', 'sport', 'timestamp', 'url', 'userId']
            dataClasses=[x for x in allDataClasses if x not in self.attIgnore]#get rid of the attributes that we are ignoring
            #self.newEpoch()#makes sure to reset things
            moreData=True
            classLabels={}
            numDataPoints=0
            while moreData:
                if numDataPoints%1000==0:
                    print("Currently at data point " + str(numDataPoints))
                try:
                    currData=[self.getNextDataPointSequential()]
                    #dataClasses = self.getDataClasses(currData)#This could be removed to make it more effecient
                    for datclass in dataClasses:
                        if self.isNominal[datclass]: #If it is nominal data
                            if self.isSequence[datclass]:
                                raise(NotImplementedError("Nominal data types for sequences have not yet been implemented"))
                            dataClassLabels=self.getDataLabels(currData, datclass)
                            if classLabels.get(datclass) is None: #If it is the first step
                                classLabels[datclass]=dataClassLabels
                            else:
                                #print(np.concatenate(dataClassLabels,classLabels[datclass]))
                                classLabels[datclass]=np.unique(np.concatenate([dataClassLabels,classLabels[datclass]]))
                        else:
                            if self.isSequence[datclass]!=True:
                                #If is it nominal and not a sequence
                                raise(NotImplementedError("Non-nominal data types for non-sequences have not yet been implemented"))
                    numDataPoints=numDataPoints+1
                except:
                    moreData=False
                    print("Stopped at " + str(numDataPoints) + " data points")
                #if numDataPoints>10000:#For testing
                #    moreData=False#For testing
            
            oneHotEncoders={}
            encodingLengths={}
            dataDim=0
            for datclass in dataClasses:
                if self.isSequence[datclass]==False:
                    oneHotEncoders[datclass]=self.buildEncoder(classLabels[datclass])
                    encodingLengths[datclass]=classLabels[datclass].size
                    #dataDim=dataDim+encodingLengths[datclass]
                else:
                    if self.isNominal[datclass]:
                        raise(NotImplementedError("Nominal data types for sequences have not yet been implemented"))
                    else:
                        encodingLengths[datclass]=1
                        #dataDim=dataDim+1
            print("Getting data indices")
            dataPointIndices=jsonReader.getDataIndices(self.dataFileName)
            
            #Set all of the summary information to self properties
            self.numDataPoints=numDataPoints
            self.encodingLengths=encodingLengths#A dictionary that maps attributes to the lengths of their vector encoding schemes
            self.oneHotEncoders=oneHotEncoders#A dictionary of dictionaries where the outer dictionary maps attributes to encoding schemes and where each encoding scheme is a dictionary that maps attribute values to one hot encodings
            #self.dataDim=dataDim#The sum of all the encoding lengths for the relevant attributes
            #self.isSequence=#A dictionary that returns whether an attribute takes the form of a sequence of data
            #self.isNominal=#A dictionary that returns whether an attribute is nominal in form (neither numeric nor ordinal)
            self.dataPointIndices=dataPointIndices
            
            #Save that summary file so that it can be used next time
            self.writeSummaryFile()
        self.MetaDataLoaded=True 
        
    def oneHot(self, dataPoint, att):
        #Takes the current data point and the attribute type and uses the data schema to provide the one-hot encoding for the variable
        dataValue=dataPoint[att]       
        #Use a stored schema dictionary to return the correct encoding scheme for the attribute (an encoding scheme is also a dictionary)
        encoder=self.oneHotEncoders[att]
        #Use this encoding scheme to get the encoding
        encoding=encoder[dataValue]
        return encoding
    
    
class metaDataEndomondo(object):
    #For disk storage of metadata
    #Meant to be pickled and unpickled
    def __init__(self, numDataPoints, encodingLengths, oneHotEncoders, isSequence, isNominal, dataPointIndices):
        self.numDataPoints=numDataPoints
        self.encodingLengths=encodingLengths
        self.oneHotEncoders=oneHotEncoders
        #self.dataDim=dataDim
        self.isSequence=isSequence
        self.isNominal=isNominal
        self.dataPointIndices=dataPointIndices

In [None]:
endoReader=dataInterpreter(fn="../multimodalDBM/endomondoHR_proper.json", allowMissingData=True)
endoReader.buildDataSchema(["speed", "sport", "heart_rate","gender", "altitude"])

Building data schema
Currently at data point 0
Currently at data point 1000
Currently at data point 2000
Currently at data point 3000
Currently at data point 4000
Currently at data point 5000
Currently at data point 6000
Currently at data point 7000
Currently at data point 8000
Currently at data point 9000
Currently at data point 10000


In [34]:
testGen=endoReader.batchIterator(1,'train')

In [35]:
dataPoint=testGen.next()
dataPoint

AttributeError: 'dataInterpreter' object has no attribute 'trainingSet'

In [21]:
endoReader.f.seek(49252)
endoReader.f.read(10000)

'347, 1408224352, 1408224363, 1408224381, 1408224385, 1408224390, 1408224398, 1408224401, 1408224416, 1408224422, 1408224431, 1408224436, 1408224450, 1408224466, 1408224480, 1408224484, 1408224495, 1408224498, 1408224505, 1408224519, 1408224534, 1408224538, 1408224550, 1408224554, 1408224568, 1408224575, 1408224579, 1408224582, 1408224588, 1408224591, 1408224595, 1408224598, 1408224602, 1408224606, 1408224620, 1408224629, 1408224638, 1408224643, 1408224647, 1408224652, 1408224660, 1408224663, 1408224668, 1408224672, 1408224688, 1408224696, 1408224699, 1408224705, 1408224708, 1408224712, 1408224716, 1408224719, 1408224724, 1408224727, 1408224739, 1408224750, 1408224757, 1408224773, 1408224787, 1408224794, 1408224803, 1408224808, 1408224827, 1408224846, 1408224866, 1408224884, 1408224892, 1408224906, 1408224919, 1408224925, 1408224930, 1408224938, 1408224943, 1408224951, 1408224955, 1408224962, 1408224971, 1408224975, 1408224987, 1408224998, 1408225008, 1408225012, 1408225029, 1408225043

In [1]:
"""
This file needs to read in the json data and return both the structured data records for each data point as well as the
locations of each data point in the file
It also needs to be able to return a data point given a the known file position of that data point"""

import json

def getDataIndices(dataFileName):
    """Takes a data file name and scans through the file, recording the beginning and end file index for each data point
     Then returns this information as a list"""

    dataFile = open(dataFileName, 'r')
    dataIndices=[]# A list of (beginning, end) tuples
    eof=False
    curlyDepth=0 #A variable used for keeping track of beginnings and ends of data points
    currentIndices=[]#Placeholder
    numDataPointsAccessed=0
    #dataGen=fileIterator(dataFile, 10000)
    while not eof:
        if (numDataPointsAccessed==0) or (curlyDepth>0):
            #Go through the characters and keep track of the outermost brackets enclosing data points. Take note of their positions.
            nextChar=dataFile.read(1)
            #nextChar=dataGen.next()
            if nextChar == '{':
                if curlyDepth==1:
                    currentIndices.append(dataFile.tell()-1)
                    numDataPointsAccessed+=1
                curlyDepth=curlyDepth+1
            elif nextChar== '}':
                if curlyDepth==2:
                    if numDataPointsAccessed%1000==0:
                        print numDataPointsAccessed
                    currentIndices.append(dataFile.tell())
                    dataIndices.append(currentIndices)
                    currentIndices=[]
                curlyDepth=curlyDepth-1
        else:
            eof = True

    return dataIndices

def getDataPoint(index, dataFile):
    """Takes a pair of data point indices (the beginning and end of the file location) and a file handle
    and reads the file between those positions, converting the json formatted data to a Python dictionary"""

    dataFile.seek(index[0])
    rawJson = dataFile.read(index[1]-index[0])

    #Convert the json into dictionaries
    dataPoint = json.loads(rawJson)

    return dataPoint

def fileIterator(dataFile, readSize):
    eof=False
    while eof==False:
        try:
            dataBlock=dataFile.read(readSize)
            for dp in dataBlock:
                yield dp
        except:
            eof=True
            
        


In [2]:
dataIndices=getDataIndices('/home/lmuhlste/multimodalDBM/endomondoHR_proper.json')

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
15

In [3]:
dataIndices[0]

[17, 35557]

In [4]:
dataFile = open('/home/lmuhlste/multimodalDBM/endomondoHR_proper.json', 'r')

In [13]:
dataFile.seek(17-1)

In [14]:
dataindex=dataIndices[0]
testDP=dataFile.read((dataindex[1]-dataindex[0])+1)

In [15]:
testDP[0:100]

'{"speed": [6.8652, 16.4736, 19.1988, 20.4804, 31.3956, 31.8096, 24.39, 23.346, 26.2368, 28.4868, 28.'

In [16]:
testDP[35500:]

'525, 60.17335429787636], "sport": "bike"}'

In [18]:
index=dataIndices[0]
index[0]-=1
formattedDP=getDataPoint(index, dataFile)

In [19]:
formattedDP

{u'altitude': [41.6,
  40.6,
  40.6,
  38.4,
  37.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  34.0,
  33.2,
  31.6,
  29.8,
  29.8,
  29.8,
  29.8,
  29.8,
  29.8,
  29.8,
  32.0,
  34.2,
  34.2,
  33.8,
  32.2,
  34.4,
  37.0,
  38.6,
  36.6,
  34.4,
  32.6,
  29.8,
  27.8,
  26.2,
  23.6,
  22.0,
  21.6,
  21.6,
  21.6,
  22.0,
  24.4,
  27.0,
  27.8,
  30.0,
  30.0,
  30.0,
  30.0,
  30.0,
  30.0,
  30.8,
  33.2,
  35.6,
  38.2,
  40.2,
  42.2,
  44.4,
  46.4,
  48.2,
  48.6,
  46.6,
  43.6,
  41.4,
  37.8,
  34.0,
  31.4,
  28.8,
  30.6,
  31.0,
  31.0,
  31.0,
  32.8,
  33.2,
  33.2,
  33.6,
  35.2,
  37.4,
  39.6,
  41.6,
  43.8,
  42.0,
  40.0,
  37.2,
  39.4,
  41.6,
  44.0,
  45.6,
  48.2,
  50.2,
  52.4,
  55.4,
  56.6,
  54.4,
  55.2,
  57.8,
  59.0,
  60.2,
  57.8,
  56.4,
  54.2,
  51.4,
  51.0,
  53.6,
  55.2,
  57.2,
  60.0,
  62.6,
  65.2,
  63.8,
  61.0,
  60.6,
  60.6,
  58.2,
  56.4,
  58.2,
  60.4,
  62.2,
  6

In [21]:
formattedDP.keys()

[u'url',
 u'gender',
 u'altitude',
 u'userId',
 u'longitude',
 u'heart_rate',
 u'timestamp',
 u'latitude',
 u'sport',
 u'speed',
 u'id']

In [27]:
formattedDP['speed']

[6.8652,
 16.4736,
 19.1988,
 20.4804,
 31.3956,
 31.8096,
 24.39,
 23.346,
 26.2368,
 28.4868,
 28.4868,
 27.6624,
 24.8688,
 25.794,
 14.0868,
 23.346,
 26.4168,
 31.8096,
 32.1084,
 32.274,
 30.9672,
 29.0016,
 27.5436,
 34.6464,
 31.6404,
 30.3696,
 26.4816,
 26.8128,
 26.172,
 24.1956,
 33.9804,
 29.4048,
 21.2112,
 19.4328,
 27.3456,
 29.6784,
 33.5988,
 23.4864,
 21.7512,
 29.7936,
 31.1256,
 32.58,
 32.8608,
 30.0276,
 28.5192,
 29.7324,
 21.0744,
 19.8756,
 19.7712,
 19.1304,
 18.486,
 23.3892,
 25.6068,
 27.5364,
 25.83,
 29.8764,
 23.2416,
 19.3788,
 17.892,
 16.2864,
 15.0408,
 14.6952,
 15.192,
 14.7816,
 18.918,
 25.83,
 30.204,
 35.8488,
 41.2632,
 43.1172,
 43.5888,
 41.9328,
 31.1544,
 25.3296,
 25.3044,
 27.828,
 21.7836,
 21.006,
 23.2596,
 31.7556,
 13.7628,
 17.5536,
 20.9088,
 19.7748,
 18.7128,
 22.914,
 31.4928,
 32.3676,
 33.498,
 21.762,
 21.3984,
 18.9612,
 13.7772,
 13.9788,
 13.608,
 13.4856,
 13.4856,
 19.836,
 38.2824,
 21.8124,
 20.196,
 21.2544,
 34.012

In [28]:
import jsonReader

In [29]:
jsonReader.getDataPoint()

TypeError: getDataPoint() takes exactly 2 arguments (0 given)