In [9]:
import numpy as np
import ijson
import pickle
import os
from __future__ import division

#Need to:
        #0. Scan the file when producing metadata and create a list of data point indices
        #1. Load the list of data point indices
        #2. Randomly permute the list of data point indices
        #3. Split the list of data point indices into training, validation, and test sets
        #4. Provide a method to get the next data point in the list (and either globally save the position of the list or save it implicitly in a generator)
        #5. Provide a method to reset this list position (for a new epoch)
        #6. Rewrite the endoIterator method to respect randomization (copy the random access iterator code from the ptb reader example)

class dataInterpreter(object):
    """Sequential reader and metadata builder for the Endomondo JSON data file.

    The class streams records from the ``users`` array of the JSON file with
    ijson, converts them to native strings, and can scan the whole file once to
    build (and cache on disk) the metadata needed to one-hot encode the nominal
    attributes.

    Randomized access (``getNextDataPoint``) and batching (``endoIterator``) are
    not implemented yet.
    """

    def __init__(self, fn="endomondoHR_proper.json", attributes=None, dataSet="train", allowMissingData=True):
        """Store the configuration; no file is opened here.

        Args:
            fn: Path of the JSON data file.
            attributes: Optional attribute list. When given it is passed to
                ``self.buildDataSchema``.
            dataSet: Name of the data split. It is only stored; nothing in this
                class reads it yet.
            allowMissingData: When true, data points that lack an attribute are
                skipped for that attribute while collecting class labels.
        """
        self.dataFileName=fn#Will eventually replace this with a data folder name
        self.dataSet=dataSet
        self.MetaDataLoaded=False
        self.dataSchemaLoaded=False
        self.currentDataPoint=None
        self.currentDataPointNumber=0
        self.dataPointPosition=0
        self.attIgnore=['id','url']#Attributes to ignore when building metadata
        #Strip whatever extension the file has (the old fixed 5-character slice only worked for ".json")
        self.metaDataFn=os.path.splitext(fn)[0]+"_metaData.p"
        self.allowMissingData=allowMissingData
        self.f=None#Open handle on the data file (set by createGenerator)
        self.dataObjects=None#ijson generator over the data points (set by createGenerator)
        if attributes is not None:
            #NOTE(review): buildDataSchema is not defined in this class as shown; this call
            #raises AttributeError unless a subclass or later code provides it.
            self.buildDataSchema(attributes)

    def _closeDataFile(self):
        #Close the data file handle (if any) and forget the generator that reads from it
        handle=getattr(self, 'f', None)
        if handle is not None:
            handle.close()
        self.f=None
        self.dataObjects=None

    def createGenerator(self): #Define a new data generator
        """Open the data file and return a fresh ijson generator over ``users.item``.

        Any handle left over from a previous generator is closed first so that
        repeated calls do not leak file descriptors. The handle is kept on
        ``self.f`` and the generator on ``self.dataObjects``.
        """
        self._closeDataFile()
        self.f=open(self.dataFileName, 'r')
        self.dataObjects=ijson.items(self.f, 'users.item')
        return self.dataObjects

    def getNextDataPoint(self):
        """Randomized-order access to the next data point (not implemented yet).

        Raises:
            NotImplementedError: always. The previous body returned an undefined
                name and therefore failed with NameError.
        """
        raise NotImplementedError("Randomized data point access has not yet been implemented")

    def getNextDataPointSequential(self):
        """Return the next data point in file order.

        A generator is created on first use (or after ``newEpoch``).

        Returns:
            A tuple ``(dataPoint, fileIndex)`` where ``fileIndex`` is
            ``self.f.tell()`` taken just before the data point is pulled from
            the generator.

        Raises:
            StopIteration: when the generator has no more data points.
        """
        objects=getattr(self, 'dataObjects', None)
        if objects is None:
            #Creating new generator
            objects=self.createGenerator()
        #NOTE(review): this is the read position of the underlying handle, not necessarily the
        #offset at which the returned record starts (presumably because ijson reads ahead in
        #chunks - confirm before using it for random access).
        fileIndex=self.f.tell()
        nextDataPoint=self.__convert(next(objects))

        return (nextDataPoint, fileIndex)

    def newEpoch(self):
        """Reset the sequential reading state so the next read starts from the top of the file."""
        self._closeDataFile()
        self.currentDataPoint=None
        self.currentDataPointNumber=0
        self.dataPointPosition=0
        return

    def endoIterator(self, batch_size, num_steps):
        """Placeholder for the batch iterator; currently does nothing and returns None."""
        return

    def scaleData(self, data, att):
        """Optionally rescale a value for better neural network performance.

        This can be run online, or offline with the results stored in a
        preprocessed data file (more efficient).

        Args:
            data: The value to rescale.
            att: Attribute name. ``"heart_rate"`` is divided by 250.0,
                ``"altitude"`` is cast to float and divided by 10000.0;
                ``"speed"`` and every other attribute are returned unchanged.
        """
        if att=="heart_rate":
            return data/250.0 #This will be replaced with an auto-ranging version
        if att=="altitude":
            return float(data)/10000.0 #This will be replaced with an auto-ranging version
        return data

    def __convert(self, unicData): #Converts the unicode text in a dictionary to ascii
        """Recursively convert unicode text in dicts/lists to native ``str``.

        On Python 2 ``unicode`` objects are encoded as UTF-8 byte strings. On
        Python 3 text is already native ``str`` and is returned unchanged.
        Adapted from http://stackoverflow.com/questions/13101653/python-convert-complex-dictionary-of-strings-from-unicode-to-ascii
        """
        if isinstance(unicData, dict):
            return {self.__convert(key): self.__convert(value) for key, value in unicData.items()}
        elif isinstance(unicData, list):
            return [self.__convert(element) for element in unicData]
        elif bytes is str and isinstance(unicData, unicode):  # noqa: F821 - only evaluated on Python 2
            return unicData.encode('utf-8')
        else:
            return unicData

    def buildEncoder(self, classLabels):
        """Map each class label to a one-hot list.

        Args:
            classLabels: Sequence (list or 1-D numpy array) of distinct labels.

        Returns:
            A dict ``label -> list`` in which the list has one entry per label,
            with a 1 at the label's position and 0 elsewhere.
        """
        encodingLength=len(classLabels)
        encoder={}
        for i, label in enumerate(classLabels):
            encoding=[0] * encodingLength
            encoding[i]=1
            encoder[label]=encoding
        return encoder

    def getDataLabels(self, dataPoints, att):
        """Return the sorted unique values of attribute ``att`` over ``dataPoints``.

        Args:
            dataPoints: List of data point dictionaries.
            att: Attribute name to collect.

        Returns:
            A numpy array as produced by ``np.unique``. It is empty when no data
            point carries the attribute and ``allowMissingData`` is true.

        Raises:
            KeyError: if a data point lacks ``att`` and ``allowMissingData`` is false.
        """
        if self.allowMissingData:
            values=[dataPoint[att] for dataPoint in dataPoints if att in dataPoint]
        else:
            values=[dataPoint[att] for dataPoint in dataPoints]
        return np.unique(np.array(values))

    def writeSummaryFile(self):
        """Pickle the current metadata to ``self.metaDataFn``."""
        metaDataForWriting=metaDataEndomondo(self.numDataPoints, self.encodingLengths, self.oneHotEncoders, self.isSequence, self.isNominal, self.dataPointIndices)
        with open(self.metaDataFn, "wb") as f:
            pickle.dump(metaDataForWriting, f)

        print("Summary file written")

    def loadSummaryFile(self):
        """Load the metadata previously written by ``writeSummaryFile``.

        Raises:
            IOError: if the metadata file cannot be opened or unpickled.
        """
        try:
            print("Loading metadata")
            #SECURITY: pickle.load can execute arbitrary code; only load metadata files written by this code.
            with open(self.metaDataFn, "rb") as f:
                metaData = pickle.load(f)
        except Exception:
            raise IOError("Metadata file: " + self.metaDataFn + " not in valid pickle format")
        self.numDataPoints=metaData.numDataPoints
        self.encodingLengths=metaData.encodingLengths
        self.oneHotEncoders=metaData.oneHotEncoders
        self.isSequence=metaData.isSequence
        self.isNominal=metaData.isNominal
        self.dataPointIndices=metaData.dataPointIndices
        print("Metadata loaded")

    def _validateDataClasses(self, dataClasses):
        #Reject attribute kinds that the encoder construction below cannot handle
        for datclass in dataClasses:
            if self.isNominal[datclass]:
                if self.isSequence[datclass]:
                    raise NotImplementedError("Nominal data types for sequences have not yet been implemented")
            elif not self.isSequence[datclass]:
                #Neither nominal nor a sequence
                raise NotImplementedError("Non-nominal data types for non-sequences have not yet been implemented")

    def buildMetaData(self):
        """Load the cached metadata, or build it with one full pass over the data file.

        When ``self.metaDataFn`` exists it is loaded. Otherwise the file is read
        sequentially from the start, the file index of every data point and the
        set of labels of every nominal attribute are collected, the one-hot
        encoders are built, and the result is written with ``writeSummaryFile``.
        The reader is reset (``newEpoch``) before and after the scan.

        Only ``StopIteration`` ends the scan; any other error while reading or
        processing a data point now propagates instead of being silently treated
        as the end of the data.
        """
        if os.path.isfile(self.metaDataFn):#If a summary file exists
            self.loadSummaryFile()#Load that summary file and use it to capture all the necessary info
        else:
            print("Building data schema")
            #Build such a summary file by running through the full dataset and capturing the necessary statistics
            self.isSequence={'altitude':True, 'gender':False, 'heart_rate':True, 'id':False, 'latitude':True, 'longitude':True,
                             'speed':True, 'sport':False, 'timestamp':True, 'url':False, 'userId':False}#Handcoded
            self.isNominal={'altitude':False, 'gender':True, 'heart_rate':False, 'id':True, 'latitude':False, 'longitude':False,
                            'speed':False, 'sport':True, 'timestamp':False, 'url':True, 'userId':True}#Handcoded
            allDataClasses=['altitude', 'gender', 'heart_rate', 'id', 'latitude', 'longitude',
                            'speed', 'sport', 'timestamp', 'url', 'userId']
            dataClasses=[x for x in allDataClasses if x not in self.attIgnore]#get rid of the attributes that we are ignoring
            #The supported/unsupported decision depends only on the attribute, so check it once up front
            self._validateDataClasses(dataClasses)
            nominalClasses=[x for x in dataClasses if self.isNominal[x]]

            self.newEpoch()#makes sure to reset things
            classLabels={}
            dataPointIndices=[]
            numDataPoints=0
            try:
                while True:
                    if numDataPoints%1000==0:
                        print("Currently at data point " + str(numDataPoints))
                    try:
                        currData, currDataIndex=self.getNextDataPointSequential()
                    except StopIteration:
                        print("Stopped at " + str(numDataPoints) + " data points")
                        break
                    #append, not extend: the index is a single integer
                    dataPointIndices.append(currDataIndex)
                    for datclass in nominalClasses:
                        dataClassLabels=self.getDataLabels([currData], datclass)
                        if dataClassLabels.size==0:
                            continue#Attribute missing from this data point
                        if datclass not in classLabels: #If it is the first step
                            classLabels[datclass]=dataClassLabels
                        else:
                            classLabels[datclass]=np.unique(np.concatenate([dataClassLabels,classLabels[datclass]]))
                    numDataPoints=numDataPoints+1
            finally:
                #Release the file handle and leave the reader positioned at the start of the file
                self.newEpoch()

            oneHotEncoders={}
            encodingLengths={}
            for datclass in dataClasses:
                if self.isSequence[datclass]:
                    #Validated above to be non-nominal: one value per time step
                    encodingLengths[datclass]=1
                else:
                    #Validated above to be nominal; may have no labels if the file held no data
                    labels=classLabels.get(datclass, np.array([]))
                    oneHotEncoders[datclass]=self.buildEncoder(labels)
                    encodingLengths[datclass]=len(labels)

            #Set all of the summary information to self properties
            self.numDataPoints=numDataPoints
            self.encodingLengths=encodingLengths#A dictionary that maps attributes to the lengths of their vector encoding schemes
            self.oneHotEncoders=oneHotEncoders#A dictionary of dictionaries where the outer dictionary maps attributes to encoding schemes and where each encoding scheme is a dictionary that maps attribute values to one hot encodings
            self.dataPointIndices=dataPointIndices#The file index recorded for each data point, in file order

            #Save that summary file so that it can be used next time
            self.writeSummaryFile()
        self.MetaDataLoaded=True

    def oneHot(self, dataPoint, att):
        """Return the one-hot encoding of ``dataPoint[att]``.

        Requires ``self.oneHotEncoders`` (set by ``buildMetaData`` or
        ``loadSummaryFile``).

        Raises:
            KeyError: if the attribute has no encoder or the value was not seen
                when the encoder was built.
        """
        dataValue=dataPoint[att]
        #Use a stored schema dictionary to return the correct encoding scheme for the attribute (an encoding scheme is also a dictionary)
        encoder=self.oneHotEncoders[att]
        #Use this encoding scheme to get the encoding
        encoding=encoder[dataValue]
        return encoding
    
    
    
    
class metaDataEndomondo(object):
    """Plain container for the dataset metadata that is stored on disk.

    Instances are meant to be pickled and unpickled; the attribute names below
    are therefore part of the on-disk format.
    """

    def __init__(self, numDataPoints, encodingLengths, oneHotEncoders, isSequence, isNominal, dataPointIndices):
        # Every constructor argument is stored unchanged under the same name.
        fields = (
            ("numDataPoints", numDataPoints),
            ("encodingLengths", encodingLengths),
            ("oneHotEncoders", oneHotEncoders),
            ("isSequence", isSequence),
            ("isNominal", isNominal),
            ("dataPointIndices", dataPointIndices),
        )
        for fieldName, fieldValue in fields:
            setattr(self, fieldName, fieldValue)

In [10]:
# Create the reader for the Endomondo JSON file. No attributes are passed, so the
# constructor does not try to build a data schema, and no file is opened yet.
endoReader=dataInterpreter(fn="multimodalDBM/endomondoHR_proper.json", allowMissingData=True)

In [11]:
# Open the data file and create the ijson generator over its 'users.item' entries.
# The same generator is also kept on endoReader.dataObjects.
testGen=endoReader.createGenerator()

In [14]:
# Pull the next data point from the generator. The result is (dataPoint, fileIndex),
# where fileIndex is endoReader.f.tell() taken just before the data point was read.
dataTuple=endoReader.getNextDataPointSequential()
# Show the recorded file index.
dataTuple[1]

49252

In [21]:
# Look at the raw file contents at the recorded file index.
# A separate handle is used on purpose: endoReader.f is the handle the ijson
# generator reads from, and seeking on it would corrupt later sequential reads.
# The offset comes from dataTuple rather than a number copied from an earlier output.
with open(endoReader.dataFileName, 'r') as rawFile:
    rawFile.seek(dataTuple[1])
    rawPreview=rawFile.read(10000)
rawPreview

'347, 1408224352, 1408224363, 1408224381, 1408224385, 1408224390, 1408224398, 1408224401, 1408224416, 1408224422, 1408224431, 1408224436, 1408224450, 1408224466, 1408224480, 1408224484, 1408224495, 1408224498, 1408224505, 1408224519, 1408224534, 1408224538, 1408224550, 1408224554, 1408224568, 1408224575, 1408224579, 1408224582, 1408224588, 1408224591, 1408224595, 1408224598, 1408224602, 1408224606, 1408224620, 1408224629, 1408224638, 1408224643, 1408224647, 1408224652, 1408224660, 1408224663, 1408224668, 1408224672, 1408224688, 1408224696, 1408224699, 1408224705, 1408224708, 1408224712, 1408224716, 1408224719, 1408224724, 1408224727, 1408224739, 1408224750, 1408224757, 1408224773, 1408224787, 1408224794, 1408224803, 1408224808, 1408224827, 1408224846, 1408224866, 1408224884, 1408224892, 1408224906, 1408224919, 1408224925, 1408224930, 1408224938, 1408224943, 1408224951, 1408224955, 1408224962, 1408224971, 1408224975, 1408224987, 1408224998, 1408225008, 1408225012, 1408225029, 1408225043