In [None]:
import numpy as np
import math


class DataProcessor:
    """Turn a comma-separated transaction file into padded X / Y sequences.

    Pipeline (run by ``Generate_X_Y_Vectors``):

    1. ``LoadFile`` reads the header into ``columnsTags`` and every
       non-empty record into ``dataMatrix`` (lists of strings).
    2. ``SortMatrix`` orders rows by ``COLLECTOR_KEY`` ascending, then by
       ``DAYS_SINCE_LAST_VISIT`` descending.
    3. ``Generate_OneHot_Normalized_Matrix`` encodes each row as three
       vectors ``[collector key, features, one-hot FGC]``.
    4. ``getMaxSeqLen`` measures the runs of consecutive rows that share a
       collector key.
    5. ``generate_X_Y_vectors`` builds one padded input sequence (``X``)
       and one target vector (``Y``) per run.

    All state is per instance; every step rebuilds its own output, so the
    pipeline can be re-run on the same object without accumulating data.
    """

    def __init__(self):
        #columnsTags - names of columns
        self.columnsTags = []

        #dataMatrix as loaded from file (first row excluded)
        self.dataMatrix = []

        # each row is [collector key vector, features vector, onehot FGC vector]
        self.Onehot_Normalized_Matrix = []

        #sequences lengths and the longest of them
        self.maxSequenceLength = 0
        self.sequencesLengths = []

        # X and Y vectors to be retrieved
        self.X = []
        self.Y = []

    #the only function to be called
    def Generate_X_Y_Vectors(self, path):
        """Run the whole pipeline on the file at ``path``.

        Results are left in ``self.X`` and ``self.Y``; nothing is returned.
        """
        self.LoadFile(path)
        self.SortMatrix()
        self.Generate_OneHot_Normalized_Matrix()
        self.maxSequenceLength = self.getMaxSeqLen()
        self.generate_X_Y_vectors()

    #convert Onehot_Normalized_Matrix to the X,Y (collaborative filtering last step)
    def generate_X_Y_vectors(self):
        """Fill ``self.X`` and ``self.Y`` from the encoded matrix.

        For a run of length L >= 2 the first L-1 rows (features followed by
        the one-hot FGC vector) form the input sequence and the FGC vector
        of the last row is the target. A run of length 1 contributes its
        single row as input and an all-zero target. Every input sequence is
        padded with all-zero transactions up to ``maxSequenceLength``.

        Requires ``sequencesLengths`` and ``maxSequenceLength`` to be set
        (see ``getMaxSeqLen``). With no encoded rows, X and Y stay empty.
        """
        self.X = []
        self.Y = []

        matrix = self.Onehot_Normalized_Matrix
        if not matrix:
            return

        targetWidth = len(matrix[0][2])
        transactionWidth = len(matrix[0][1]) + targetWidth

        index = 0
        for length in self.sequencesLengths:
            # A single-row run still feeds its only row as input.
            inputCount = max(length - 1, 1)

            # Concatenate into NEW lists so the encoded matrix is not mutated.
            sequence = [matrix[k][1] + matrix[k][2]
                        for k in range(index, index + inputCount)]
            index += inputCount

            # Each padding slot gets its own list to avoid aliasing.
            for _ in range(len(sequence), self.maxSequenceLength):
                sequence.append([0] * transactionWidth)

            self.X.append(sequence)

            if length == 1:
                #empty target for non sequence transactions
                self.Y.append([0] * targetWidth)
            else:
                self.Y.append(list(matrix[index][2]))
                index += 1

    #get max len of transactions
    def getMaxSeqLen(self):
        """Rebuild ``sequencesLengths`` and return the longest run length.

        A run is a maximal group of consecutive encoded rows whose first
        vector (the raw collector key) compares equal. Returns 0 and leaves
        ``sequencesLengths`` empty when there are no encoded rows.
        """
        self.sequencesLengths = []

        matrix = self.Onehot_Normalized_Matrix
        if not matrix:
            return 0

        count = 1
        for previous, current in zip(matrix, matrix[1:]):
            if current[0] != previous[0]:
                #new sequence started: save the length of the finished one
                self.sequencesLengths.append(count)
                count = 1
            else:
                count += 1

        #save last sequence length
        self.sequencesLengths.append(count)

        return max(self.sequencesLengths)

    #convert all data in the matrix to one hot vectors and normalized values
    def Generate_OneHot_Normalized_Matrix(self):
        """Encode ``dataMatrix`` into ``Onehot_Normalized_Matrix``.

        * ``COLLECTOR_KEY`` is copied unchanged into vector 0.
        * ``FGC`` is one-hot encoded into vector 2.
        * ``LAST_DATE`` and ``target_2`` are skipped (if present).
        * Every other column goes into vector 1: normalized when all of its
          values parse as floats, one-hot encoded otherwise.

        Raises:
            ValueError: if ``COLLECTOR_KEY`` or ``FGC`` is not a column.
        """
        self.init_Onehot_Normalized_Matrix()

        COLLECTOR_KEY_Index = self._requireColumnIndex('COLLECTOR_KEY')
        FGC_Index = self._requireColumnIndex('FGC')
        # Optional columns: -1 simply never matches a real column index.
        Date_Index = self.getColumnIndex('LAST_DATE')
        Target_Index = self.getColumnIndex('target_2')

        if not self.dataMatrix:
            return

        for columnIndex in range(len(self.columnsTags)):

            #do not add values of date or target
            if columnIndex in (Date_Index, Target_Index):
                continue

            #add collector_key at first place
            if columnIndex == COLLECTOR_KEY_Index:
                self.addColumnValuesAsIs(columnIndex, 0)

            #add FGC as one hot vector in third (last) place
            elif columnIndex == FGC_Index:
                uniqueTags = self.getUniqueTags(columnIndex)
                self.addColumnsValuesAsOneHot(columnIndex, 2, uniqueTags)

            #add values in features (second place) onehot and normalized features
            else:
                try:
                    # Fails with ValueError as soon as one value is not numeric.
                    mean, std = self.get_Mean_STD(columnIndex)
                except ValueError:
                    #one hot for non-numerical values
                    uniqueTags = self.getUniqueTags(columnIndex)
                    self.addColumnsValuesAsOneHot(columnIndex, 1, uniqueTags)
                else:
                    #normalize for numerical values
                    self.addColumnsValuesNormalized(columnIndex, 1, mean, std)

    #get values at column Index and add them in their place by normalization(mean,std)
    def addColumnsValuesNormalized(self, columnIndex, vectorIndex, mean, std):
        """Append ``(value - mean) / (std + 1)`` for each row to vector ``vectorIndex``.

        The ``+ 1`` in the divisor is kept from the original formula; it
        also keeps the division defined when ``std`` is 0.
        """
        scale = std + 1
        for source, encoded in zip(self.dataMatrix, self.Onehot_Normalized_Matrix):
            encoded[vectorIndex].append((float(source[columnIndex]) - mean) / scale)

    #get mean and std of a column
    def get_Mean_STD(self, columnIndex):
        """Return ``(mean, population std)`` of column ``columnIndex``.

        Raises:
            ValueError: if any value in the column cannot be parsed as a float.
        """
        # Convert only the requested column; the builtin float replaces the
        # np.float alias, which no longer exists in current NumPy.
        values = [float(row[columnIndex]) for row in self.dataMatrix]
        column = np.array(values, dtype=float)
        return column.mean(), math.sqrt(column.var())

    #get values at column index and add them in their place in one hot matrix
    def addColumnsValuesAsOneHot(self, columnIndex, vectorIndex, uniqueTags):
        """Extend vector ``vectorIndex`` of each row with a one-hot encoding.

        The hot position is the index of the row's value in ``uniqueTags``
        (first occurrence if ``uniqueTags`` has duplicates).

        Raises:
            ValueError: if a row's value is not present in ``uniqueTags``.
        """
        # setdefault keeps the FIRST position of a tag, like list.index does.
        position = {}
        for offset, tag in enumerate(uniqueTags):
            position.setdefault(tag, offset)

        width = len(uniqueTags)
        for source, encoded in zip(self.dataMatrix, self.Onehot_Normalized_Matrix):
            tag = source[columnIndex]
            if tag not in position:
                raise ValueError('%r is not in uniqueTags' % (tag,))
            vec = [0] * width
            vec[position[tag]] = 1
            encoded[vectorIndex].extend(vec)

    #get unique list of tags
    def getUniqueTags(self, columnIndex):
        """Return the distinct values of a column in order of first appearance."""
        seen = set()
        unique = []
        for row in self.dataMatrix:
            tag = row[columnIndex]
            if tag not in seen:
                seen.add(tag)
                unique.append(tag)
        return unique

    #get values at column Index and add them unchanged to the given vector
    def addColumnValuesAsIs(self, columnIndex, vectorIndex):
        """Append each row's raw value of ``columnIndex`` to vector ``vectorIndex``."""
        for source, encoded in zip(self.dataMatrix, self.Onehot_Normalized_Matrix):
            encoded[vectorIndex].append(source[columnIndex])

    #put three empty vectors in each row
    def init_Onehot_Normalized_Matrix(self):
        """Reset the encoded matrix to one ``[[], [], []]`` row per data row."""
        self.Onehot_Normalized_Matrix = [[[], [], []] for _ in self.dataMatrix]

    #sort Matrix - ascending with user - descending with time
    def SortMatrix(self):
        """Sort ``dataMatrix`` in place by collector key, then by days descending.

        Both columns are compared as floats. The sort is stable, so rows
        that tie on both values keep their file order.

        Raises:
            ValueError: if ``COLLECTOR_KEY`` or ``DAYS_SINCE_LAST_VISIT`` is
                not a column, or if a value in them is not numeric.
        """
        keyIndex = self._requireColumnIndex('COLLECTOR_KEY')
        daysIndex = self._requireColumnIndex('DAYS_SINCE_LAST_VISIT')

        # Negating the days turns "descending" into an ascending sort key.
        self.dataMatrix.sort(
            key=lambda row: (float(row[keyIndex]), -float(row[daysIndex])))

    #get index of the column name (0 based)
    def getColumnIndex(self, columnName):
        """Return the 0-based index of ``columnName``, or -1 if it is absent."""
        try:
            return self.columnsTags.index(columnName)
        except ValueError:
            return -1

    #like getColumnIndex, but a missing column is an error
    def _requireColumnIndex(self, columnName):
        """Return the index of ``columnName`` or raise ``ValueError``.

        Guards against using -1 as a list index, which would silently
        select the last column instead of failing.
        """
        index = self.getColumnIndex(columnName)
        if index < 0:
            raise ValueError('required column %r not found in header %r'
                             % (columnName, self.columnsTags))
        return index

    #load data from file and close it
    def LoadFile(self, path):
        """Read the header into ``columnsTags`` and the records into ``dataMatrix``.

        Lines are split on plain commas (no quoting support). Blank lines
        are skipped. A record whose field count differs from the header is
        reported on stdout but still stored, as before.
        """
        self.dataMatrix = []

        # 'with' closes the file even if reading fails part-way.
        with open(path, mode='r') as reader:
            # strip() removes the line ending that would otherwise stay
            # glued to the last column name.
            self.columnsTags = reader.readline().strip().split(',')
            expected = len(self.columnsTags)

            #loop line by line; recordNum is the 1-based line number after the header
            for recordNum, record in enumerate(reader, 1):

                record = record.strip()
                if record == '':
                    continue

                fields = record.split(',')

                #check that fields are in the same length of columns names
                if len(fields) != expected:
                    print('missing or extra fields in record (' + str(recordNum) + ')')

                #update the matrix with new vector (fields vector as written in file)
                self.dataMatrix.append(fields)

    