# In [11]:
class ClassifierPersonalised(object):
    """Train and evaluate one random-forest classifier per user.

    The class relies on names that must be provided by the surrounding
    module: ``np`` (numpy), ``sklearn`` (with ``sklearn.metrics``),
    ``RandomForestClassifier``, ``AETrainer`` and ``cross_validation``
    (presumably ``sklearn.cross_validation``, which newer scikit-learn
    releases replaced with ``sklearn.model_selection`` -- confirm against
    the file's imports).

    After ``classify`` the result attributes hold the values of a single
    run; after ``classify_personalised`` they hold lists with one entry per
    successful train/test split.
    """

    def __init__(self):
        self.frequency = {}    # label -> relative frequency in the train set
        self.accuracy = -1     # -1 means "nothing has been evaluated yet"
        self.prf = []          # precision / recall / f-score / support
        self.conf_matrix = []  # confusion matrix of the test predictions
        self.users = []        # users passed to classify_personalised

    def classify(self, x_train, y_train, x_test, y_test):
        """Fit a random forest on the train data and score it on the test data.

        Stores the relative label frequencies of ``y_train`` in
        ``self.frequency`` and the test-set accuracy, weighted
        precision/recall/f-score and confusion matrix in ``self.accuracy``,
        ``self.prf`` and ``self.conf_matrix``.

        Raises:
            ValueError: if a data sequence and its label sequence differ in
                length.
        """
        if len(x_train) != len(y_train) or len(x_test) != len(y_test):
            raise ValueError("Length mismatch of data and labels")

        # Relative frequency of every label seen in the train data.
        # float() keeps the values plain Python floats rather than numpy
        # scalars.
        labels, counts = np.unique(y_train, return_counts=True)
        n_train = len(y_train)
        self.frequency = dict(
            (label, float(count) / n_train)
            for label, count in zip(labels, counts))

        # Fixed random_state keeps the forest reproducible between runs.
        clf = RandomForestClassifier(max_depth=10, random_state=0)
        clf.fit(x_train, y_train)

        y_true = y_test
        y_pred = clf.predict(x_test)
        self.accuracy = sklearn.metrics.accuracy_score(
            y_true, y_pred, normalize=True, sample_weight=None)
        self.prf = sklearn.metrics.precision_recall_fscore_support(
            y_true, y_pred, beta=1.0, pos_label=1, average='weighted',
            sample_weight=None)
        self.conf_matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)

    @staticmethod
    def _encode_sources(x_data_list, y_data):
        """Return a copy of ``x_data_list`` with autoencoded feature vectors.

        One ``AETrainer`` is trained per data source; the user column of
        every sample is carried over unchanged.
        """
        encoded_sources = []
        for source in x_data_list:
            features = [sample[1] for sample in source]
            if not features:
                raise ValueError("Cannot encode an empty data source")

            ae = AETrainer(len(features[0]))
            ae.set_data(features, y_data)
            ae.train_model()
            ae.plot_losses()
            # NOTE(review): ``.data.numpy()`` suggests encode_data returns a
            # torch tensor/Variable -- confirm against AETrainer.
            encoded = ae.encode_data(features).data.numpy().tolist()

            if len(encoded) != len(source):
                raise ValueError(
                    "Encoded sample count %d does not match input sample "
                    "count %d" % (len(encoded), len(source)))

            encoded_sources.append(
                [[sample[0], code] for sample, code in zip(source, encoded)])
        return encoded_sources

    @staticmethod
    def _user_labels(x_data_list, y_data, user):
        """Return the labels of the samples that belong to ``user``.

        Ownership is read from the user column of the first data source;
        ``y_data`` is indexed in parallel with that source.
        """
        return [y_data[idx]
                for idx, sample in enumerate(x_data_list[0])
                if sample[0] == user]

    @staticmethod
    def _user_features(x_data_list, user):
        """Return ``user``'s feature rows, concatenated across all sources.

        Every row is copied so the caller's data is never modified.

        Raises:
            ValueError: if the sources disagree on how many samples the
                user has.
        """
        rows = None
        for source in x_data_list:
            chunk = [list(sample[1]) for sample in source if sample[0] == user]
            if rows is None:
                rows = chunk
                continue
            if len(chunk) != len(rows):
                raise ValueError(
                    "Data sources disagree on the number of samples for a "
                    "user: %d vs %d" % (len(rows), len(chunk)))
            for row, extra in zip(rows, chunk):
                row.extend(extra)
        return rows if rows is not None else []

    # x_data should have two columns for user and data
    def classify_personalised(self, x_data_list, y_data, iteration, users, min_days, encode=True):
        """Evaluate a per-user classifier over repeated random splits.

        Args:
            x_data_list: list of data sources; each source is a sequence of
                ``[user, feature_vector]`` samples.
            y_data: labels, indexed in parallel with the first data source.
            iteration: number of 80/20 train/test splits tried per user
                (``random_state`` runs from 0 to ``iteration - 1``).
            users: users to evaluate.
            min_days: users with fewer samples than this are skipped.
            encode: when true (default) every source is first passed through
                an autoencoder; when false the features are used as given.

        On return ``self.frequency``, ``self.accuracy``, ``self.prf`` and
        ``self.conf_matrix`` are lists with one entry per evaluated split
        (empty when no split could be evaluated). The number of users with
        enough data is printed.

        Raises:
            ValueError: if the number of feature rows and labels collected
                for a user differ, or encoding changes the sample count.
        """
        if encode:
            x_data_list = self._encode_sources(x_data_list, y_data)
            print("Ecoded X data")

        self.users = users

        k_frequency = []
        k_accuracy = []
        k_prf = []
        k_conf_matrix = []

        counter = 0
        for current_user in self.users:
            u_y_data = self._user_labels(x_data_list, y_data, current_user)
            if len(u_y_data) < min_days:
                continue

            u_x_data = self._user_features(x_data_list, current_user)
            if len(u_x_data) != len(u_y_data):
                raise ValueError(
                    "Unequal x & y lengths %d %d"
                    % (len(u_x_data), len(u_y_data)))

            counter += 1

            for itr in range(iteration):
                (all_train_data, all_test_data,
                 all_train_labels, all_test_labels) = cross_validation.train_test_split(
                    u_x_data, u_y_data, test_size=0.2, random_state=itr)

                # Skip splits without enough label variety. Only the NUMBER
                # of distinct labels is compared, not the label sets.
                n_train_labels = len(set(all_train_labels))
                n_test_labels = len(set(all_test_labels))
                if n_train_labels != n_test_labels or n_train_labels == 1:
                    continue

                self.classify(all_train_data, all_train_labels,
                              all_test_data, all_test_labels)
                k_frequency.append(self.frequency)
                k_accuracy.append(self.accuracy)
                k_prf.append(self.prf)
                k_conf_matrix.append(self.conf_matrix)

        # Publish the collected results once, so the attributes are lists
        # even when no split was evaluated.
        self.frequency = k_frequency
        self.accuracy = k_accuracy
        self.prf = k_prf
        self.conf_matrix = k_conf_matrix
        print(counter)


# In [10]:
class ClassifierPersonalisedDivergenceHelper:
    """Convenience wrapper that reshapes raw rows for ClassifierPersonalised."""

    @staticmethod
    def compute_svm_accuracy(xy_data_list, iteration=3, users=None, min_days=30):
        """Run the personalised classification on combined x/y rows.

        NOTE: despite the name, the model trained by
        ``ClassifierPersonalised.classify`` is a random forest, not an SVM.

        Args:
            xy_data_list: list of data sources. Every row of a source is
                indexable; index 0 is used as the user, index 4 as the label
                and index 5 as the feature vector. Labels are taken from the
                first source only.
            iteration: number of train/test splits tried per user.
            users: users to evaluate; ``None`` (default) means no users.
            min_days: minimum number of samples a user needs to be evaluated.

        Returns:
            The ``ClassifierPersonalised`` instance holding the results.
        """
        # A fresh list per call: the value is stored on the returned model,
        # so a shared mutable default would be aliased across all models.
        if users is None:
            users = []

        # setting up the data
        y_data = [row[4] for row in xy_data_list[0]]
        x_data = [[[row[0], row[5]] for row in xy_data]
                  for xy_data in xy_data_list]

        # classify
        model = ClassifierPersonalised()
        model.classify_personalised(x_data, y_data, iteration, users, min_days)

        return model