In [41]:
run FunctionsLoader.ipynb

0.2.0_4
0.2.0_4


In [42]:
from sklearn.decomposition import KernelPCA 

In [43]:
class Predictions():
    """Drives the experiment: load data, build feature sets, train and plot.

    Intended call order:
    ``loadData`` -> ``prepareInput`` -> ``computePredictions`` -> ``plotPredictions``.
    Each step stores its result on the instance for the next one to use.
    """

    def __init__(self, min_days=30):
        # Threshold forwarded to LocationData.filterUser, PhqData.computeScores
        # and the per-user classifiers; also used to filter users locally.
        self.min_days = min_days

    def loadData(self, time_window, sd_ratio, t_hist):
        """Read mobility and PHQ data, merge them and store the rows in ``self.data``.

        time_window -- length of a time-series slot in minutes (e.g. 30); it is
                       converted to milliseconds before being handed on.
        sd_ratio    -- should be between 0 and 1 (e.g. 0.25 for using 1/4 of sd);
                       forwarded to PhqData.convertTo2Labels.
        t_hist      -- forwarded unchanged to PhqLocMerge.mergeData.
        """
        ### read mobility data ###
        location_data = LocationData()
        location_data.readLocationData(precision_value=4)
        location_data.filterUser(self.min_days)
        location_data.convertToEqualTimeSeries(time_window * 60 * 1000)  # minutes -> ms
        daily_trajectories = location_data.createUserDailyTrajectoryLists()

        ### read mood data ###
        # Named phq_data (not pd) so the conventional pandas alias is not shadowed.
        phq_data = PhqData()
        phq_data.readPhqData()
        phq_data.computeScores(location_data.users, self.min_days)
        phq_data.computeDivergence()
        phq_data.convertTo2Labels(sd_ratio=sd_ratio)

        ### merge data ###
        merger = PhqLocMerge()
        merger.mergeData(daily_trajectories, phq_data.phq_scores, t_hist)

        ### keep the merged rows for the following steps ###
        self.data = merger.data

    def prepareInput(self):
        """Derive three feature representations from ``self.data`` and store
        every combination of them as ``self.data1`` ... ``self.data7``."""
        users = list(set([m[0] for m in self.data]))
        dt2 = approachAvg2(lm_data=self.data, users=users)  # Computing displacement changes
        dt3 = approachAvg3(lm_data=self.data, users=users)  # Computing displacement [normalised]
        dt4 = approachAvg4(lm_data=self.data, users=users, topn=20)  # Computing displacement [normalised]

        ### combinations of feature sets ###
        self.data1 = [dt2]
        self.data2 = [dt3]
        self.data3 = [dt4]
        self.data4 = [dt2, dt3]
        self.data5 = [dt2, dt4]
        self.data6 = [dt3, dt4]
        self.data7 = [dt2, dt3, dt4]

    def computePredictions(self, encoded_layer_size=10, classifier_type=1):
        """Train one personalised model per feature combination.

        The resulting seven models are stored, in order data1..data7, in
        ``self.models``.
        """
        ### filter users ###
        # Count rows per user (index 0 of each row) in a single pass instead of
        # rescanning the whole list once per user.
        rows_per_user = {}
        for row in self.data1[0]:
            rows_per_user[row[0]] = rows_per_user.get(row[0], 0) + 1

        users = list(set([m[0] for m in self.data]))
        # Strictly more than min_days rows are required here.
        filtered_users = [u for u in users if rows_per_user.get(u, 0) > self.min_days]
        print('Number of users %d' % len(filtered_users))

        ### computing prediction accuracy ###
        feature_combinations = [self.data1, self.data2, self.data3, self.data4,
                                self.data5, self.data6, self.data7]
        self.models = [
            PCAClassifierPersonalisedHelper.compute_svm_accuracy(
                combination,
                iteration=20,
                classifier_type=classifier_type,
                users=filtered_users,
                min_days=self.min_days,
                encoded_layer_size=encoded_layer_size)
            for combination in feature_combinations
        ]

    def plotPredictions(self, include_mt=False, t_hist=14, file_path=''):
        """Plot the models computed by ``computePredictions`` via Plots.multiModel."""
        model_names = ['IR 1', 'IR 2', 'IR 3', 'IR 1+2', 'IR 1+3', 'IR 2+3', 'IR 1+2+3']
        label_dict = {1: 'Sensitivity', 0: 'Specificity'}
        # Use this instance's models; the previous code read the notebook-global
        # variable ``p`` and therefore broke for any other instance name.
        Plots.multiModel(models=self.models, model_names=model_names, label_dict=label_dict,
                         include_mt=include_mt, t_hist=t_hist, file_path=file_path)
        
        


In [44]:
class PCAClassifierPersonalised():
    """Trains and evaluates one classifier per user on PCA-encoded features.

    After ``classify`` the attributes hold the results of that single fit.
    After ``classify_personalised`` they hold lists with one entry per
    successful (user, iteration) run:

    frequency   -- label -> relative frequency in the training split
    accuracy    -- accuracy on the test split
    prf         -- weighted precision / recall / f-score / support
    conf_matrix -- confusion matrix on the test split
    """

    def __init__(self):
        self.frequency = {}
        self.accuracy = -1
        self.prf = []
        self.conf_matrix = []

    def _build_classifier(self, classifier_type):
        """Return a fresh estimator for classifier_type (1, 2 or 3)."""
        if classifier_type == 1:
            return RandomForestClassifier(max_depth=10, random_state=0)
        if classifier_type == 2:
            return svm.SVC(probability=False, kernel="rbf")
        if classifier_type == 3:
            return GradientBoostingClassifier()
        # Previously an unknown type surfaced later as an UnboundLocalError.
        raise ValueError("Unknown classifier_type: %r (expected 1, 2 or 3)" % (classifier_type,))

    def classify(self, x_train, y_train, x_test, y_test, classifier_type):
        """Fit one classifier and store its test-set metrics on the instance.

        Raises ValueError when data and label lengths differ or when
        classifier_type is not one of 1, 2, 3.
        """
        if len(x_train) != len(y_train) or len(x_test) != len(y_test):
            raise ValueError("Length mismatch of data and labels")

        clf = self._build_classifier(classifier_type)

        ### relative frequency of each label in the training data ###
        label_counts = dict([(label, 0) for label in np.unique(y_train)])
        for label in y_train:
            label_counts[label] += 1
        n_train = float(len(y_train))
        self.frequency = dict([(label, count / n_train)
                               for label, count in label_counts.items()])

        clf.fit(x_train, y_train)

        ### model performance on the test data ###
        y_pred = clf.predict(x_test)
        self.accuracy = sklearn.metrics.accuracy_score(
            y_test, y_pred, normalize=True, sample_weight=None)
        self.prf = sklearn.metrics.precision_recall_fscore_support(
            y_test, y_pred, beta=1.0, pos_label=1, average='weighted', sample_weight=None)
        self.conf_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)

    def classify_personalised(self, x_data_list, y_data, iteration, classifier_type, users, min_days, encoded_layer_size):
        """Encode every feature set, then train/evaluate a model per user.

        x_data_list        -- list of feature sets; each is a list of
                              [user, feature_vector] rows, row-aligned with y_data.
        y_data             -- labels, one per row of the first feature set.
        iteration          -- number of random 80/20 splits tried per user.
        classifier_type    -- 1, 2 or 3 (see ``_build_classifier``).
        users              -- users to evaluate; stored as ``self.users``.
        min_days           -- users with fewer rows than this are skipped.
        encoded_layer_size -- number of components kept by the PCA encoder.

        Raises ValueError when row counts of features and labels disagree.
        """
        ### encode each feature set ###
        encoded_sets = []
        for feature_set in x_data_list:
            raw_vectors = [row[1] for row in feature_set]

            # The encoder size must come from encoded_layer_size. It used to be
            # built with the input dimensionality, so the parameter was ignored.
            encoder = FeatureExtractionPCA(encoded_layer_size)
            encoded_vectors = encoder.encode_data(raw_vectors).tolist()

            if len(encoded_vectors) != len(feature_set):
                raise ValueError("Encoded row count differs from input: %d vs %d"
                                 % (len(feature_set), len(encoded_vectors)))

            encoded_sets.append([[row[0], vector]
                                 for row, vector in zip(feature_set, encoded_vectors)])

        self.users = users

        k_frequency = []
        k_accuracy = []
        k_prf = []
        k_conf_matrix = []

        for current_user in self.users:
            # labels of this user, located through the first feature set
            reference_rows = encoded_sets[0]
            u_y_data = [y_data[j] for j in range(len(reference_rows))
                        if reference_rows[j][0] == current_user]

            if len(u_y_data) < min_days:
                continue  # not enough data for this user

            # features of this user: concatenate the vectors of all feature sets
            u_x_data = None
            for encoded_rows in encoded_sets:
                user_vectors = [row[1] for row in encoded_rows if row[0] == current_user]
                if u_x_data is None:
                    u_x_data = [list(vector) for vector in user_vectors]
                else:
                    if len(user_vectors) != len(u_x_data):
                        raise ValueError("Feature sets disagree on row count for a user: %d vs %d"
                                         % (len(u_x_data), len(user_vectors)))
                    for merged, extra in zip(u_x_data, user_vectors):
                        merged.extend(extra)

            if len(u_x_data) != len(u_y_data):
                raise ValueError("Unequal x & y lengths: %d vs %d"
                                 % (len(u_x_data), len(u_y_data)))

            for itr in range(iteration):
                # seeded by the iteration index so every run is repeatable
                train_x, test_x, train_y, test_y = cross_validation.train_test_split(
                    u_x_data, u_y_data, test_size=0.2, random_state=itr)

                # skip splits where a label is missing from one side, or where
                # the training data contains a single class only
                n_train_labels = len(set(train_y))
                if n_train_labels != len(set(test_y)) or n_train_labels == 1:
                    continue

                self.classify(train_x, train_y, test_x, test_y, classifier_type)
                k_frequency.append(self.frequency)
                k_accuracy.append(self.accuracy)
                k_prf.append(self.prf)
                k_conf_matrix.append(self.conf_matrix)

        # Publish the collected results once; attributes are left untouched
        # when no split could be evaluated (same final state as before).
        if k_accuracy:
            self.frequency = k_frequency
            self.accuracy = k_accuracy
            self.prf = k_prf
            self.conf_matrix = k_conf_matrix


In [45]:
class PCAClassifierPersonalisedHelper:
    """Adapter between merged data rows and PCAClassifierPersonalised."""

    @staticmethod
    def compute_svm_accuracy(xy_data_list, iteration=3, classifier_type=1, users=[], min_days=30, encoded_layer_size=10):
        """Split rows into labels and features, run the personalised
        classifier on them and return the classifier object.

        Each row is indexed positionally: 0 is the user, 4 the label and
        5 the feature vector. Labels are taken from the first data set only.
        """
        labels = [row[4] for row in xy_data_list[0]]
        feature_sets = [[[row[0], row[5]] for row in rows] for rows in xy_data_list]

        classifier = PCAClassifierPersonalised()
        classifier.classify_personalised(
            feature_sets, labels, iteration, classifier_type,
            users, min_days, encoded_layer_size)
        return classifier

In [46]:
class FeatureExtractionPCA:
    """Small wrapper that encodes feature vectors with an RBF KernelPCA."""

    def __init__(self, encoded_layer_size):
        # encoded_layer_size is the number of components the projection keeps
        self.pca = KernelPCA(kernel='rbf', n_components=encoded_layer_size)

    def encode_data(self, x_data):
        """Fit the PCA on x_data and return the transformed data."""
        return self.pca.fit_transform(x_data)

In [47]:
# p.computePredictions(encoded_layer_size=1, classifier_type=1)
# file_path = 'plots/pca/prediction_' + '_encoded_ls_' + str(1) + '_classifier_' + str(1) + '.pdf'
# p.plotPredictions(include_mt=True, t_hist=t_hist, file_path=file_path)      

In [48]:
# --- experiment configuration ---
all_models = []     # store models of all iterations (for debugging)
time_window = 10    # passed to loadData, which expects minutes
t_hist = 14         # passed to loadData and later to plotPredictions
dropout_rate = 0.1

# --- load the data and derive the feature sets once, before the sweep ---
p = Predictions(min_days=30)
p.loadData(time_window, 0.25, t_hist)
p.prepareInput()


Initial user count 6705
User count 5616
Min time ratio 0.500023134225
Number of users 44
Pos count: 900
Neg count: 1489


In [None]:
# Sweep every encoder size (1..20) against every classifier type and save one
# plot per combination under plots/pca/.
for encoded_layer_size in range(1, 21):
    print('layer size %d' % encoded_layer_size)
    for classifier_type in (1, 2, 3):
        print('classifier type %d' % classifier_type)
        p.computePredictions(classifier_type=classifier_type,
                             encoded_layer_size=encoded_layer_size)
        file_path = 'plots/pca/prediction__encoded_ls_%d_classifier_%d.pdf' % (
            encoded_layer_size, classifier_type)
        p.plotPredictions(file_path=file_path, t_hist=t_hist, include_mt=True)

layer size 1
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 2
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 3
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 4
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 5
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 6
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of users 24
[0, 1]
layer size 7
classifier type 1
Number of users 24
[0, 1]
classifier type 2
Number of users 24
[0, 1]
classifier type 3
Number of u

In [4]:
# mn = ['IR 1','IR 2','IR 3','IR 1+2','IR 1+3','IR 2+3','IR 1+2+3']
# ld = {1:'Sensitivity', 0:'Specificity'}
# Plots.multiModel(models=p.models, model_names=mn, label_dict=ld, include_mt=True, t_hist=7, file_path='plots/prediction_plot.png')

3