In [12]:
run FunctionsLoader.ipynb

0.2.0_4
0.2.0_4


In [8]:
# time_window should be in mins (e.g; 30mins)
# sd_ratio should be between 0 and 1 (e.g; 0.25 for using 1/4 of sd)

def computeCorrelations(time_window, sd_ratio, input_representation):

    min_days = 30 
    
    ### read mobility data ###
    ld = LocationData()
    ld.readLocationData(precision_value=4) # read file
    ld.filterUser(min_days) #filter users with min 30 days
    #print len(ld.users)
    ld.convertToEqualTimeSeries(time_window * 60 * 1000) # convert to time series of 30mins
    dt = ld.createUserDailyTrajectoryLists() # create daily trajectories of users 
    
    ### read mood data ###
    pd = PhqData()
    pd.readPhqData()
    pd.computeScores(ld.users, min_days)
    pd.computeDivergence()
    pd.convertTo2Labels(sd_ratio=sd_ratio)
    #len(list(set(m[0] for m in pd.phq_scores)))

    ### merge data ###
    mlm = PhqLocMerge()
    mlm.mergeData(dt, pd.phq_scores)
    
    ###  transform mobility data ### 
    users = list(set([m[0] for m in mlm.data]))
    if input_representation == 1:
        dt = approachAvg1(lm_data=mlm.data, users=users) # Rescaling lat and lons
    elif input_representation == 2:
        dt = approachAvg2(lm_data=mlm.data, users=users) # Computing displacement changes
    elif input_representation == 3:
        dt = approachAvg3(lm_data=mlm.data, users=users) # Computing displacement [normalised]
    elif input_representation == 4:
        dt = approachAvg4(lm_data=mlm.data, users=users, topn=20) # Computing displacement [normalised]

    ### filter users ### 
    filtered_users = []
    for u in users:
        ud = [d for d in dt if d[0]==u]
        if len(ud) > min_days:
            filtered_users.append(u)
    print 'Number of users', len(filtered_users)

    ### setting up the data ### 
    y_data1 = [d[2] for d in dt] # 2: score & 4: divergence 
    y_data2 = [d[4] for d in dt] # 2: score & 4: divergence 
    x_data = [[d[0], d[5]] for d in dt] #get user-id & mobility data 
    
    
    ### train autoencoder & encode data ### 
    mobility_data = [] # get only mobility data 
    for x in x_data:
        mobility_data.append(x[1]) 
    

    # train autoencoder
    input_layer_size = len(mobility_data[0])
    ae = AETrainer(input_layer_size)
    ae.set_data(mobility_data, y_data2)
    ae.train_model()
    
    # encode data
    encoded_data = ae.encode_data(mobility_data)
    mobility_data = encoded_data.data.numpy().tolist()

    # check if encoded data has the same length as x_data (un-encoded)
    if len(mobility_data) != len(x_data):
        print "Check if both are equal", len(mobility_data), len(x_data)
        stop
        
    # add user id to encoded data
    temp_data = []
    for i in range(len(mobility_data)):
        item = [x_data[i][0], mobility_data[i]]
        temp_data.append(item)
    x_data = temp_data
    print "Ecoded X data"
    #print x_data[0]

    
    # compute correlation for each user
    print "Computing correlations for:", len(filtered_users), "users"
    result = list()
    for i in range(len(filtered_users)):
        current_user = filtered_users[i]

        # get user's y-data
        u_y_data = []
        for j in range(len(x_data)):
            if x_data[j][0] == current_user:
                u_y_data.append(y_data1[j])

        # get user's x-data
        u_x_data = [d[1] for d in x_data if d[0] == current_user]

        if len(u_y_data) < min_days or len(u_x_data) < min_days: 
            print 'Not enought data for the user' 
            print current_user, len(u_y_data), len(u_x_data)
            stop
            
        if len(u_x_data) != len(u_y_data):
            print "Unequal x & y lengths", len(u_x_data), len(u_y_data)
            stop

        # get cor value for each feature
        for j in range(len(u_x_data[0])):
            x_values = [x[j] for x in u_x_data]
            cor = stats.pearsonr(x=x_values, y=u_y_data)
        
            r = [current_user, cor[0], cor[1], j]
            result.append(r)
    
    # return result
    return(result)

In [9]:
def write_results(results, file_path):
    
    f  = open(file_path, 'w')
    
    # iterate over each line
    for r in results:
        # convert line to string
        s = ''
        for j in range(len(r)):
            s += str(r[j]) 
            if j < (len(r)-1):
                s += ","
            else:
                s += '\n'
        # write line to file
        f.write(s)
    # close file
    f.close() 


In [10]:
def computeAllCorrelations():
    for time_window in [10, 30, 60]:
        for sd_ratio in [0.25, 0.5, 1]:
            for input_representation in [2,3,4]:
                r = computeCorrelations(time_window=30, sd_ratio=0.25, input_representation=2)
                file_path = 'results/cor_time_window' + str(time_window) + '_sd_ratio_' + str(int(1/sd_ratio)) + '_input_' + str(input_representation) + '.csv'
                write_results(r, file_path)
                print file_path


In [None]:
computeAllCorrelations()

Initial user count 6705
User count 5616
Min time ratio 0.500023134225
Number of users 44
Pos count: 900
Neg count: 1489
Number of users 24
Ecoded X data
Computing correlations for: 24 users
results/cor_time_window10_sd_ratio_4_input_2.csv
Initial user count 6705
User count 5616
Min time ratio 0.500023134225
Number of users 44
Pos count: 900
Neg count: 1489
Number of users 24
Ecoded X data
Computing correlations for: 24 users
results/cor_time_window10_sd_ratio_4_input_3.csv
Initial user count 6705
User count 5616
Min time ratio 0.500023134225
Number of users 44
Pos count: 900
Neg count: 1489
Number of users 24
Ecoded X data
Computing correlations for: 24 users
results/cor_time_window10_sd_ratio_4_input_4.csv
Initial user count 6705
User count 5616
Min time ratio 0.500023134225
Number of users 44
Pos count: 900
Neg count: 1489
Number of users 24
Ecoded X data
Computing correlations for: 24 users
results/cor_time_window10_sd_ratio_2_input_2.csv
Initial user count 6705
User count 5616
Min 