In [1]:
import pandas as pd
debian_mailing_list = pd.read_csv("./debian-user-filtered.dat", header = None)

processed_dml = []
for items in debian_mailing_list[0]:
    link = [int(s) for s in items.split(" ")]
    processed_dml.append(link)
    
threads_dict = {}
for link in processed_dml:
    thread_id = link[4]
    if thread_id not in threads_dict:
        threads_dict.update({thread_id : [link[0:5]]})
    else:
        threads_dict[thread_id].append(link[0:5])

In [127]:
test_data = []
i = 1
for key in threads_dict:
    if i > 200:
        break
    i += 1
    test_data.append(threads_dict[key])

In [3]:
def time_stamp(link):
    return link[0]

def return_all_statistic(partition):
    num_of_links = []
    num_of_nodes = []
    duration = []
    variance = []
    
    for thread in partition:
        thread = sorted(thread, key = time_stamp)
        duration.append((thread[-1][0] - thread[0][0])/day)
        
        num_of_links.append(len(thread))
        
        thread_node = []
        for link in thread:
            for node in link[1:3]:
                if node not in thread_node:
                    thread_node.append(node)
        num_of_nodes.append(len(thread_node))                                                                                                                          
        
        if len(thread) < 2:
            variance.append(0)
            continue
        
        intercontact_time = []
        for i in range(len(thread)-1):
            intercontact_time.append((thread[i+1][0] - thread[i][0])/day)
            
        expectation = (thread[-1][0] - thread[0][0]) / (day * len(intercontact_time)) 
        
        var = 0
        for t in intercontact_time:
            var += (t - expectation)**2
        var = var / len(intercontact_time)
        variance.append(var)
        
    return num_of_links, num_of_nodes, duration, variance

In [39]:
import numpy as np
import math
day = 24*60*60
def thread_score(num_of_links, num_of_nodes, duration, variance, alpha, beta, gamma):
    return (num_of_links ** gamma) / ( num_of_nodes  + alpha * variance + beta * duration)

thread_score = np.vectorize(thread_score)

def partition_value(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma):
    total_links = np.sum(list_num_links)
    return np.sum(thread_score(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma)) /total_links

def partial_alpha(num_of_links, num_of_nodes, duration, variance, alpha, beta, gamma):
    partial_alpha = - variance * (num_of_links ** gamma) / ((num_of_nodes + alpha * variance + beta * duration) ** 2)
    return partial_alpha
partial_alpha = np.vectorize(partial_alpha)

def partial_beta(num_of_links, num_of_nodes, duration, variance, alpha, beta, gamma):
    partial_beta = - duration * (num_of_links ** gamma) / ((num_of_nodes + alpha * variance + beta * duration) ** 2)
    return partial_beta
partial_beta = np.vectorize(partial_beta)

def partial_gamma(num_of_links, num_of_nodes, duration, variance, alpha, beta, gamma):
    partial_gamma = math.log(gamma) * (num_of_links ** gamma) / (num_of_nodes + alpha * variance + beta * duration)
    return partial_gamma
partial_gamma = np.vectorize(partial_gamma)

def grad_partition_value(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma):
    total_links = np.sum(list_num_links)
    p_alpha = partial_alpha(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma)
    p_beta =  partial_beta(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma)
    p_gamma = partial_gamma(list_num_links, list_num_node, list_duration, list_variance, alpha, beta, gamma)
    return np.sum(p_alpha)/total_links, np.sum(p_beta)/total_links, np.sum(p_gamma)/total_links

In [110]:
def finding_nemo(ground_truth_partition, alpha = 1, beta = 1, gamma = 1.5, learning_rate = 0.05, back_track = 0.5):
    big_thread = []
    for thread in ground_truth_partition:
        for link in thread:
            big_thread.append(link)
    whole_partition = [big_thread]
    
    whole_stat = return_all_statistic(whole_partition)
    gt_stat = return_all_statistic(ground_truth_partition)
    
    # first try to optimize f(E_g) to a value greater than 0.5
    a = alpha
    b = beta
    g = gamma
    lr = learning_rate
    value = partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], alpha, beta, gamma)
    while value < 0.55:
        grad = grad_partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], a, b, g)
        a_tmp = a + lr * grad[0]
        b_tmp = b + lr * grad[1]
        g_tmp = g + lr * grad[2]
        
        i = 0
        while not (a_tmp >= 0 and b_tmp >= 0 and g_tmp >=1):
            lr = learning_rate * back_track
            a_tmp = a + lr * grad[0]
            b_tmp = b + lr * grad[1]
            g_tmp = g + lr * grad[2]
            i += 1
            if i > 100:
                print('feu vkl')
                return
        a = a_tmp
        b = b_tmp
        g = g_tmp
        value = partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], a, b, g)
        
    whole_value = partition_value(whole_stat[0], whole_stat[1], whole_stat[2], whole_stat[3], a, b, g)
    prev_value = -100
    current_value = value - whole_value
    i = 0
    while not (current_value > 0 and current_value - prev_value < 0.1):
        i += 1
        whole_grad = grad_partition_value(whole_stat[0], whole_stat[1], whole_stat[2], whole_stat[3], a, b, g)
        grad = grad_partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], a, b, g)
        some_grad = np.array(grad) - np.array(whole_grad)
        a_tmp = a + lr * some_grad[0]
        b_tmp = b + lr * some_grad[1]
        g_tmp = g + lr * some_grad[2]
        
        j = 0
        while not (a_tmp >= 0 and b_tmp >= 0 and g_tmp >=1 and partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], a_tmp, b_tmp, g_tmp) > 0.5):
            lr = learning_rate * back_track
            a_tmp = a + lr * some_grad[0]
            b_tmp = b + lr * some_grad[1]
            g_tmp = g + lr * some_grad[2]
            j += 1
            if j > 100:
                print('fail vkl')
                return
        a = a_tmp
        b = b_tmp
        g = g_tmp
        
        
        prev_value = current_value
        whole_value = partition_value(whole_stat[0], whole_stat[1], whole_stat[2], whole_stat[3], a, b, g)
        partition_value(gt_stat[0], gt_stat[1], gt_stat[2], gt_stat[3], a, b, g)
        current_value = value - whole_value
        
        if i > 1000:
            break
    print(i)
    print(a, b, g)

In [128]:
import time
start_time = time.time()
finding_nemo(test_data)
print(time.time() - start_time)

1
0.9912384945556407 0.9331450478739075 1.7871100014890133
0.026973724365234375


In [19]:
stat = return_all_statistic(test_data)

In [43]:
import time
start_time = time.time()
value = partition_value(stat[0], stat[1], stat[2], stat[3], 0.097, 0.086, 1.731)
grad = grad_partition_value(stat[0], stat[1], stat[2], stat[3], 0.1, 0.1, 1.7)
print(value)
print(grad)
print(time.time() - start_time)

0.6221314201405056
(-0.035451234077685134, -0.14020989994902108, 0.31722971661065313)
0.0019974708557128906


In [23]:
np.sum(grad) / 123

-0.14898868543687402

In [90]:
stat[1][2]

5

In [39]:
from numpy import linalg as LA
w, v = LA.eig(np.array(
    [[2, -1, -1], 
     [-1, 1, 0],
     [-1, 0, 1]]))
w

array([ 3.00000000e+00, -2.22044605e-16,  1.00000000e+00])

In [105]:
w, v = LA.eig(np.array(
    [[1, -1/2, -1/2, -1/2, -1/2], 
     [-1/2, 1, 0, 0, 0],
     [-1/2, 0, 1, 0, 0],
     [-1/2, 0, 0, 1, 0],
     [-1/2, 0, 0, 0, 1]]))
w

array([2., 0., 1., 1., 1.])

In [106]:
w, v = LA.eig(np.array(
    [[1, -1/2, -1/2], 
     [-1/2, 1, -1/2],
     [-1/2, -1/2, 1]]))
w

array([ 1.50000000e+00, -5.55111512e-17,  1.50000000e+00])

In [118]:
w, v = LA.eig(np.array(
    [[1, -1/1.4142, -1/1.4142], 
     [-1/1.4142, 1, 0],
     [-1/1.4142, 0, 1]]))
w

array([ 2.00000959e+00, -9.59013795e-06,  1.00000000e+00])

In [117]:
1.4142**2

1.9999616399999998

In [None]:
# import random
# def generate_partition(ground_truth_partition): #create a path from ground truth partition to root
#     current_length = len(ground_truth_partition)
#     prev_partition = ground_truth_partition
#     result = []
#     for i in range(len(ground_truth_partition)-1):
#         new_thread = prev_partition[0]
#         for link in prev_partition[1]:
#             new_thread.append(link)
            
#         new_partition = []
#         new_partition.append(new_thread)
#         for thread in prev_partition[2:]:
#             new_partition.append(thread)
        
        
#         prev_partition = new_partition
#         current_length -= 1
        
#         result.append(new_partition)
#     return result