## Specify data location

'vector_type' options:
1. "tfidf"
2. "freq"
3. "count"
4. "binary"

In [1]:
import os

# Directory that holds the pickled vector files.
# The path can be overridden without editing the notebook by setting the
# SEA_VUL_DATA_DIR environment variable; otherwise the original location is used.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file names by plain string concatenation (data_dir + 'comment_' + ...).
data_dir = os.path.join(
    os.environ.get("SEA_VUL_DATA_DIR", "/home/aa043/sea/gpu_data/data/vul/"), ""
)

# Which vectorization to load; must be one of VECTOR_TYPE_OPTIONS.
vector_type = "tfidf"

# Fail early with a clear message instead of a FileNotFoundError at load time.
VECTOR_TYPE_OPTIONS = ("tfidf", "freq", "count", "binary")
if vector_type not in VECTOR_TYPE_OPTIONS:
    raise ValueError(
        "vector_type must be one of " + str(VECTOR_TYPE_OPTIONS) + ", got " + repr(vector_type)
    )

## Load data
#### It may take a long time to load the comment vectors and (especially) the SO answer vectors
It took about 3 minutes on my PC

In [2]:
import pickle
import datetime

def _load_vectors(kind):
    """Unpickle the vector file for ``kind`` and print how long loading took.

    The file is read from ``data_dir + kind + '_' + vector_type + '_vectors.pkl'``,
    so ``data_dir`` must end with a path separator.

    Parameters
    ----------
    kind : str
        File-name prefix and label used in the progress messages
        (``'comment'`` or ``'answer'`` in this notebook).

    Returns
    -------
    object
        Whatever object was pickled in the file; the rest of the notebook
        treats it as a dictionary keyed by ID.
    """
    start_time = datetime.datetime.now().replace(microsecond=0)
    print('Loading ' + kind + ' vectors...')
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only load files that come from a trusted source.
    with open(data_dir + kind + '_' + vector_type + '_vectors.pkl', 'rb') as f:
        vectors = pickle.load(f)
    end_time = datetime.datetime.now().replace(microsecond=0)
    print("It took (h:m:s)", end_time - start_time, "to load " + kind + " vectors")
    return vectors


com_vec_dict = _load_vectors('comment')
ans_vec_dict = _load_vectors('answer')

Loading comment vectors...
It took (h:m:s) 0:00:10 to load comment vectors
Loading answer vectors...
It took (h:m:s) 0:02:37 to load answer vectors


In [26]:
# Number of comments, number of answers, and their total.
n_comments, n_answers = len(com_vec_dict), len(ans_vec_dict)
print(n_comments)
print(n_answers)
print(n_comments + n_answers)

# First five IDs of each dictionary (iterating a dict yields its keys).
for vec_dict in (com_vec_dict, ans_vec_dict):
    print(list(vec_dict)[:5])

# Length of one sample vector from each dictionary, then the answer vector itself.
sample_com_vec = com_vec_dict[109]
sample_ans_vec = ans_vec_dict[1307427]
print(len(sample_com_vec))
print(len(sample_ans_vec))
print(sample_ans_vec)

1577
23992
25569
[1165, 1166, 45, 85, 109]
[43464438, 90959, 183332, 557046, 1307427]
89815
89815
[0.         1.75342213 2.36999289 ... 0.         0.         0.        ]


## Similarity Function

#### Accepts a comment vector and a dictionary of answer IDs and vectors
#### Returns a dictionary that maps each rank to the list of answer IDs sharing that rank
(The smaller the rank, the more similar the answer; rank 1 is the most similar)

In [5]:
from scipy import spatial
from collections import defaultdict, OrderedDict

def rank_cosine_similarity(comment, answers):
    """Rank answers by their cosine similarity to a comment vector.

    Parameters
    ----------
    comment : array-like
        1-D vector of the comment.
    answers : dict
        Mapping ``{answer_id: vector}``; every vector must have the same
        length as ``comment``.

    Returns
    -------
    dict
        ``{rank: [answer_id, ...]}`` where rank 1 holds the most similar
        answers. Answers whose similarity values are exactly equal share a
        rank; IDs inside a rank keep the iteration order of ``answers``.
        An empty ``answers`` yields an empty dict.

    Notes
    -----
    Cosine similarity is undefined when either vector has zero magnitude
    (division by a zero norm). Such pairs are given a similarity of 0.0 so
    that no NaN ends up as a dictionary key: NaN never compares equal to
    itself and makes the sort order meaningless.
    """
    # Local import: the cell that defines this function does not import numpy.
    import numpy as np

    comment_is_zero = not np.any(comment)

    ids_by_similarity = defaultdict(list)
    for ans_id, ans_vec in answers.items():
        if comment_is_zero or not np.any(ans_vec):
            similarity = 0.0
        else:
            similarity = 1 - spatial.distance.cosine(comment, ans_vec)
        ids_by_similarity[similarity].append(ans_id)

    # Highest similarity first; ranks start at 1.
    ordered_similarities = sorted(ids_by_similarity, reverse=True)
    return {
        rank: ids_by_similarity[similarity]
        for rank, similarity in enumerate(ordered_similarities, start=1)
    }

### Usage example 1

Uncomment to test

In [6]:
import numpy as np

# Toy comment vectors keyed by comment ID (only 606 is ranked below).
example_com_vec_dict = {
    606: np.array([2., 2., 2.]),
    607: np.array([-2., -2., -2.]),
}
# Toy answer vectors keyed by answer ID.
example_ans_vec_dict = {
    7001: np.array([1., 1., 1.]),
    9012: np.array([-4., -3., -1.]),
    4103: np.array([4., 3., 1.]),
    5054: np.array([3., 3., 3.]),
}

# Rank every toy answer against comment 606, then show inputs and result.
rankings = rank_cosine_similarity(example_com_vec_dict[606], example_ans_vec_dict)
for shown in (example_com_vec_dict, example_ans_vec_dict, rankings):
    print(shown)

{606: array([2., 2., 2.]), 607: array([-2., -2., -2.])}
{7001: array([1., 1., 1.]), 9012: array([-4., -3., -1.]), 4103: array([4., 3., 1.]), 5054: array([3., 3., 3.])}
{1: [7001, 5054], 2: [4103], 3: [9012]}


### Usage example 2

Uncomment to test

In [5]:
# print(list(com_vec_dict.keys())[:3])
# print(list(ans_vec_dict.keys())[:10])

[1165, 1166, 45]
[43464438, 90959, 183332, 557046, 1307427, 2341514, 2498271, 2654368, 3555009, 3560415]


In [13]:
test_com_id = 45
related_ans_ids = [43464438, 90959, 183332, 557046, 1307427]

# Keep only the vectors of the answers we want to rank.
test_ans_vec_dict = {}
for ans_id in related_ans_ids:
    test_ans_vec_dict[ans_id] = ans_vec_dict[ans_id]

# Rank the selected answers against the chosen comment.
test_com_vec = com_vec_dict[test_com_id]
rankings = rank_cosine_similarity(test_com_vec, test_ans_vec_dict)

print('vector for comment'+str(test_com_id)+':', test_com_vec)
# print(test_ans_vec_dict)
print('Rankings: {rank: answer_ids}')
print(rankings)

vector for comment45: [0.         1.24410541 1.21503207 ... 0.         0.         0.        ]
Rankings: {rank: answer_ids}
{1: [557046], 2: [90959], 3: [43464438], 4: [1307427], 5: [183332]}
