# Check the correctness of the created comment/answer vectors

## Retrieve data from disk

In [1]:
import pandas as pd
import pickle
import datetime


# Dataset location on disk and the vector flavour whose pickles are checked.
data_dir = "/home/aa043/sea/gpu_data/data/vul/"
vector_type = "tfidf"

# Pickle files holding the per-comment and per-answer vectors.
comment_vectors_path = data_dir + 'comment_' + vector_type + '_vectors.pkl'
answer_vectors_path = data_dir + 'answer_' + vector_type + '_vectors.pkl'

# Raw comment and Stack Overflow answer tables.
print('Loading comments and SO answers...')
comment_df_raw = pd.read_csv(data_dir + "comments.csv")
answer_df_raw = pd.read_csv(data_dir + "answers.csv")

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# these files are assumed to be locally produced and trusted -- confirm.
print('Loading integer index...')
with open(data_dir + 'int_index.pkl', 'rb') as index_file:
    int_index = pickle.load(index_file)

# Comment vectors, timed at whole-second resolution.
print('Loading comment vectors...')
start_time = datetime.datetime.now().replace(microsecond=0)
with open(comment_vectors_path, 'rb') as comment_file:
    com_vec_dict = pickle.load(comment_file)
end_time = datetime.datetime.now().replace(microsecond=0)
print("It took (h:m:s)", end_time - start_time, "to load comment vectors")

# Answer vectors; the timer is started before the progress message is printed.
start_time = datetime.datetime.now().replace(microsecond=0)
print('Loading answer vectors...')
with open(answer_vectors_path, 'rb') as answer_file:
    ans_vec_dict = pickle.load(answer_file)
end_time = datetime.datetime.now().replace(microsecond=0)
print("It took (h:m:s)", end_time - start_time, "to load answer vectors")

Loading comments and SO answers...
Loading integer index...
Loading comment vectors...
It took (h:m:s) 0:00:10 to load comment vectors
Loading answer vectors...
It took (h:m:s) 0:02:36 to load answer vectors


## Remove duplicated comments and SO answers

In [2]:
# Flag every row whose ID has already appeared earlier in the table,
# then keep only the unflagged rows (the first occurrence of each ID).
is_repeated_comment = comment_df_raw.duplicated(subset='comment_id', keep='first')
is_repeated_answer = answer_df_raw.duplicated(subset='AnswerId', keep='first')
comment_df = comment_df_raw[~is_repeated_comment]
answer_df = answer_df_raw[~is_repeated_answer]

## Make comment/answer dictionaries (ID and text)

#### Processing

In [3]:
# Turn each dataframe into {ID: [remaining column values in column order]}.
comment_dict_raw = (
    comment_df
    .set_index("comment_id")
    .transpose()
    .to_dict(orient="list")
)
answer_dict_raw = (
    answer_df
    .set_index("AnswerId")
    .transpose()
    .to_dict(orient="list")
)

# Reduce every entry to the value at position 1 of its list.
# NOTE(review): position 1 is presumably the text column -- confirm against the CSV layout.
comment_dict = {}
for comment_id, fields in comment_dict_raw.items():
    comment_dict[comment_id] = fields[1]

answer_dict = {}
for answer_id, fields in answer_dict_raw.items():
    answer_dict[answer_id] = fields[1]

## Test Comments and SO Answers

#### Test a comment

In [4]:
# Spot-check one comment: print its stored text, then every vocabulary term
# whose entry in the comment's vector is non-zero.
test_loc = 1165
print(comment_dict[test_loc])

# Position i of the vector is looked up in int_index to recover the term.
# enumerate supplies the position directly, replacing a hand-maintained
# counter; the comprehension also avoids leaving scratch names behind.
test_words = {
    int_index[i]: value
    for i, value in enumerate(com_vec_dict[test_loc])
    if value != 0
}
print(test_words)

// http://www.ietf.org/rfc/rfc1991.txt
{'http': 0.863308414882052, 'org': 0.8079326542235203, 'rfc': 1.0435801096798, 'ietf': 1.0518054398316954, 'www': 1.0858703115712187, 'txt': 1.8546960017115999, 'rfc1991': 9.050640993218508}


#### Test an SO Answer

In [5]:
# Spot-check one SO answer: print its stored text, then every vocabulary term
# whose entry in the answer's vector is non-zero.
test_loc = 183332
print(answer_dict[test_loc])

# Position i of the vector is looked up in int_index to recover the term.
# enumerate supplies the position directly, replacing a hand-maintained
# counter; the comprehension also avoids leaving scratch names behind.
test_words = {
    int_index[i]: value
    for i, value in enumerate(ans_vec_dict[test_loc])
    if value != 0
}
print(test_words)

<p><a href="http://www.gnupg.org/related_software/gpgme/index.en.html" rel="nofollow noreferrer">GPGme</a>. Simple to use and compatible with the <a href="http://www.ietf.org/rfc/rfc4880.txt" rel="nofollow noreferrer">OpenPGP format</a></p>

{'the': 0.734788699140412, 'p': 1.2150320655944524, 'a': 1.7206635530551664, 'to': 0.7785175023712089, 'and': 0.8723686080476383, 'http': 1.4617082086112219, 'href': 1.3205125390329544, 'rel': 1.3905568929824847, 'org': 1.3679488955808667, 'nofollow': 1.5256573490465497, 'html': 0.9012136792493759, 'with': 1.0945354728009946, 'noreferrer': 2.0835174983883764, 'rfc': 1.0435801096798, 'ietf': 1.0518054398316954, 'www': 1.8385382564905584, 'use': 1.2570817816472315, 'format': 2.1928216511226992, 'en': 2.0793731024367474, 'txt': 1.8546960017115999, 'simple': 3.04411280768809, 'index': 3.5842416604868452, 'related': 3.5566787336751187, 'software': 3.7608563746417674, 'compatible': 4.415099834841462, 'openpgp': 5.757849695926477, 'rfc4880': 5.87528179281