## Retrieve data from disk

In [1]:
import pandas as pd

# Folder holding the input CSV files. The trailing slash matters because
# file names are appended to it by plain string concatenation.
data_dir = "/home/aa043/sea/gpu_data/data/vul/"
# Vectorisation mode: one of "binary", "count", "tfidf", and "freq"
vector_type = 'freq'

# Load the raw code comments and the raw SO answers
comments_csv = data_dir + "comments.csv"
answers_csv = data_dir + "answers.csv"
comment_df_raw = pd.read_csv(comments_csv)
answer_df_raw = pd.read_csv(answers_csv)

### Show sample comments and SO answers

In [2]:
# Preview the first rows of the raw comments frame
comment_df_raw.head()

Unnamed: 0,cited_RFCID,comment_id,commentText
0,RFC1991,1165,// http://www.ietf.org/rfc/rfc1991.txt
1,RFC1991,1166,"// https://www.ietf.org/rfc/rfc2440.txt, http:..."
2,RFC1738,45,"r""""""\n Returns a tuple (username,password) ..."
3,RFC1738,85,"// <>"" are never valid in a uri see http://ww..."
4,RFC1738,109,"r""""""\n Returns a tuple (username,password) ..."


In [3]:
# Preview the first rows of the raw SO answers frame
answer_df_raw.head()

Unnamed: 0,cited_RFCID,AnswerId,Body
0,RFC1991,43464438,<h1>The Analysys of the Tests Results</h1>\n\n...
1,RFC1991,90959,"<p>I use <a href=""http://www.gnupg.org/related..."
2,RFC1991,183332,"<p><a href=""http://www.gnupg.org/related_softw..."
3,RFC1991,557046,"<p>I found the <a href=""http://www.bouncycastl..."
4,RFC1991,1307427,"<p>PKCS#8 is not an encrypted-file format, it'..."


## Remove duplicated comments and SO answers

In [4]:
# Drop rows whose ID repeats (pandas keeps the first occurrence by default)
# and report each frame's shape before and after the removal.
print('Comments shape before duplicates removel:', comment_df_raw.shape)
comment_df = comment_df_raw.drop_duplicates(subset='comment_id')
print('Comments shape after duplicates removel:', comment_df.shape)
print('Answers shape before duplicates removel:', answer_df_raw.shape)
answer_df = answer_df_raw.drop_duplicates(subset='AnswerId')
# This line reports the de-duplicated frame, so it is labelled "after"
print('Answers shape after duplicates removel:', answer_df.shape)

Comments shape before duplicates removel: (1879, 3)
Comments shape after duplicates removel: (1577, 3)
Answers shape before duplicates removel: (132656, 3)
Answers shape before duplicates removel: (23992, 3)


## Make comment/answer dictionaries (ID and text). Convert texts to word-sequences

#### Processing

In [5]:
# Index each de-duplicated frame by its document ID
comment_by_id = comment_df.set_index("comment_id")
answer_by_id = answer_df.set_index("AnswerId")

# ID -> list of every remaining column value (kept for backward compatibility)
comment_dict_raw = comment_by_id.T.to_dict("list")
answer_dict_raw = answer_by_id.T.to_dict("list")

# ID -> text only. The text column is selected by name rather than by its
# position in the row, so a change in the CSV column order (e.g. an extra
# index column) raises a KeyError instead of silently picking another column.
comment_dict = comment_by_id["commentText"].to_dict()
answer_dict = answer_by_id["Body"].to_dict()

#### Printing

In [6]:
# IDs of the sample documents shown below
sample_comment_id = 1166
sample_answer_id = 183332
divider = '=========='

# Show one raw comment, a divider line, then one raw SO answer
print('Sample comment as a sting and as a word sequence:-')
print(comment_dict[sample_comment_id])
print(divider)
print('Sample SO answer a sting and as a word sequence:-')
print(answer_dict[sample_answer_id])

Sample comment as a sting and as a word sequence:-
// https://www.ietf.org/rfc/rfc2440.txt, http://www.ietf.org/rfc/rfc1991.txt,
// and parse() from g10/parse-packet.c. This block contains code from GnuPG
// which is copyrighted by FSF, Werner Koch, and g10 Code GmbH.
Sample SO answer a sting and as a word sequence:-
<p><a href="http://www.gnupg.org/related_software/gpgme/index.en.html" rel="nofollow noreferrer">GPGme</a>. Simple to use and compatible with the <a href="http://www.ietf.org/rfc/rfc4880.txt" rel="nofollow noreferrer">OpenPGP format</a></p>



## Text Tokenisation

#### Processing

In [7]:
from keras.preprocessing.text import Tokenizer

# Collect the texts: comments first, then answers, then both combined
comments = [*comment_dict.values()]
answers = [*answer_dict.values()]
documents = comments + answers

# Characters the tokenizers strip out before splitting into words
filter_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''

# One tokenizer per corpus: comments only, answers only, and everything
com_t, ans_t, doc_t = (Tokenizer(filters=filter_chars) for _ in range(3))

# Fit each tokenizer on its own corpus (comments and SO answers)
for tokenizer, corpus in ((doc_t, documents), (com_t, comments), (ans_t, answers)):
    tokenizer.fit_on_texts(corpus)

# Attributes available on a fitted tokenizer:
#   doc_t.word_counts     - words and their counts
#   doc_t.word_index      - words and their uniquely assigned integers
#   doc_t.word_docs       - words and how many documents each appeared in
#   doc_t.document_count  - total number of documents used to fit the tokenizer

Using TensorFlow backend.


#### Printing

In [8]:
# Iterating the word_index dictionary yields its keys, i.e. the vocabulary words
print('Comment Vocab ordered by popularity:-')
print(list(com_t.word_index))

Comment Vocab ordered by popularity:-


In [9]:
# Iterating the word_index dictionary yields its keys, i.e. the vocabulary words
print('SO Answer Vocab ordered by popularity:-')
print(list(ans_t.word_index))

SO Answer Vocab ordered by popularity:-


In [10]:
# Summary of corpus sizes, vocabulary sizes and longest texts
print('Stats:-')

# Number of documents each tokenizer was fitted on
print('Number of comments:', com_t.document_count)
print('Number of SO Answers:', ans_t.document_count)
print('Together:', doc_t.document_count)

# Number of distinct words seen by each tokenizer
print('Comment Vocabulary Size:', len(com_t.word_index))
print('SO Answers Vocabulary Size:', len(ans_t.word_index))
print('Total Vocabulary Size:', len(doc_t.word_index))

# Lengths are measured in characters of the raw text, not in tokens
print('Longest comment:', max(map(len, comments)), 'characters')
print('Longest SO Answer:', max(map(len, answers)), 'characters')
print('Longest Comment/SO-Answer:', max(map(len, documents)), 'characters')

Stats:-
Number of comments: 1577
Number of SO Answers: 23992
Together: 25569
Comment Vocabulary Size: 10603
SO Answers Vocabulary Size: 87100
Total Vocabulary Size: 89814
Longest comment: 84567 characters
Longest SO Answer: 35088 characters
Longest Comment/SO-Answer: 84567 characters


## Vectorisation

#### Create vectors

In [11]:
# To ease memory pressure, drop any matrix left over from an earlier run of
# this cell before a new one is built.
try:
    del doc_vectors
except NameError:
    pass

# vector_type is one of "binary", "count", "tfidf", and "freq"
doc_vectors = doc_t.texts_to_matrix(documents, mode=vector_type)

n_vectors, vector_size = doc_vectors.shape[0], doc_vectors.shape[1]
print(n_vectors, "vectors of size", vector_size, "created")

25569 vectors of size 89815 created


#### Pair documents (comments and SO answers) with their vectors

In [12]:
# Pair every document with the matrix row at the same position
doc_vec_pairs = list(zip(documents, doc_vectors))

# documents lists the comments first, so the split point is the comment count
n_comments = len(comments)
com_vec_pairs = doc_vec_pairs[:n_comments]
ans_vec_pairs = doc_vec_pairs[n_comments:]

#### Link vectors to document IDs

In [13]:
import sys

def _link_ids_to_vectors(id_to_text, text_vec_pairs):
    """Map each document ID to the vector paired with its text.

    `comments` and `answers` were built from `comment_dict.values()` and
    `answer_dict.values()`, and the (text, vector) pairs were created in that
    same order. The i-th dictionary entry therefore corresponds to the i-th
    pair, and one linear pass replaces the nested search with `list.pop`.

    Args:
        id_to_text: dictionary of document ID -> document text.
        text_vec_pairs: sequence of (text, vector) tuples in the same order
            as the entries of `id_to_text`.

    Returns:
        Dictionary of document ID -> vector. An entry is left out when the
        paired text does not equal the dictionary text, so a misaligned
        input shows up as a size mismatch in the check below.
    """
    linked = {}
    for (doc_id, text), (paired_text, vector) in zip(id_to_text.items(), text_vec_pairs):
        if paired_text == text:
            linked[doc_id] = vector
    return linked


com_vec_dict = _link_ids_to_vectors(comment_dict, com_vec_pairs)
ans_vec_dict = _link_ids_to_vectors(answer_dict, ans_vec_pairs)

# Print success or failure: every document must have received a vector
if len(com_vec_dict) == len(comments) and len(ans_vec_dict) == len(answers):
    print(len(com_vec_dict), 'com_vec dictoinary entries created')
    print(len(ans_vec_dict), 'ans_vec dictoinary entries created')
else:
    sys.exit('Something went wrong')

1577 com_vec dictoinary entries created
23992 ans_vec dictoinary entries created


## Save vectors to disk

In [14]:
import pickle

# Write each ID -> vector dictionary to its own pickle file in data_dir.
# The file name carries the vector type, e.g. comment_<vector_type>_vectors.pkl
for prefix, vec_dict, done_message in (
    ('comment_', com_vec_dict, 'Comment vectors saved to disk'),
    ('answer_', ans_vec_dict, 'SO answer vectors saved to disk'),
):
    with open(data_dir + prefix + vector_type + '_vectors.pkl', 'wb') as f:
        pickle.dump(vec_dict, f)
    print(done_message)

Comment vectors saved to disk
SO answer vectors saved to disk
