In [1]:
import pandas as pd
import numpy as np
from requests_html import HTML
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# StackSample dump: the full question table and the table holding the
# 'Id' / 'Tag' columns used below to pick questions per language.
all_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/data_stacksample/Questions.csv',
    encoding='ISO-8859-1',
)
all_tag = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/data_stacksample/Tags.csv',
    encoding='ISO-8859-1',
)

In [3]:
# corpus r:50k   py:50k   js:50k   cpp:25k   php:25k
# For each language tag: select its rows from the tag table, then join them
# to the question table on the shared 'Id' column.

cpp_tag = all_tag.loc[all_tag['Tag'] == 'c++']
cpp_question = cpp_tag.merge(all_question, on='Id')

js_tag = all_tag.loc[all_tag['Tag'] == 'javascript']
js_question = js_tag.merge(all_question, on='Id')

php_tag = all_tag.loc[all_tag['Tag'] == 'php']
php_question = php_tag.merge(all_question, on='Id')

In [5]:
# R questions: only the two text columns are loaded.
# NOTE(review): unlike the other CSV reads in this notebook, no `encoding`
# is passed here, so the pandas default applies -- confirm that is intended.
r_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/rquestions/Questions.csv',
    usecols=['Title', 'Body'],
)

In [6]:
# Python questions: only the two text columns are loaded.
py_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/pythonquestions/Questions.csv',
    encoding='ISO-8859-1',
    usecols=['Title', 'Body'],
)

In [7]:
# Give every per-language frame a 'Text' column: the title immediately
# followed by the body (the two strings are joined without a separator).
for _frame in (r_question, py_question, js_question, cpp_question, php_question):
    _frame['Text'] = _frame['Title'] + _frame['Body']

In [8]:
# Copy each frame's 'Text' column into its own NumPy array.
(
    r_question_list,
    py_question_list,
    js_question_list,
    cpp_question_list,
    php_question_list,
) = (
    np.array(frame['Text'])
    for frame in (r_question, py_question, js_question, cpp_question, php_question)
)

In [9]:
# Maps every ASCII punctuation character to a single space; built once.
_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))

# Lazily filled by _english_stop_words(); None means "not loaded yet".
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_doc2tokens(data):
    """Turn one raw HTML document into a list of clean, lower-case tokens.

    Steps, in order: extract the text of the HTML, replace punctuation
    characters with spaces, split on whitespace, lower-case, keep purely
    alphabetic tokens, and drop English stop words.

    Args:
        data: the document as an HTML string.

    Returns:
        list of str: the remaining tokens, in document order.
    """
    doc = HTML(html=data).text  # remove html label
    doc = doc.translate(_PUNCT_TO_SPACE)  # remove punctuations
    # The stop-word set is the same for every document, so it is fetched from
    # the cache instead of being re-read for each call.
    stop_words = _english_stop_words()
    lowered = (w.lower() for w in doc.split())
    return [w for w in lowered if w.isalpha() and w not in stop_words]

def tokens2doc(tokens, vocab_list):
    """Join tokens into one space-separated string, masking unknown words.

    Args:
        tokens: iterable of token strings.
        vocab_list: collection of known words (list, set, ...). Any token
            not found in it is replaced by the literal 'UNK'.

    Returns:
        str: the tokens joined by single spaces ('' for no tokens).
    """
    if isinstance(vocab_list, (set, frozenset, dict)):
        # Already supports constant-time membership; use as is.
        known_words = vocab_list
    else:
        try:
            # One pass to build a hash set instead of a linear scan per token.
            # This also makes a one-shot iterator safe to pass in.
            known_words = frozenset(vocab_list)
        except TypeError:
            # Unhashable entries: keep the plain membership test.
            known_words = vocab_list
    return ' '.join(w if w in known_words else 'UNK' for w in tokens)

def update_vocab(tokens, vocab):
    """Add the occurrences of ``tokens`` to the vocabulary counter in place.

    Args:
        tokens: iterable of tokens belonging to one document. They are
            counted as given; this function never sees the raw text, so it
            does no cleaning or tokenising of its own.
        vocab: a ``collections.Counter`` (or any object with a compatible
            ``update`` method) that accumulates the counts.

    Returns:
        None. ``vocab`` is modified in place.
    """
    vocab.update(tokens)
    

# Pipeline:
# 1. create vocab (clean data, split into tokens, count tokens)
# 2. create clean corpus (replace tokens missing from the vocab list with UNK)
# 3. train word vectors

In [10]:
def create_vocab(corpus):
    """Tokenise every document and count how often each token occurs.

    Each item of ``corpus`` is passed through ``clean_doc2tokens``. The
    1-based position is printed at every 10000th document as a progress
    marker.

    Args:
        corpus: iterable of raw documents.

    Returns:
        tuple: ``(corpus_nounk, vocab)`` -- the list of per-document token
        lists, and a ``Counter`` of token occurrences over all documents.
    """
    token_counts = Counter()
    tokenised_docs = []
    for position, raw_doc in enumerate(corpus, start=1):
        if position % 10000 == 0:
            print(position)
        doc_tokens = clean_doc2tokens(raw_doc)
        tokenised_docs.append(doc_tokens)
        token_counts.update(doc_tokens)
    return tokenised_docs, token_counts

def create_vocab_list(vocab, min_occurrence):
    """List the words that occur at least ``min_occurrence`` times.

    Args:
        vocab: a ``Counter`` mapping word -> count.
        min_occurrence: minimum count a word needs in order to be kept.

    Returns:
        list: ``[word, count]`` pairs in ``vocab.most_common()`` order.
    """
    kept = []
    for word, count in vocab.most_common():
        if count >= min_occurrence:
            kept.append([word, count])
    return kept

def create_clean_corpus(corpus_nounk, vocab_list):
    """Replace every out-of-vocabulary token in the corpus with 'UNK'.

    The 1-based position is printed at every 10000th document as a progress
    marker.

    Args:
        corpus_nounk: iterable of documents, each an iterable of tokens.
        vocab_list: collection of known words (list, set, ...).

    Returns:
        list: one token list per input document, in the same order, with
        tokens not found in ``vocab_list`` replaced by 'UNK'.
    """
    if isinstance(vocab_list, (set, frozenset, dict)):
        # Already supports constant-time membership; use as is.
        known_words = vocab_list
    else:
        try:
            # Build the hash set once. Testing membership against the list
            # would scan the whole vocabulary for every token of every document.
            # This also makes a one-shot iterator safe to pass in.
            known_words = frozenset(vocab_list)
        except TypeError:
            # Unhashable entries: keep the plain membership test.
            known_words = vocab_list

    corpus_withunk = []
    for position, doc_tokens in enumerate(corpus_nounk, start=1):
        if position % 10000 == 0:
            print(position)
        corpus_withunk.append(
            [w if w in known_words else 'UNK' for w in doc_tokens]
        )
    return corpus_withunk


In [11]:
# corpus r:50k   py:50k   js:50k   cpp:25k   php:25k
# Take the leading rows of each per-language array, then stack them in the
# order r, py, js, cpp, php.
cop_r, cop_py, cop_js = (
    questions[:50000]
    for questions in (r_question_list, py_question_list, js_question_list)
)
cop_cpp, cop_php = (
    questions[:25000]
    for questions in (cpp_question_list, php_question_list)
)
cop = (cop_r, cop_py, cop_js, cop_cpp, cop_php)

corpus = np.concatenate(cop, axis=0)

In [12]:
# Number of documents in the concatenated corpus
# (three slices of 50000 plus two slices of 25000).
corpus.shape[0]

200000

In [13]:
# Tokenise every document in `corpus` and count token occurrences.
# create_vocab prints the running position every 10000 documents.
corpus_nounk, vocab = create_vocab(corpus)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [29]:
# Persist the full, unfiltered token counter returned by create_vocab.
# NOTE(review): `vocab` is a Counter rather than an array, so np.save presumably
# stores it as a pickled object array; reading it back would then need
# np.load(..., allow_pickle=True).item() -- confirm against the loader.
np.save('stack_vocaball.npy', vocab)

In [37]:
# Keep only the words that occur at least `min_occurrence` times, then strip
# the counts to get the plain word list; the cell shows the vocabulary size.
min_occurrence = 10
vocab_freq_list = create_vocab_list(vocab, min_occurrence)
vocab_list = [word for word, _count in vocab_freq_list]
len(vocab_list)

47614

In [27]:
# Persist the [word, count] pairs that met the frequency threshold.
# NOTE(review): np.array() on mixed str/int pairs presumably yields a
# string-typed array, so the counts would be stored as text -- confirm, and
# convert them back to int when loading.
np.save('stack_vocab10.npy',np.array(vocab_freq_list))

In [38]:
# Replace every token that is not in vocab_list with 'UNK'.
# create_clean_corpus prints the running position every 10000 documents.
corpus_withunk = create_clean_corpus(corpus_nounk, vocab_list)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [41]:
# Persist the UNK-substituted corpus (one token list per document).
# The documents have different lengths, so the array is created with
# dtype=object explicitly: NumPy no longer infers an object array from ragged
# nested lists (deprecation warning from 1.19, ValueError from 1.24).
# The file holds Python objects, so reading it back needs
# np.load(..., allow_pickle=True).
np.save('stack_corpus_withunk.npy', np.array(corpus_withunk, dtype=object))