In [1]:
import pandas as pd
import numpy as np
from requests_html import HTML
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

In [2]:
# Read the StackSample question table and its tag table. Both files are
# decoded as ISO-8859-1; the tag table is later filtered on `Tag` and joined
# to the questions on `Id`.
all_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/data_stacksample/Questions.csv',
    encoding='ISO-8859-1',
)
all_tag = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/data_stacksample/Tags.csv',
    encoding='ISO-8859-1',
)

In [3]:
# corpus r:50k   py:50k   js:50k   cpp:25k   php:25k

# For each language tag: keep the matching rows of the tag table, then join
# them with the question table on the shared 'Id' column.
cpp_tag = all_tag.loc[all_tag['Tag'] == 'c++']
cpp_question = cpp_tag.merge(all_question, on='Id')

js_tag = all_tag.loc[all_tag['Tag'] == 'javascript']
js_question = js_tag.merge(all_question, on='Id')

php_tag = all_tag.loc[all_tag['Tag'] == 'php']
php_question = php_tag.merge(all_question, on='Id')

In [4]:
# R questions come from their own CSV; only the Title and Body columns are read.
r_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/rquestions/Questions.csv',
    usecols=['Title', 'Body'],
)
r_question

Unnamed: 0,Title,Body
0,How to access the last value in a vector?,<p>Suppose I have a vector that is nested in a...
1,Worse sin: side effects or passing massive obj...,<p>I have a function inside a loop inside a fu...
2,Explain the quantile() function in R,<p>I've been mystified by the R quantile funct...
3,How to test for the EOF flag in R?,<p>How can I test for the <code>EOF</code> fla...
4,Is there an R package for learning a Dirichlet...,<p>I'm looking for a an <code>R</code> package...
5,Optimization packages for R,<p>Does anyone know of any optimization packag...
6,Thinking in Vectors with R,<p>I know that R works most efficiently with v...
7,Vectorize my thinking: Vector Operations in R,<p>So earlier I answered my own question on th...
8,Is R a compiled language?,<p>I can't find it anywhere on the web (and I ...
9,Filtering data in R,<p>I have a CSV of file of data that I can loa...


In [5]:
# Python questions come from their own CSV (decoded as ISO-8859-1); only the
# Title and Body columns are read.
py_question = pd.read_csv(
    '/Users/zhang/MscProject_tweak2vec/pythonquestions/Questions.csv',
    encoding='ISO-8859-1',
    usecols=['Title', 'Body'],
)
py_question

Unnamed: 0,Title,Body
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...
5,Class views in Django,"<p><a href=""http://www.djangoproject.com/"">Dja..."
6,Python and MySQL,<p>I can get Python to work with Postgresql bu...
7,How do I use Python's itertools.groupby()?,<p>I haven't been able to find an understandab...
8,Adding a Method to an Existing Object Instance,<p>I've read that it is possible to add a meth...
9,How do you express binary literals in Python?,<p>How do you express an integer as a binary n...


In [6]:
# Add a 'Text' column to every per-language frame: the title immediately
# followed by the (HTML) body, with no separator in between.
for _frame in (r_question, py_question, js_question, cpp_question, php_question):
    _frame['Text'] = _frame['Title'] + _frame['Body']
del _frame  # do not leave the loop variable behind in the notebook namespace

In [7]:
# Pull the 'Text' column of each frame out into its own NumPy array.
(
    r_question_list,
    py_question_list,
    js_question_list,
    cpp_question_list,
    php_question_list,
) = (
    np.array(frame['Text'])
    for frame in (r_question, py_question, js_question, cpp_question, php_question)
)

In [8]:
# Translation table mapping every ASCII punctuation character to a space.
_PUNCT_TO_SPACE = str.maketrans({c: ' ' for c in punctuation})

# Cache for the stop-word set so the NLTK corpus is read once, not once per
# document (this function is called for every document of the corpus).
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_doc2tokens(data):
    """Turn one raw (HTML) document into a list of cleaned word tokens.

    Steps, in order:
      1. parse `data` as HTML and keep only its text;
      2. replace every character in `string.punctuation` with a space;
      3. split on whitespace and lower-case each piece;
      4. drop pieces that are not purely alphabetic (`str.isalpha`);
      5. drop English stop words.

    Parameters
    ----------
    data : str
        The raw document; passed to `HTML(html=...)`.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    doc = HTML(html=data).text  # remove html markup
    doc = doc.translate(_PUNCT_TO_SPACE)  # punctuation -> spaces
    stop_words = _english_stop_words()
    tokens = []
    for w in doc.split():
        w = w.lower()
        if w.isalpha() and w not in stop_words:
            tokens.append(w)
    return tokens

def tokens2doc(tokens, vocab_list):
    """Join tokens into one space-separated string, masking unknown words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    vocab_list : collection of str
        The known vocabulary. Any token not contained in it is replaced by
        the literal 'UNK'. Entries must be hashable.

    Returns
    -------
    str
        The tokens (or 'UNK' placeholders) joined by single spaces.
    """
    # Membership tests on a list are linear in the vocabulary size; use a
    # hash-based container instead (re-using the argument if it already is one).
    if isinstance(vocab_list, (set, frozenset, dict)):
        known = vocab_list
    else:
        known = set(vocab_list)
    return ' '.join(w if w in known else 'UNK' for w in tokens)

def update_vocab(tokens, vocab):
    """Add the given tokens to the running vocabulary counter, in place.

    Parameters
    ----------
    tokens : iterable of str
        Already-cleaned tokens of one document.
    vocab : collections.Counter
        Counter to update; it is mutated and nothing is returned.
    """
    # The earlier version overwrote `tokens` with clean_doc2tokens(data),
    # where `data` was not a parameter of this function, so the call either
    # raised NameError or silently used an unrelated global.
    vocab.update(tokens)
    

# create vocab (clean data, split tokens)
# create clean corpus (clean data again, replace UNK)
# train word vectors

In [26]:
def create_vocab(corpus):
    """Tokenise every document and count token occurrences over the corpus.

    Each element of `corpus` is passed through `clean_doc2tokens`. The
    1-based document position is printed whenever it is a multiple of 10000.

    Returns
    -------
    (list, Counter)
        The per-document token lists (in corpus order) and a Counter of all
        tokens seen.
    """
    vocab = Counter()
    corpus_nounk = []
    for position, data in enumerate(corpus, start=1):
        if not position % 10000:
            print(position)
        doc_tokens = clean_doc2tokens(data)
        vocab.update(doc_tokens)
        corpus_nounk.append(doc_tokens)
    return corpus_nounk, vocab

def create_vocab_list(vocab, min_occurrence):
    """List the [word, count] pairs whose count reaches `min_occurrence`.

    Pairs are taken in the order given by `vocab.most_common()`.
    """
    vocab_freq_list = []
    for word, count in vocab.most_common():
        if count >= min_occurrence:
            vocab_freq_list.append([word, count])
    return vocab_freq_list

def create_clean_corpus(corpus_nounk, vocab_list):
    """Replace every out-of-vocabulary token in the corpus with 'UNK'.

    Parameters
    ----------
    corpus_nounk : iterable of list of str
        Tokenised documents.
    vocab_list : collection of str
        The known vocabulary; entries must be hashable.

    Returns
    -------
    list of list of str
        One token list per input document, same order and lengths, with
        tokens missing from `vocab_list` replaced by the literal 'UNK'.

    The 1-based document position is printed whenever it is a multiple of
    10000.
    """
    # Build the lookup once: testing membership against a list for every token
    # of every document is linear in the vocabulary size each time.
    if isinstance(vocab_list, (set, frozenset, dict)):
        known = vocab_list
    else:
        known = set(vocab_list)

    corpus_withunk = []
    for position, doc_tokens in enumerate(corpus_nounk, start=1):
        if position % 10000 == 0:
            print(position)
        corpus_withunk.append([w if w in known else 'UNK' for w in doc_tokens])
    return corpus_withunk


In [10]:
# corpus r:50k   py:50k   js:50k   cpp:25k   php:25k

# Minimum count for a word to be kept by create_vocab_list.
min_occurrence = 2

# Take the leading slice of each language's documents ...
cop_r = r_question_list[:50000]
cop_py = py_question_list[:50000]
cop_js = js_question_list[:50000]
cop_cpp = cpp_question_list[:25000]
cop_php = php_question_list[:25000]
cop = (cop_r, cop_py, cop_js, cop_cpp, cop_php)

# ... and stack them end to end into a single array (axis 0 is the default).
corpus = np.concatenate(cop)

In [11]:
# Sanity check: number of documents in the concatenated corpus.
len(corpus)

200000

In [35]:
# Persist the [word, count] vocabulary list to 'stack_vocab.npy' in the
# current working directory.
# NOTE(review): in this notebook `vocab_freq_list` is only assigned in a cell
# further down (the create_vocab_list call), so on a fresh top-to-bottom run
# this line raises NameError; this cell should be moved below that one.
# NOTE(review): np.array on mixed [str, int] pairs presumably coerces the
# counts to strings - confirm this is what the consumer of the file expects.
np.save('stack_vocab.npy',np.array(vocab_freq_list))

In [12]:
# Tokenise the whole corpus and count token frequencies. create_vocab prints
# the document position every 10000 documents.
corpus_nounk, vocab = create_vocab(corpus)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [40]:
# Keep the words that occur at least `min_occurrence` times, then derive the
# plain word list and add the 'UNK' placeholder as its last entry.
vocab_freq_list = create_vocab_list(vocab, min_occurrence)
vocab_list = [word for word, _count in vocab_freq_list] + ['UNK']

In [24]:
# Replace every token that is not in vocab_list with 'UNK'. create_clean_corpus
# prints the document position every 10000 documents.
corpus_withunk = create_clean_corpus(corpus_nounk, vocab_list)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [36]:
# Load a previously saved corpus from disk. This rebinds `corpus_withunk`,
# discarding the value computed by create_clean_corpus in the cell above.
# NOTE(review): the notebook does not show where this .npy file is written,
# and the absolute path is machine-specific - confirm provenance.
# NOTE(review): if the file stores an object array (e.g. variable-length token
# lists), recent NumPy versions presumably need allow_pickle=True - verify.
corpus_withunk = np.load('/Users/zhang/MscProject_tweak2vec/corpus/stack_corpus_unk.npy')

In [39]:
# Sanity check: number of documents in the loaded corpus.
len(corpus_withunk)

200000