In [3]:
import logging
import pandas as pd
import numpy as np
import gensim
import nltk
import re
from bs4 import BeautifulSoup

from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import gmean

In [4]:
# Separator-like punctuation that clean_text turns into a space.
# Raw string: '\[', '\]' and '\|' are regex escapes, not Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character outside digits, lowercase ASCII letters, space, '#', '+' and '_'
# is deleted by clean_text.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# Stored as a set so the per-word membership test in clean_text is a hash
# lookup instead of a scan over the whole stopword list.
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

In [5]:
import warnings
# Installs an 'ignore' filter with no category, message or module restriction,
# so warnings raised after this cell runs are not shown.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

In [6]:
def clean_text(text):
    """Normalise one raw document into a lowercase, stopword-free string.

    Steps, in order: strip HTML markup, lowercase, replace every character
    matched by REPLACE_BY_SPACE_RE with a space, delete every character
    matched by BAD_SYMBOLS_RE, and drop the words found in STOPWORDS.

    Args:
        text: The raw document. Missing values (None or a float NaN, which is
            what pandas produces for an empty CSV cell) yield an empty
            string; any other non-string value is converted with str().

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separators become spaces
    text = BAD_SYMBOLS_RE.sub('', text)  # remaining unwanted symbols are deleted
    # Remove stopwords; split()/join also collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [7]:
# Load the pretrained vectors from a binary word2vec-format file.
wv = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Rescale every stored vector to unit length in place. This is the gensim 4
# replacement for the deprecated init_sims(replace=True); it computes the
# norms itself, so a separate fill_norms() call is not needed.
wv.unit_normalize_all()

In [8]:
from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import gmean

In [9]:
def word_averaging(wv, words):
    """Embed a token sequence as the normalised mean of its word vectors.

    Args:
        wv: Keyed-vector model; must expose key_to_index, vectors and
            vector_size.
        words: Iterable of tokens. A token that is already a numpy array is
            used as a vector directly; any other token is looked up in
            wv.key_to_index and skipped when absent.

    Returns:
        A float32 array: the mean of the collected vectors passed through
        gensim.matutils.unitvec, or a zero vector of length wv.vector_size
        when no token contributed a vector (a warning is logged in that case).
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.key_to_index:
            vectors.append(wv.vectors[wv.key_to_index[word]])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # Same dtype as the normal path, so stacking results with np.vstack
        # does not upcast the whole matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

In [10]:
def word_averaging_list(wv, text_list):
    """Embed every tokenised document and stack the results row-wise.

    Args:
        wv: Keyed-vector model forwarded to word_averaging.
        text_list: Iterable of token sequences, one per document.

    Returns:
        A 2-D array with one row per document. For an empty text_list the
        result has shape (0, wv.vector_size) instead of raising, because
        np.vstack rejects an empty sequence.
    """
    rows = [word_averaging(wv, post) for post in text_list]
    if not rows:
        return np.empty((0, wv.vector_size), dtype=np.float32)
    return np.vstack(rows)

In [11]:
def w2v_tokenize_text(text):
    """Return the word tokens of text as one flat list.

    The text is first split into sentences with nltk.sent_tokenize and each
    sentence is then split with nltk.word_tokenize (both using the English
    models); tokens keep their original order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
    ]

In [12]:
# Base names of the corpora to embed: '<name>.csv' is read and the averaged
# word vectors are written to '<name>w2v.csv'.
fn = ['AppReviews', 'JIRA', 'StackOverflow']
# Iterate over the list itself so adding or removing a corpus needs no
# matching change to a hard-coded range.
for k, dataset in enumerate(fn):
    fname = dataset + '.csv'
    df = pd.read_csv(fname, encoding='utf-8')
    df['sentence'] = df['sentence'].apply(clean_text)
    print(df.head())
    comtdata = df
    # Tokenise the text column directly; a row-wise DataFrame.apply(axis=1)
    # would build a Series per row only to read this one field.
    test_tokenized = comtdata['sentence'].apply(w2v_tokenize_text).values
    X_comtdata_average1 = word_averaging_list(wv, test_tokenized)
    # One comma-separated row per document, fixed-point formatted.
    fname = dataset + 'w2v.csv'
    np.savetxt(fname, X_comtdata_average1, delimiter=',', fmt='%f')

                                            sentence  oracle
0  package file invalid phone factory reset wante...      -1
1  iffy nice clean app sometimes works times does...      -1
2                             cool freezes everytime      -1
3  network error suddenly downloading update pack...      -1
4  annoying let choose pictures want freezes forc...      -1




                                            sentence  oracle
0                                        guys stupid      -1
1  lost whole morning cause hbases regionserver d...      -1
2                       quote messing deep hbase dfs      -1
3                   think going sweep shit kill root      -1
4  idiot yeah idiotpath good commonslang hairball...      -1
    id                                           sentence  oracle
0    6                                      sadly working      -1
1   78  everything builds fine try deploy application ...      -1
2   90                     causing null pointer exception      -1
3  139            attempts ive made shortcut unsuccessful      -1
4  162                                           dont use      -1


