In [1]:
import logging
import pandas as pd
import numpy as np
import gensim
import nltk
import re
from bs4 import BeautifulSoup
import csv

In [2]:
# Characters treated as token separators: each occurrence is replaced by a space.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal
# and trigger a DeprecationWarning/SyntaxWarning on current Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase ASCII letter, a space, '#', '+' or '_'
# is deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Stored as a frozenset so the per-token "word not in STOPWORDS" check is a hash
# lookup instead of a linear scan over the stopword list.
STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def clean_text(text):
    """Normalise one raw sentence for embedding lookup.

    Steps, in order: strip HTML markup via BeautifulSoup (lxml parser),
    lowercase, turn every REPLACE_BY_SPACE_RE match into a space, delete
    every BAD_SYMBOLS_RE match, then drop whitespace-separated tokens that
    appear in STOPWORDS.

    Returns the surviving tokens joined by single spaces.
    """
    plain = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', plain)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses the runs of spaces introduced above.
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)

In [5]:
# Load the GloVe table: one row per token, the token itself becomes the index and
# the remaining space-separated fields are the vector components.
# QUOTE_NONE: quote characters are themselves tokens in the file, not field quoting.
# keep_default_na=False: by default pandas parses strings such as 'nan', 'null' and
# 'n/a' as missing values, which would turn those tokens' index labels into NaN and
# make the words impossible to look up.
wv = pd.read_table('glove.6B.300d.txt', sep=" ", index_col=0, header=None,
                   quoting=csv.QUOTE_NONE, keep_default_na=False)

In [6]:
def vec(w):
    """Return the row of the global embedding table ``wv`` labelled ``w`` as a NumPy array.

    Raises:
        KeyError: if ``w`` is not a label in ``wv.index`` (propagated from ``.loc``).
    """
    # Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values returns the same ndarray and exists on old and new pandas alike.
    return wv.loc[w].values

In [7]:
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens using NLTK's English tokenizers.

    The text is first split into sentences, then each sentence into words;
    tokens shorter than two characters are discarded. Returns a flat list of
    the remaining tokens in their original order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [8]:
def wordprint(words,wv):
    """Return the unit-normalised mean embedding of ``words``.

    Parameters
    ----------
    words : iterable
        Items to average. An item that is already an ``np.ndarray`` is used
        as a vector directly; any other item is looked up in ``wv.index`` and
        skipped when it is not present.
    wv : pandas.DataFrame
        Embedding table indexed by token, one vector per row.

    Returns
    -------
    numpy.ndarray
        float32 vector. When no item contributes a vector, a warning is
        logged and an all-zero vector with one entry per column of ``wv`` is
        returned instead.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.index:
            vectors.append(np.array(wv.loc[word]))
    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        # Size and dtype follow the table and the normal return path, rather
        # than a hard-coded 300-dimensional float64 vector.
        return np.zeros(wv.shape[1], dtype=np.float32)
    centroid = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(centroid).astype(np.float32)

In [9]:
def word_averaging_listn(wv, text_list):
    """Build a 2-D array with one ``wordprint`` vector per entry of ``text_list``.

    Each entry of ``text_list`` is passed to ``wordprint`` together with
    ``wv``; the resulting vectors are stacked row-wise in input order.
    """
    rows = []
    for post in text_list:
        rows.append(wordprint(post, wv))
    return np.vstack(rows)

In [10]:
# For each dataset: read <name>.csv, clean and tokenise its 'sentence' column,
# average the GloVe vectors of every sentence, and write the resulting matrix
# (one row per sentence) to <name>glove.csv.
fn = ['AppReviews', 'JIRA', 'StackOverflow']
# Iterate over the list itself so adding or removing a dataset name is enough;
# the loop no longer depends on a separately hard-coded count.
for k, dataset in enumerate(fn):
    df = pd.read_csv(dataset + '.csv', encoding='utf-8')
    df['sentence'] = df['sentence'].apply(clean_text)
    print(df.head(10))
    comtdata = df
    # Tokenise the sentence column directly instead of applying a lambda to
    # every full row just to read one field.
    test_tokenized = comtdata['sentence'].apply(w2v_tokenize_text).values
    X_comtdata_averagen = word_averaging_listn(wv, test_tokenized)
    fname = dataset + 'glove.csv'
    np.savetxt(fname, X_comtdata_averagen, delimiter=',', fmt='%f')

                                            sentence  oracle
0  package file invalid phone factory reset wante...      -1
1  iffy nice clean app sometimes works times does...      -1
2                             cool freezes everytime      -1
3  network error suddenly downloading update pack...      -1
4  annoying let choose pictures want freezes forc...      -1
5  anoying bug use powertoggles notification bar ...      -1
6  resource error wont complete l freezes every t...      -1
7  cant install cant install samsung note iii ple...      -1
8  comprehensive flawed ive tried lots options no...      -1
9  crashed system stopped working matter app kept...      -1




                                            sentence  oracle
0                                        guys stupid      -1
1  lost whole morning cause hbases regionserver d...      -1
2                       quote messing deep hbase dfs      -1
3                   think going sweep shit kill root      -1
4  idiot yeah idiotpath good commonslang hairball...      -1
5                          pull back think different      -1
6                                               suck      -1
7  still stuck loop though cant actually close re...      -1
8            original edited patch messed stupid sed      -1
9                                   made stupid rule      -1




    id                                           sentence  oracle
0    6                                      sadly working      -1
1   78  everything builds fine try deploy application ...      -1
2   90                     causing null pointer exception      -1
3  139            attempts ive made shortcut unsuccessful      -1
4  162                                           dont use      -1
5  187  however try run file_name gives system cant fi...      -1
6  200  problem without zooming added content doesnt g...      -1
7  201  application workaround scaling window size app...      -1
8  256                                    else gui hanged      -1
9  342                  sure map implementation hazelcast      -1


