In [1]:
import logging
import pandas as pd
import numpy as np
import gensim
import nltk
import re
from bs4 import BeautifulSoup
import csv
import warnings
warnings.filterwarnings ('ignore')

In [2]:
# Punctuation that is turned into a single space (raw string: the pattern
# contains backslash escapes that are not valid Python string escapes).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is deleted outright.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS =nltk.corpus.stopwords.words('english')
# Frozen copy of STOPWORDS used for constant-time membership tests in
# clean_text; the list above is left untouched for any other user.
_STOPWORD_SET = frozenset(STOPWORDS)

def clean_text(text):
    """Normalise one raw message into a lowercase, stopword-free string.

    Steps, in order: strip HTML markup with BeautifulSoup (lxml parser),
    lowercase, replace the characters matched by REPLACE_BY_SPACE_RE with a
    space, delete the characters matched by BAD_SYMBOLS_RE, then drop every
    whitespace-separated word that appears in the English stopword list.

    Parameters
    ----------
    text : object
        The raw message. Only ``str`` instances are processed.

    Returns
    -------
    str
        The cleaned text with words joined by single spaces, or ``""`` for
        any non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in _STOPWORD_SET)

In [3]:
# Load the GloVe vectors into a DataFrame indexed by token (first column).
# quoting=csv.QUOTE_NONE: tokens may themselves be quote characters.
# keep_default_na=False: otherwise tokens such as "null", "nan" or "NA" are
# parsed as missing values and can no longer be looked up by word.
wv = pd.read_table('glove.6B.300d.txt', sep=" ", index_col=0, header=None,
                   quoting=csv.QUOTE_NONE, keep_default_na=False)

In [4]:
def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, discarding one-character tokens.

    The text is first split into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split with ``nltk.word_tokenize`` (both with
    ``language='english'``). Tokens are returned in document order.
    """
    return [
        token
        for sentence in nltk.sent_tokenize(text, language='english')
        for token in nltk.word_tokenize(sentence, language='english')
        if len(token) >= 2
    ]

In [5]:
def wordprint(words,wv):
    """Return the normalised mean embedding of ``words``.

    Parameters
    ----------
    words : iterable
        Items are either ``np.ndarray`` vectors (used as-is) or tokens that
        are looked up in ``wv.index``. Tokens not present in the index are
        silently skipped.
    wv : pandas.DataFrame
        Embedding table: one row per token (tokens as the index), one column
        per embedding dimension.

    Returns
    -------
    numpy.ndarray
        float32 vector. When at least one vector was collected this is the
        element-wise mean passed through ``gensim.matutils.unitvec``.
        When nothing was collected a warning is logged and an all-zero
        vector of length ``wv.shape[1]`` is returned, so the fallback always
        matches the width and dtype of the normal result.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.index:
            vectors.append(wv.loc[word].to_numpy())
    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.shape[1], dtype=np.float32)
    mean = gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)
    return mean

In [6]:
def word_averaging_listn(wv, text_list):
    """Stack the ``wordprint`` vector of every entry in ``text_list``.

    Each element of ``text_list`` is passed to ``wordprint`` together with
    ``wv``; the resulting vectors become the rows of the returned 2-D array,
    in the same order as the input.
    """
    rows = []
    for post in text_list:
        rows.append(wordprint(post, wv))
    return np.vstack(rows)

In [7]:
# Load the corpus once and clean the message text.
fname = 'GitterCom' + '.csv'
df = pd.read_csv(fname, encoding='utf8')
df['message'] = df['message'].apply(clean_text)

# Preview of the message together with its lowercased labels only.
# Built on a separate frame so `df` keeps every original column.
label_view = df.drop(columns=['Channel', 'messageId', 'time', 'user'])
for label_col in ('Category', 'Purpose', 'Subcategory'):
    label_view[label_col] = label_view[label_col].str.lower()
print(label_view.head(10))

# Tokenise and embed once: the embedding depends only on the cleaned
# 'message' column, so both output files receive the same matrix.
comtdata = df
test_tokenized = comtdata['message'].apply(w2v_tokenize_text).values
X_comtdata_averagen = word_averaging_listn(wv, test_tokenized)

fname = 'completed_glove300d' + '.csv'
for out_path in (fname, 'glove300d.csv'):
    np.savetxt(out_path, X_comtdata_averagen, delimiter=',', fmt='%f')

print(df.head(10))

                                             message    Purpose  \
0  hi team recently upgraded cucumberjvm version ...  team wide   
1  exception thread main javalangnosuchmethoderro...  team wide   
2  amit007 looks like inconsistent cucumber jar v...  team wide   
3                        github trying replace irc p  team wide   
4  aslakhellesoy thanks seems like using older ve...  team wide   
5  hi one question friends starting project studi...  team wide   
6  danon9111 integrate cucumber another testing f...  team wide   
7  aslakhellesoy reading cucumber book found code...  team wide   
8                 sidkiyassine call methods directly  team wide   
9  intergrated cucumber+appium https githubcom pr...  team wide   

        Category                          Subcategory  
0        dev-ops  development operation notifications  
1        dev-ops  development operation notifications  
2  communication         communication with teammates  
3  communication         communication





    Channel                 messageId                      time  \
0  Cucumber  5551fd48f853e7f14c2b7b3c  2015-05-12T13:16:56.794Z   
1  Cucumber  5551fd4a00ed57993752bbb5  2015-05-12T13:16:58.574Z   
2  Cucumber  5551feedf853e7f14c2b7b62  2015-05-12T13:23:57.768Z   
3  Cucumber  5552032a1817239c37e4e01b  2015-05-12T13:42:02.416Z   
4  Cucumber  55520e5900ed57993752be4e  2015-05-12T14:29:45.058Z   
5  Cucumber  55637ddfbd9b9407519bef4b  2015-05-25T19:54:07.526Z   
6  Cucumber  55649744220dcade3c4d5503  2015-05-26T15:54:44.800Z   
7  Cucumber  556498018f5532b4396ac6d1  2015-05-26T15:57:53.346Z   
8  Cucumber  5565704b8f5532b4396adb01  2015-05-27T07:20:43.093Z   
9  Cucumber  5565d8a31d1bf91146cb914f  2015-05-27T14:45:55.343Z   

             user                                            message  \
0         amit007  hi team recently upgraded cucumberjvm version ...   
1         amit007  exception thread main javalangnosuchmethoderro...   
2   aslakhellesoy  amit007 looks like inconsis



