# Installs

In [1]:
import torch 
from sklearn.model_selection import train_test_split
import transformers

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from transformers import glue_convert_examples_to_features as convert_examples_to_features

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix, andrews_curves
import re
from bs4 import BeautifulSoup
import pickle

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinvielvoye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martinvielvoye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/martinvielvoye/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
from time import time, sleep
from IPython import get_ipython
from IPython.display import Audio, display, HTML
from ipywidgets import IntProgress

class Beeper:

    def __init__(self, threshold, **audio_kwargs):
        self.threshold = threshold
        self.start_time = None    # time in sec, or None
        self.audio = audio_kwargs

    def pre_execute(self):
        if not self.start_time:
            self.start_time = time()
            

    def post_execute(self):
        end_time = time()
        if self.start_time and end_time - self.start_time > self.threshold:
            audio = Audio(**self.audio, autoplay=True)
            display(audio)
            print((end_time - self.start_time)/60, " minutes.")
        self.start_time = None

beeper = Beeper(5, filename='./beep-07.wav')

ipython = get_ipython()
ipython.events.register('pre_execute', beeper.pre_execute)
ipython.events.register('post_execute', beeper.post_execute)




# Data Loader

In [3]:
def y_output(x):
    tot_list = np.zeros(len(y_tags))
    for element in x:
        if(element in y_tags):
            tot_list[y_tags.tolist().index(element)] += 1
    return tot_list

def totListReg(x, totList):
    regex = r"\<(.*?)\>"
    #print(x)
    matches = re.finditer(regex, x,re.MULTILINE)
    tagList = []
    for matchNum, match in enumerate(matches, start = 1):
        #print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
        for groupNum in range(0, len(match.groups())):
            groupNum = groupNum + 1
            tagList.append(match.group(groupNum))
            #print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
    for values in tagList:
        totList.append(values)
    #return tagList if match else None
    return tagList

def listReg(x):
    regex = r"\<(.*?)\>"
    #print(x)
    matches = re.finditer(regex, x,re.MULTILINE)
    tagList = []
    for matchNum, match in enumerate(matches, start = 1):
        #print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
        for groupNum in range(0, len(match.groups())):
            groupNum = groupNum + 1
            tagList.append(match.group(groupNum))
            #print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
    #return tagList if match else None
    return tagList

def InputTransf(x):
    html = x["Body"]
    raw = BeautifulSoup(html, "html.parser").get_text()

    tokens = word_tokenize(raw)
    tokens = [s + " " for s in tokens]
    if(type(x["Title"]) == np.float): 
        fullText = (''.join(tokens[:]))
    else:
        fullText = (x["Title"] + ". " + ''.join(tokens[:]))
    return fullText
    
    #text = nltk.Text(tokens)
    #text

def CleanedInputStem(x):
    sw = set()
    sw.update(tuple(nltk.corpus.stopwords.words('english')))
    
    tokens = word_tokenize(x)
    stemmer = SnowballStemmer("english")
    corpora = [stemmer.stem(w) for w in tokens if not w in list(sw)]
    #corpora = [w for w in tokens if not w in list(sw)]
    #corpora = [s + " " for s in corpora]
    #print(''.join(corpora[:]))

    return corpora

def SemiCleanedInput(x):
    sw = set()
    sw.update(tuple(nltk.corpus.stopwords.words('english')))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(x)
    tokens = [word.lower() for word in tokens]
    #corpora = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w in list(sw)]
    #corpora = list(set(corpora))
    corpora = list(dict.fromkeys(tokens))
    #print(corpora)
    corpora = " ".join(corpora[:])
    #print(corpora)


    #corpora = [w for w in tokens if not w in list(sw)]
    #corpora = [s + " " for s in corpora]
    #print(''.join(corpora[:]))

    return corpora

def SemiCleanedInput2(x):
    sw = set()
    sw.update(tuple(nltk.corpus.stopwords.words('english')))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(x)
    tokens = [word.lower() for word in tokens]
    #corpora = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w in list(sw)]
    #corpora = list(set(corpora))
    corpora = list(dict.fromkeys(tokens))

    #corpora = [w for w in tokens if not w in list(sw)]
    #corpora = [s + " " for s in corpora]
    #print(''.join(corpora[:]))

    return corpora

def CleanedInputLemma(x):
    sw = set()
    sw.update(tuple(nltk.corpus.stopwords.words('english')))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(x)
    tags2 = nltk.pos_tag(tokens)
    tokens = [word.lower() for word,pos in tags2 if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    #corpora = [lemmatizer.lemmatize(w.lower()) for w in tokens if not w in list(sw)]
    #corpora = list(set(corpora))
    corpora = list(dict.fromkeys(tokens))
    

    #corpora = [w for w in tokens if not w in list(sw)]
    #corpora = [s + " " for s in corpora]
    #print(''.join(corpora[:]))

    return corpora

def input(x, result = "Token"):
    #html = x["Body"]
    #raw = BeautifulSoup(html, "html.parser").get_text()
    text = "[CLS] " + x + " [SEP]"
    tokenized_text_1 = tokenizer.tokenize(text)
    tokenized_text_1 = tokenized_text_1[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text_1)
    segments_ids = [0] * len(indexed_tokens)
    #print(segments_ids)

    if(result == "Token"): return tokenized_text_1
    else : return segments_ids
    #return indexed_tokens, segments_ids

In [4]:
data = pd.read_csv("./QueryResults.csv",
                           engine="python",
                           sep=',',
                           decimal='.')

emb_data = pd.read_csv("./Top10000Scores.csv",
                           engine="python",
                           sep=',',
                           decimal='.')

In [42]:
featData = data["Tags"].apply(listReg)
featData = pd.DataFrame(featData, index = featData.index)
featData["Input"] = data[["Title", "Body"]].apply(InputTransf, axis = 1)
featData["Clean_Input"] = featData["Input"].apply(CleanedInputLemma)
featData["Semiclean_Input"] = featData["Input"].apply(SemiCleanedInput)

#featData["Presence"]= featData[["Tags", "Clean_Input"]].apply(pred_evel_df, axis = 1)
#featData

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
#tokens_tensor = tokens_tensor.to('cuda')
#segments_tensors = segments_tensors.to('cuda')
#model.to('cuda')
all_encoded = []

#featData["Token"] = featData["Semiclean_Input"].apply(inputTransf, axis = 1)
featData["TokenELMO"] = featData["Semiclean_Input"].apply(input)
#featData["Index"] = featData[["Body", "Tags"]].apply(lambda x: inputTransf(x, "nope"), axis = 1)

totList = []
data["Tags"].apply(totListReg, args=(totList,))
allTags = pd.DataFrame(totList, columns = ["uniTags"])
allTags["uniTags"].value_counts()

y_tags = allTags["uniTags"].value_counts().head(150).index.values
featData["y_output"] = featData["Tags"].apply(y_output)

8.679772730668386  minutes.


In [43]:
# load elmo_train_new
pickle_in = open("./elmo_train_03032019.pickle", "rb")
elmo_train_new = pickle.load(pickle_in)
pickle_in.close()

all_train = []
for i in range(np.int(len(elmo_train_new)/2)):
  all_train.append(elmo_train_new[i*2])

emb_train = np.concatenate(all_train, axis = 0)
len_train = len(emb_train)
#Need to clean "train[label]" for actual labels
xtrain, xvalid, ytrain, yvalid = train_test_split(emb_train, 
                                                  featData["y_output"][:len_train].tolist(),  
                                                  random_state=42, 
                                                  test_size=0.1)
len_act = len(xtrain)
y_sparse_train = featData["y_output"].values.tolist()[:len_act]
y_sparse_test = featData["y_output"].values.tolist()[len_act:len_train]

In [44]:
totList = []
data["Tags"].apply(totListReg, args=(totList,))
allTags = pd.DataFrame(totList, columns = ["uniTags"])
allTags["uniTags"].value_counts()

y_tags = allTags["uniTags"].value_counts().head(150).index.values
featData["y_output"] = featData["Tags"].apply(y_output)

clean_array = featData["Clean_Input"].values.tolist()
temp = []
for strin in clean_array:
    strin = [s + " " for s in strin]
    temp.append(''.join(strin))
clean_array = temp

x_train = clean_array[:42500]
x_test = clean_array[42500:]

clean_array_tag = featData["Tags"].values.tolist()
temp = []
for strin in clean_array_tag:
    strin = [s for s in strin]
    temp.append(' '.join(strin))
clean_array_tag = temp

y_clean_train = clean_array_tag[:42500]
y_clean_test = clean_array_tag[42500:]

In [61]:
emb_data["Input"] = emb_data[["Title", "Body"]].apply(InputTransf, axis = 1)
emb_data["Clean_Input"] = emb_data["Input"].apply(CleanedInputLemma)
emb_data["SemiClean_Input"] = emb_data["Input"].apply(SemiCleanedInput2)


1.9295649647712707  minutes.


In [62]:
clean_emb_array_clean = emb_data["Clean_Input"].values.tolist()
temp = []
for strin in clean_emb_array_clean:
    strin = [s + " " for s in strin]
    temp.append(''.join(strin))
clean_emb_array_clean = temp
 
clean_emb_array = emb_data["SemiClean_Input"].values.tolist()
temp = []
for strin in clean_emb_array:
    strin = [s for s in strin]
    temp.append(''.join(strin))
clean_emb_array = temp

In [63]:
voc_tok = []
for sentences in clean_emb_array:
    tokens = word_tokenize(sentences)
    for words in tokens:
        voc_tok.append(words)
print(len(voc_tok))
voc_tok = list(set(voc_tok))

print(len(voc_tok))

133424
56943


0.8568068186442057  minutes.


In [64]:
emb_data["Input"] = emb_data["Input"].apply(lambda x: word_tokenize(x.lower()))
emb_data[["Input", "Clean_Input", "SemiClean_Input"]]

Unnamed: 0,Input,Clean_Input,SemiClean_Input
0,"[you, are, a, victim, of, branch, prediction, ...","[victim, branch, prediction, fail, railroad, j...","[you, are, a, victim, of, branch, prediction, ..."
1,"[why, is, processing, a, sorted, array, faster...","[array, piece, c++, code, behavior, reason, da...","[why, is, processing, a, sorted, array, faster..."
2,"[undo, a, commit, and, redo, $, git, commit, -...","[commit, reset, head~, <, edit, files, >, add,...","[undo, a, commit, and, redo, $, git, -m, ``, s..."
3,"[executive, summary, $, git, push, -d, <, remo...","[executive, summary, push, -d, <, remote_name,...","[executive, summary, $, git, push, -d, <, remo..."
4,"[how, do, i, undo, the, most, recent, local, c...","[commits, git, files, commit, server, repository]","[how, do, i, undo, the, most, recent, local, c..."
5,"[how, do, i, delete, a, git, branch, locally, ...","[git, branch, attempts, delete, remote, -d, re...","[how, do, i, delete, a, git, branch, locally, ..."
6,"[i, am, a, professor, ,, teaching, at, univers...","[professor, universities, nyc, core, software,...","[i, am, a, professor, ,, teaching, at, univers..."
7,"[amending, the, most, recent, commit, message,...","[commit, message, editor, command, line, new, ...","[amending, the, most, recent, commit, message,..."
8,"[one, does, not, simply, redirect, using, jque...","[jquery, window.location.replace, simulate, ht...","[one, does, not, simply, redirect, using, jque..."
9,"[to, understand, what, yield, does, ,, you, mu...","[yield, generators, iterables, list, items, it...","[to, understand, what, yield, does, ,, you, mu..."


0.1978582978248596  minutes.


# Data Loading

In [49]:
import gensim

In [114]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if word is ' ': 
            pass
        #if wv.vocab[word] is None: 
        #    print('Found One 2')
        #if wv.vectors[wv.vocab[word].index] is None: 
        #    print("Found One 3")
        else:
            if isinstance(word, np.ndarray):
                mean.append(word)

            elif word in wv.vocab:
                #print("word", word)
                #print("wv.vocab[word]", wv.vocab[word])
                #print("wv.vocab[word].index", wv.vocab[word].index)
                #print()
                mean.append(wv.vectors[wv.vocab[word].index])
                all_words.add(wv.vocab[word].index)

    if not mean:
        #logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(150,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, review) for review in text_list ])

In [109]:
model = gensim.models.Word2Vec(emb_data["SemiClean_Input"].tolist(), size=150, window = 10, min_count = 2, workers = 10)  # an empty model, no training yet
model.train(clean_emb_array_clean, total_examples = len(clean_emb_array_clean), epochs = 10)  # can be a non-repeatable, 1-pass generator



(19758830, 24490140)

0.2750523328781128  minutes.


In [126]:
(x_train[8234])

"requestparam spring mvc parameters controller kind requests http //localhost:8080/submit/id/id123432 @ requestmapping value = requestmethod.get string showloginwindow pathvariable id logout password modelattribute submitmodel model bindingresult errors loginexception request controllers exception 'controller bean method "

In [115]:
X_train_word_average = word_averaging_list(model.wv,x_train)
print('part 2')
X_test_word_average = word_averaging_list(model.wv,x_test)
print('done')

part 2
done


0.18678092161814372  minutes.


In [119]:
xtrain_w2v, xvalid_w2v, ytrain_w2v, yvalid_w2v = train_test_split(X_train_word_average, 
                                                  featData["y_output"][:len_train].tolist(),  
                                                  random_state=42, 
                                                  test_size=0.1)
len_act = len(xtrain)
y_sparse_train_w2v = featData["y_output"].values.tolist()[:len_act]
y_sparse_test_w2v = featData["y_output"].values.tolist()[len_act:len_train]

# Kneighbors


## EMLo

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [16]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(xtrain, y_sparse_train)

predict_neigh = neigh.predict(xvalid)
f1_scor = f1_score(y_sparse_test, predict_neigh, average='micro')
f1_scor_2 = f1_score(y_sparse_test, predict_neigh, average='weighted')

print(f1_scor, f1_scor_2)

0.012389867841409693 0.011020425155544565


## Word2Vec (baseline)

In [120]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(xtrain_w2v, y_sparse_train_w2v)

predict_neigh = neigh.predict(xvalid_w2v)
f1_scor = f1_score(y_sparse_test, predict_neigh, average='micro')
f1_scor_2 = f1_score(y_sparse_test_w2v, predict_neigh, average='weighted')

print(f1_scor, f1_scor_2)

0.011798600631087942 0.01049355818729097


0.9861274480819702  minutes.


In [14]:
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

```
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features 
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

embeddings.shape
```
The output is a 3 dimensional tensor of shape (1, 8, 1024):

The first dimension of this tensor represents the number of training samples. This is 1 in our case
The second dimension represents the maximum length of the longest string in the input list of strings. Since we have only 1 string in our input list, the size of the 2nd dimension is equal to the length of the string – 8
The third dimension is equal to the length of the ELMo vector

In [16]:
def elmo_vectors(x, i):
  embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
  if(i%5 == 0): print(i)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return (sess.run(tf.reduce_mean(embeddings,1)), i)

In [15]:
i = 0
j = 0

elmo_train, i = [elmo_vectors(tf.convert_to_tensor(x, i)) for x in list_train]
elmo_test, j = [elmo_vectors(tf.convert_to_tensor(x, j)) for x in list_test]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


# Regression

In [53]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [54]:
mor = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))

In [55]:
mor.fit(xtrain, y_sparse_train)

KeyboardInterrupt: 

697.0608496983846  minutes.


In [None]:
predict_mor = mor.predict(xvalid, y_sparse_test)
print(f1_score(y_sparse_test, predict_mor, average='micro'))