In [5]:
import numpy as np
import torch
import tensorflow_hub as tfh
from xml.etree.ElementTree import ElementTree

# Load the posts dump and get an iterator over every <row> element in it.
tree = ElementTree()
tree.parse("data//Posts.xml")
root = tree.getroot()
posts = root.iter("row")

# Split the posts into two lookup tables, each keyed by the post's "Id" string.
questions = {}
answers = {}
for post in posts:
    attrs = post.attrib
    post_type = int(attrs["PostTypeId"])

    if post_type == 1:
        # Question: only kept when it reports at least one answer.
        if int(attrs.get("AnswerCount", 0)) <= 0:
            continue
        questions[attrs["Id"]] = {
            "id": attrs["Id"],
            "title": attrs["Title"],
            "body": attrs["Body"],
            # -1 (an int, unlike the string ids) marks "no accepted answer".
            "accepted": attrs.get("AcceptedAnswerId", -1),
        }
    elif post_type == 2:
        # Answer: "question_id" points back at the parent question's Id.
        answers[attrs["Id"]] = {
            "id": attrs["Id"],
            "question_id": attrs["ParentId"],
            "body": attrs["Body"],
        }

print(f"There are {len(questions)} questions and {len(answers)} answers")

There are 8480 questions and 15906 answers


In [6]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text

# TF-Hub handles for the uncased English BERT encoder (L-12, H-768, A-12)
# and the text preprocessor published alongside it.
BERT_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Wrap both handles as Keras layers; loading them is gonna take a while.
preprocessor = hub.KerasLayer(handle=PREPROCESS_URL)
bert_model = hub.KerasLayer(handle=BERT_URL)


In [7]:
# Smoke-test the embedding pipeline on two throwaway sentences:
# raw strings -> preprocessor -> BERT encoder.
text_test = ['this is such an amazing movie!', "asdasdasd, asdasd"]
text_preprocessed = preprocessor(text_test)
bert_results = bert_model(text_preprocessed)

# Pull out the two outputs we care about so the report below stays readable.
pooled = bert_results["pooled_output"]
sequence = bert_results["sequence_output"]

# For each output, show its shape and a slice of the first sentence's values.
print(f'Pooled Outputs Shape:{pooled.shape}')
print(f'Pooled Outputs Values:{pooled[0, :12]}')
print(f'Sequence Outputs Shape:{sequence.shape}')
print(f'Sequence Outputs Values:{sequence[0, :12]}')

Pooled Outputs Shape:(2, 768)
Pooled Outputs Values:[-0.92169887 -0.3935344  -0.53931653  0.6825622   0.43848443 -0.14021143
  0.8774711   0.2604334  -0.6311292  -0.9999658  -0.26319984  0.85105276]
Sequence Outputs Shape:(2, 128, 768)
Sequence Outputs Values:[[ 0.19451581  0.2514167   0.19075033 ... -0.248451    0.38568538
   0.13291009]
 [-0.5947865  -0.39420372  0.25245705 ... -0.76946753  1.156416
   0.3247575 ]
 [ 0.00641512 -0.15766484  0.5461022  ... -0.17451063  0.60289633
   0.42672294]
 ...
 [ 0.21948352 -0.20927072  0.5386833  ...  0.24693576  0.18250972
  -0.44427046]
 [ 0.01080249 -0.4455315   0.3599099  ...  0.317228    0.23562819
  -0.6307053 ]
 [ 0.2932115  -0.10581924  0.6114754  ...  0.20745865  0.14494647
  -0.35353386]]


In [8]:
# Preview a few parsed questions instead of echoing the whole dict: it holds
# thousands of records with full HTML bodies, which floods the notebook output.
dict(list(questions.items())[:3])


{'1': {'id': '1',
  'title': 'Why does stop VOT duration vary depending on place of articulation?',
  'body': '<p>From the (albeit <em>citation needed</em>) section of the <a href="http://en.wikipedia.org/wiki/Aspirated_consonant" rel="noreferrer">Wikipedia article</a> on aspiration:</p>\n\n<blockquote>\n  <p>Spanish /p t k/, for example, have voice onset times (VOTs) of about 5, 10, and 30 milliseconds, whereas English /p t k/ have VOTs of about 60, 70, and 80 ms. Korean has been measured at 20, 25, and 50 ms for /p t k/ and 90, 95, and 125 for /pʰ tʰ kʰ/.</p>\n</blockquote>\n\n<p>This is also confirmed from my anecdotal explorations in the topic.</p>\n\n<p>The question I have is what causes the different stop consonants to have different VOTs. I couldn\'t find any good linguistic descriptions through some preliminary googling.</p>\n\n<p>The two hypotheses I had were both based on the (my own) idea that voicing begins once air pressure subsides from high stop-like levels to some trigg

In [9]:
from annoy import AnnoyIndex
import random
# Build an approximate-nearest-neighbour index over BERT embeddings of the
# question bodies, then look up the questions closest to a free-text query.
# NOTE(review): the recorded run of this cell failed with
# "No module named 'annoy'" -- the package must be installed in the kernel's
# environment before this cell can run.
# TODO: get rid of the tags and weird symbols
f = 768  # length of one pooled_output vector (the test cell printed shape (2, 768))
BATCH_SIZE = 32  # number of question bodies embedded per BERT call


def embed(texts):
    """Return one pooled BERT vector per input string.

    Args:
        texts: list of raw strings to embed.

    Returns:
        A 2-D NumPy array with one row per input string.
    """
    # The encoder consumes the preprocessor's output, not raw strings, and the
    # output key is "pooled_output" (singular), as used in the test cell.
    return bert_model(preprocessor(texts))["pooled_output"].numpy()


# Freeze the iteration order once so Annoy item ids and the lookup tables agree.
question_list = list(questions.values())
index_to_word = dict(enumerate(question_list))
# Question dicts are unhashable, so key the reverse lookup by question id.
word_to_index = {question["id"]: i for i, question in index_to_word.items()}

t = AnnoyIndex(f, 'angular')
# Embed in batches: one BERT call per question would be needlessly slow.
for start in range(0, len(question_list), BATCH_SIZE):
    batch = question_list[start:start + BATCH_SIZE]
    vectors = embed([question["body"] for question in batch])
    for offset, vector in enumerate(vectors):
        # Annoy expects a flat sequence of f floats per item.
        t.add_item(start + offset, vector.tolist())

t.build(10) # 10 trees
t.save('test.ann')

# ...

# A free-text query is not an item in the index, so embed it and search by
# vector rather than looking it up by item id.
word = "machine"
query_vector = embed([word])[0].tolist()

u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
nearest_i = u.get_nns_by_vector(query_vector, 1000) # up to 1000 nearest neighbors
nearest_word = [(i, index_to_word[i]) for i in nearest_i]
# Show only the closest few; the full ranking stays available in nearest_word.
nearest_word[:10]

ModuleNotFoundError: No module named 'annoy'