In [1]:
import numpy as np
import torch
import tensorflow_hub as tfh
from xml.etree.ElementTree import ElementTree
from collections import defaultdict
tree = ElementTree()
tree.parse("data//Posts.xml")
root = tree.getroot()
posts = root.iter("row")

questions = {}
answers = {}
answer_to_question_map = defaultdict(int)
# first preprocess the data and split them into questions and answers
for post in posts:
      if int(post.attrib["PostTypeId"]) == 1:
            if "AnswerCount" in post.attrib and int(post.attrib["AnswerCount"]) > 0:
                  # question
                  this_question = {
                        "id": int(post.attrib["Id"]),
                        "title": post.attrib["Title"],
                        "body": post.attrib["Body"],
                        "accepted": post.attrib["AcceptedAnswerId"] if "AcceptedAnswerId" in post.attrib else -1  
                  }
                  questions[int(post.attrib["Id"])] = this_question
      elif int(post.attrib["PostTypeId"]) == 2:
            # answer
            this_answer = {
                  "id": int(post.attrib["Id"]),
                  "question_id": post.attrib["ParentId"],
                  "body": post.attrib["Body"],
            }
            answers[int(post.attrib["Id"])] = this_answer
            answer_to_question_map[int(post.attrib["ParentId"])] = int(post.attrib["Id"])
# print(questions)
# print(answers)

print(f"There are {len(questions)} questions and {len(answers)} answers")
print("Answer at id",answer_to_question_map[questions[14]["id"]], "has the answer to question at id 14")
print("Question at id 14:", questions[14]["body"])
print("Answer at id", answer_to_question_map[questions[14]["id"]], ":", answers[answer_to_question_map[questions[9]["id"]]]["body"])


  from .autonotebook import tqdm as notebook_tqdm


KeyError: 'Body'

In [83]:
# we clean the text here and print out the same question and answer before (question 9 and answer 388)
from bs4 import BeautifulSoup as bs
# we parse the questions body so that it doesn't contain html tags and html symbols
test_q = questions[14]["body"]
test_a = answers[689]["body"]
test_q = bs(test_q, "lxml").text
test_a = bs(test_a, "lxml").text

print("Question at index 9:", test_q)
print("Answer at index", answer_to_question_map[questions[9]["id"]], ":", test_a)
# Notice 
# 1) The html tags are now gone (including <a href...>)
# 2) HTML symbols are converted to their proper symbols &#8709 -> ∅

Question at index 9: I know that an intransitive verb accepts only one argument, i.e., the subject. So, such verbs do not need a complement. But how could one understand the concept of an intransitive complementizer, in Chomskyan syntax? After all, isn’t the very nature of complementizers to accept a complement?
Edit: According to Andrew Radford, "none of the English finite complementisers (e.g. if, that, that and the null finite complementiser ∅ found in main clauses) are transitive".  So, in what sense is the complementizer that, for example, intransitive? How does it differ from other complementizers that could be classified as transitive in other languages?

Answer at index 14856 : This question was based on the assumption that the transitivity of a complementizer was some inherent feature of it. One of the consequences of that feature would be that the closest nominal c-commanded by that complementizer would be given accusative case. So, if there was such kind of complementizers i

In [84]:
# Now we clean up the text for real
from bs4 import BeautifulSoup as bs

# first question
for id, value in questions.items():
    value["body"] = bs(value["body"], "lxml").text
    questions[id] = value
# then answer
for id, value in answers.items():
    value["body"] = bs(value["body"], "lxml").text
    answers[id] = value
    

In [94]:
print(questions[1]["body"]) #all clean now!
print(answers[7]["body"]) #all clean now!


From the (albeit citation needed) section of the Wikipedia article on aspiration:

Spanish /p t k/, for example, have voice onset times (VOTs) of about 5, 10, and 30 milliseconds, whereas English /p t k/ have VOTs of about 60, 70, and 80 ms. Korean has been measured at 20, 25, and 50 ms for /p t k/ and 90, 95, and 125 for /pʰ tʰ kʰ/.

This is also confirmed from my anecdotal explorations in the topic.
The question I have is what causes the different stop consonants to have different VOTs. I couldn't find any good linguistic descriptions through some preliminary googling.
The two hypotheses I had were both based on the (my own) idea that voicing begins once air pressure subsides from high stop-like levels to some trigger point.
The first hypothesis was that air from a far back pressure release has to travel farther and through more obstructed corridors to reach the open air. Whereas air from a /p/ release can immediately and quickly reach the open, air from a /k/ release has to go all t

In [2]:
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text

# Load the Preprocessor and Bert models, this is gonna take a while
# we are loading the version that automatically lowercase the words for us
BERT_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
preprocessor = hub.KerasLayer(PREPROCESS_URL)
bert_model =  hub.KerasLayer(BERT_URL)


In [11]:
# Testing the embedding 
text_test = ['this is such an amazing movie!', "asdasdasd, asdasd"]

text_preprocessed = preprocessor(text_test)
bert_results = bert_model(text_preprocessed)
    

print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Pooled Outputs Shape:(2, 768)
Pooled Outputs Values:[-0.9216987  -0.39353448 -0.5393166   0.6825621   0.43848443 -0.14021152
  0.8774711   0.26043335 -0.63112926 -0.9999658  -0.26319999  0.8510528 ]
Sequence Outputs Shape:(2, 128, 768)
Sequence Outputs Values:[[ 0.19451573  0.25141695  0.1907506  ... -0.24845074  0.38568527
   0.1329099 ]
 [-0.5947868  -0.3942031   0.25245702 ... -0.769467    1.1564163
   0.32475683]
 [ 0.00641491 -0.15766448  0.5461023  ... -0.17451026  0.60289633
   0.4267228 ]
 ...
 [ 0.21948312 -0.20927079  0.5386832  ...  0.24693549  0.18250962
  -0.44427082]
 [ 0.01080245 -0.4455314   0.35991004 ...  0.31722838  0.2356281
  -0.630706  ]
 [ 0.29321173 -0.10581895  0.6114755  ...  0.20745848  0.1449465
  -0.35353366]]


In [95]:
from annoy import AnnoyIndex
# documentation: https://github.com/spotify/annoy
import random

# helper function for encoding a sentence into bert embedding
def encode_sentence(sentence):
    text_preprocessed = preprocessor([sentence])
    bert_results = bert_model(text_preprocessed)
    return bert_results["pooled_output"][0]

# 768 because that's the dimension of bert 
# testing 
f = 768
assert(encode_sentence("This is a test sentence").shape == 768)
t = AnnoyIndex(f, 'angular')
t.set_seed(487)
test_list = ["I love pizza", "This is a test", "487 is fun"]
for i, sentence in enumerate(test_list):
    
    print(i)
    v = encode_sentence(sentence)
    t.add_item(i, v)
t.build(10)
t.save("test.ann")
test_sentence = "i LOVE BURGER!"
# t.load()
k = 1
closest_index = t.get_nns_by_vector(encode_sentence(test_sentence), 1)
print(closest_index)


0
1
2
[0]


[0]


In [96]:
# Building our "vocabulary" for real now
t = AnnoyIndex(f, 'angular')
t.set_seed(487)
index = 0
for id, sentence in questions.items():
    print(index) # just tracking progress since this takes hours to train
    v = encode_sentence(sentence["body"])
    t.add_item(id, v)
    index += 1

t.build(10) # build 10 trees, allegedly the more the better  
try:
    t.save('bert_embedding.ann')
except OSError as e:
    print(e) # might run out of space if this file is too large
# this cell add the sentence bert embedding of every body in our question field 
# to annoy, we test it in the next cell

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

KeyboardInterrupt: 

In [None]:
# set a seed 
t.set_seed(487)
# we need this to tell what the original sentences are 

t.build(10) # build 10 trees, allegedly the more the better  
t.save('bert_embedding_unnormalize.ann')

# Find k-nearest neighbor
k = 3 # first set k = 5
test_sentence = "Why did Old English lose both thorn and eth?" # one of the questions on linguistic stack exchange
test_sentence_vector = encode_sentence(test_sentence)

f = 768
u = AnnoyIndex(f, 'angular')
u.load('bert_embedding.ann') 
nearest_i = u.get_nns_by_vector(test_sentence_vector, k) # will return the index of the k nearest neighbor
nearest_word = [(i, questions[i]) for i in nearest_i] # 
nearest_word