In [2]:
import numpy as np
import torch as pt
import pandas as pd
import json
import string

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def jsonlToDataset(jsonl_path):
    dataset = []
    with open(jsonl_path, 'r') as json_file:
        json_list = list(json_file)
    for json_str in json_list:
        dataset.append(json.loads(json_str))
    return dataset

In [4]:
train_data_path = "D:/Deep_learning_laboratory/assignment_2/assignment_2_data/train.jsonl"
test_data_path = "D:/Deep_learning_laboratory/assignment_2/assignment_2_data/test.jsonl"
train_dataset = jsonlToDataset(train_data_path)
test_dataset = jsonlToDataset(test_data_path)

In [5]:
assert len(train_dataset) == 40398
assert len(test_dataset) == 1267

In [7]:
train_dataset[0]

{'qID': '3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2',
 'sentence': "Ian volunteered to eat Dennis's menudo after already having a bowl because _ despised eating intestine.",
 'option1': 'Ian',
 'option2': 'Dennis',
 'answer': '2'}

In [23]:
#idea: use RNN (plain). we need data = (sentence, option1, option2), label = (answer)

In [None]:
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

from gensim.models import KeyedVectors
wv = KeyedVectors.load("D:/Deep_learning_laboratory/assignment_2/word2vec-google-news-300", mmap='r')

In [12]:
# CODE TO DOWNLOAD WORD2VEC MODEL. Once downloaded, you can load weights from the folder you saved them into
#import gensim.downloader as api
#wv = api.load('word2vec-google-news-300')
#wv.save("word2vec-google-news-300")

In [13]:
pairs = [
    ('car', 'minivan'),   
    ('car', 'bicycle'),   
    ('car', 'airplane'),  
    ('car', 'cereal'),    
    ('capitalism', 'communism'), # 60% WTF
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'capitalism'	'communism'	0.60


### Create a sample Dataset and remove punctuation

In [43]:
sample_dataset = []
for i in range(80):
    sample_dataset.append(train_dataset[i]['sentence'].split(maxsplit=-1))

In [44]:
def remove_punctuation(dataset):
    punctuation_to_remove = string.punctuation
    punctuation_to_remove = punctuation_to_remove.replace("_","")
    for i in range(len(dataset)):
        dataset[i] = dataset[i].translate(str.maketrans('', '', punctuation_to_remove))

In [45]:
remove_punctuation(sample_dataset[0])
sample_dataset[0]

Ian volunteered to eat Dennis's menudo after already having a bowl because _ despised eating intestine.
Ian volunteered to eat Dennis's menudo after already having a bowl because _ despised eating intestine.


### Stemming or Not?
Do we need to stem words before feeding them to the Word2Vec model? Better not, the result below shows us that stemming.

In [117]:
from nltk.stem import PorterStemmer
sample_dataset_stemmed = []
def preprocess_words(words):
    processed_words = []
    stemmer=PorterStemmer()
    for word in words:
        processed_words.append(stemmer.stem(word))
    return processed_words
for i in range(80):
    sample_dataset_stemmed.append(preprocess_words(sample_dataset[i]))

In [118]:
def number_of_words_of_phrase_not_in_Word2Vec(phrase):
    numberOfWordsNotInWord2Vec = 0
    for s in stri:
        try:
            result.append(wv[s])
        except KeyError:
            numberOfWordsNotInWord2Vec = numberOfWordsNotInWord2Vec + 1
    return numberOfWordsNotInWord2Vec

numberOfWordsNotInWord2Vec = 0
numberOfStemmedWordsNotInWord2Vec = 0

for i in range(80):
    numberOfWordsNotInWord2Vec += number_of_words_of_phrase_not_in_Word2Vec(sample_dataset[i])
    numberOfStemmedWordsNotInWord2Vec += number_of_words_of_phrase_not_in_Word2Vec(sample_dataset_stemmed[i]) 

print("There are " + str(numberOfWordsNotInWord2Vec) + " missing words, and " + str(numberOfStemmedWordsNotInWord2Vec) + " missing stemmed words in the Word2Vec model")

#TODO
# word "to" is missing. What to do with missing words?
# Dennis is reconognized, while Dennis's is not. We need to preprocess the data.

There are 320 missing words, and 320 missing stemmed words in the Word2Vec model


### Since there are the same number of stemmed words missing in Word2Vec as the number of unstemmed words, stemming won't be used
## How to handle missing words? (Words that are not present in Word2Vec model)

In [None]:
sample_phrase = sample_dataset[i]
sample_phrase

'K'

In [112]:
shape = wv['Ian',].shape
a = pt.FloatTensor(shape[0], shape[1])
a.shape

torch.Size([1, 300])

In [145]:
sample_phrase_embeddings = []
for word in sample_phrase:
    try:
        sample_phrase_embeddings.append(pt.from_numpy(wv[word]))
    except KeyError:
        sample_phrase_embeddings.append(pt.FloatTensor(shape[0], shape[1])) #random distribution

### Given a sample data turn into a format fit for a classification task
Example:

("a man _ a beard", opt1="has", opt2="have", label=0), --- label=i, means use option i

turns into

("a man has a beard", label=1), ("a man have a beard", label=0), --- label=0 means the phrase is wrong, label=1 means the phrase is true

In [29]:
#Example
train_dataset[0]

{'qID': '3QHITW7OYO7Q6B6ISU2UMJB84ZLAQE-2',
 'sentence': "Ian volunteered to eat Dennis's menudo after already having a bowl because _ despised eating intestine.",
 'option1': 'Ian',
 'option2': 'Dennis',
 'answer': '2'}

In [15]:
def writeOptionInPhrase(phrase, option):
    return phrase.replace("_", option)
writeOptionInPhrase(train_dataset[0]["sentence"], train_dataset[0]["option2"])

"Ian volunteered to eat Dennis's menudo after already having a bowl because Dennis despised eating intestine."

In [37]:
def preprocess_data(dataset):
    preprocessed_data = []
    preprocessed_labels = []
    for element in dataset:
        preprocessed_data.append(writeOptionInPhrase(element["sentence"], element["option1"]))
        preprocessed_labels.append(1 if int(element["answer"]) == 1 else 0)
        preprocessed_data.append(writeOptionInPhrase(element["sentence"], element["option2"]))
        preprocessed_labels.append(1 if int(element["answer"]) == 2 else 0)
    return preprocessed_data, preprocessed_labels
preprocessed_data, preprocessed_labels = preprocess_data(train_dataset)


In [38]:
#our preprocessed dataset contains double the phrases of the original
assert len(preprocessed_data) == 2*len(train_dataset)
#the first phrase became:
#("Ian volunteered to eat Dennis's menudo after already having a bowl because Ian despised eating intestine.", 0)
assert preprocessed_labels[0] == 0
#("Ian volunteered to eat Dennis's menudo after already having a bowl because Dennis despised eating intestine.", 1)
assert preprocessed_labels[1] == 1

### Now we can build a Generator class, to apply the previous steps to the whole dataset
* x_data will contain all the embeddings of each phrase. Each phrase of n words is embedded as a list of n vectors with dimensionality 300
* y_label will contain all possible labels. If the phrase is correct, then label=1, else label=0

Oss: as explained above, we didn't use Stemming

In [125]:
class Generator:
    x_data = []
    y_label = []
    embedding_shape = ()
    word2Vec = []
    
    
    def __init__(self, path_to_data):
        dataset = jsonlToDataset(path_to_data)
        self.remove_punctuation(dataset)
        preprocessed_data, self.y_label = self.preprocess_data(dataset)
        self.remove_list(dataset)
        self.initialize_word2vec()
        self.embed_dataset(preprocessed_data)
        self.remove_list(preprocessed_data)
        
    def jsonlToDataset(self, jsonl_path):
        dataset = []
        with open(jsonl_path, 'r') as json_file:
            json_list = list(json_file)
        for json_str in json_list:
            dataset.append(json.loads(json_str))
        return dataset
    
    def remove_punctuation(self, dataset):
        for i in range(len(dataset)):
            dataset[i]["sentence"] = self.remove_punctuation_helper(
                dataset[i]["sentence"])
            
    def remove_punctuation_helper(self, phrase):
        punctuation_to_remove = string.punctuation
        punctuation_to_remove = punctuation_to_remove.replace("_","")
        return phrase.translate(str.maketrans('', '', punctuation_to_remove))
    
    def preprocess_data(self, dataset):
        preprocessed_data = []
        preprocessed_labels = []
        for element in dataset:
            preprocessed_data.append(writeOptionInPhrase(element["sentence"], element["option1"]))
            preprocessed_labels.append(1 if int(element["answer"]) == 1 else 0)
            preprocessed_data.append(writeOptionInPhrase(element["sentence"], element["option2"]))
            preprocessed_labels.append(1 if int(element["answer"]) == 2 else 0)
        return preprocessed_data, preprocessed_labels
    
    def initialize_word2vec(self):
        from gensim.models import KeyedVectors
        self.word2Vec = KeyedVectors.load("D:/Deep_learning_laboratory/assignment_2/word2vec-google-news-300", mmap='r')
        self.embedding_shape = self.word2Vec['Ian',].shape
    
    def embed_dataset(self, preprocessed_data):
        embedded_data = []
        for phrase in preprocessed_data:
            self.x_data.append(self.phrase_embedding(phrase.split(maxsplit=-1)))
            
    def phrase_embedding(self, phrase):
        sample_phrase_embeddings = []
        for word in phrase:
            try:
                sample_phrase_embeddings.append(pt.from_numpy(self.word2Vec[word]))
            except KeyError:
                sample_phrase_embeddings.append(pt.FloatTensor(self.embedding_shape[0], self.embedding_shape[1])) #random distribution
        return sample_phrase_embeddings
    
    def remove_list(self, list_obj):
        del list_obj[:]
        del list_obj
            
    
gen = Generator(train_data_path)

In [128]:
len(gen.x_data[0])

16

## Model: Plain RNN
The network will be a plain RNN. Given a phrase our model will output whether the phrase is good or not. Half of our training set contains correct phrases, and the other half wrong ones (by construction), so the dataset is balanced.

## Training

## Evaluation