In [11]:
from typing import List
from nltk import word_tokenize

# Test Plan
The general metric we are using in this test plan is that the computer should translate a sentence into a series of emojis that a human can translate back into a single sentence. To do this we will split this into a couple of parts:
   
   1. Generate 100 sentences
   2. Summarize each of the sentences
   3. Take the top 20 sentences, sorted by the certainty score
   4. For each machine translated sentence:
       1. Provide the user with the emojis
       2. Provide the user with an approximate sentence length
       3. Prompt the user to tranlate the emojis into a sentence
   5. For each machine translated sentence-user translated sentence pair:
       1. Calculate the distance between the two sentences using sent2vec (might need another metric)

We then will have a list of scores for each sentence.

### Sentence Generation
The sentences are gathered from the Stanford NLP research group's NMT dataset. The ones that we are currently using are located at https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/tst2012.en. All of these sentences will be loaded into memory and cleaned if neccessary

In [67]:
# Load the sentences
file_path = "data/tst2012.en"
testing_sentences = []
with open(file_path, "r") as sents:
    testing_sentences = [sent for sent in sents]
            
# Filter the sentences based on less than ten words
word_limit_lower = 5
word_limit_upper = 5
testing_sentences = list(filter(lambda sent: len(word_tokenize(sent)) <= word_limit_upper and 
                                             len(word_tokenize(sent)) >= word_limit_lower, testing_sentences))

# Some of the sentences have &apos; instead of ' but our algorithm doesn't handle that so replace with
# regular "'"
testing_sentences = [testing_sentence.replace("&apos;", "'") for testing_sentence in testing_sentences]

# We want 100 sentences at least
print(f"{len(testing_sentences)} sentences in dataset")

38 sentences in dataset


### Sentence Summarization
To do this we will just be using an exported Python V1 program that is just the NaiveEmojiTranslation notebook exported to .py. We summarize with the current best known params based on some limited observation. The sentence will be summarized using the best currently known parameters, and then the summaries scored based on the scoring function.

In [80]:
# cosine distance gives warnings when div by 0
import warnings; warnings.simplefilter('ignore')
# Exported NaiveEmojiTranslation to Python file as of October 24th
from NaiveEmojiTranslation_V1 import summarize, lemmatizerNLTK
# Sort the sentences by their uncertainty scores. This is imported as a generic scoring
# function so that it can be swapped in and out easily
from NaiveEmojiTranslation_V1 import score_summarization_result_average as scoring_function

# JUST FOR TESTING ONLY USE TEN
testing_sentences = testing_sentences[:10]

# Summarize each testing sentence with the current best known parameters
summarized_sentences = []
for sentence in testing_sentences:
    summarized_sentences.append(summarize(sentence, keep_stop_words=True, 
                                  lemma_func=lemmatizerNLTK.lemmatize, scoring_func=scoring_function))
    
# Sort the list by the scoring function
summarized_sentences_sorted = list(sorted(summarized_sentences, key=scoring_function))

# Choose only the top 30 summaries
testing_summaries = summarized_sentences_sorted[:30]

# User Input

In [121]:
from NaiveEmojiTranslation_V1 import EmojiSummarizationResult
from dataclasses import dataclass

@dataclass
class UserSummarization:
    machine_summarization: EmojiSummarizationResult
    user_guess: str = ""
    difference: float = -1

user_summaries = []
for summary in summarized_sentences_sorted:
    print(f"Emoji Sequence: {summary.emojis}")
    print("Sentence Length: {}".format(len(word_tokenize(" ".join(summary.n_grams)))))
    
    translation = input("What's your translation?")
    
    user_summaries.append(UserSummarization(summary, translation))
    print(user_summaries[-1])
    break

Emoji Sequence: 🙏
Sentence Length: 4


What's your translation? thank you


UserSummarization(machine_summarization=EmojiSummarizationResult(emojis='🙏', n_grams=['thank you very much'], uncertainty_scores=[0.15913492441177368], elapsed_time=0), user_guess='thank you', difference=-1)


# Scoring

In [86]:
# Initialize the sent2vec model
import sent2vec
s2v = sent2vec.Sent2vecModel()
s2v.load_model('../models/wiki_unigrams.bin') # https://drive.google.com/open?id=0B6VhzidiLvjSa19uYWlLUEkzX3c

In [128]:
from scipy.spatial.distance import cosine # Distance between sentence and emoji in sent2vec vector space

for user_summary in user_summaries:
    user_emb, mach_emb = s2v.embed_sentences([user_summary.user_guess, " ".join(user_summary.machine_summarization.n_grams)])
    user_summary.difference = cosine(user_emb, mach_emb)
    print("User guessed: {}\nSummary Input: {}\nDifference: {}".format(user_summary.user_guess, " ".join(user_summary.machine_summarization.n_grams), user_summary.difference))
    
print("Average cosine difference ", sum([user_summary.difference for user_summary in user_summaries]) / len(user_summaries))

User guessed: thank you
Summary Input: thank you very much
Difference: 0.15913492441177368
Average cosine difference  0.15913492441177368
