# Convolutional Neural Network with Trained Word2Vec Embeddings for Part-Of-Speech Tagging - Demo

#### Necessary Files: latinModel, englishModel, ttokenizer.json, ttokenizerlatin.json, wtokenizer.json, wtokenizerlatin.json

_______________________________
### Step 1: Run the following three cells to prepare some data the model will need.

In [41]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
import io
import json

In [42]:
def accuracy_masked(y_true, y_pred):
    """Accuracy over the positions whose true class index is not 0.

    Both arguments are reduced with an argmax over their last axis, so each
    is expected to carry per-class scores along that axis (presumably one-hot
    tags and softmax outputs, with class 0 used for padding -- TODO confirm
    against the training notebook).

    Args:
        y_true: Tensor of true labels, class scores on the last axis.
        y_pred: Tensor of predictions, class scores on the last axis.

    Returns:
        Scalar tensor: number of correct predictions at positions whose true
        class is non-zero, divided by the number of such positions. The
        denominator is clamped to at least 1, so a batch with no non-zero
        positions yields 0 instead of dividing by zero.
    """
    # TensorFlow ops are used through the already-imported ``tf`` module; a
    # Keras-backend alias ``K`` is never imported in this notebook.
    y_true_class = tf.argmax(y_true, axis=-1)
    y_pred_class = tf.argmax(y_pred, axis=-1)

    # 1 where the true class is non-zero, 0 elsewhere.
    ignore_mask = tf.cast(tf.not_equal(y_true_class, 0), 'int32')
    # Correct predictions, counted only where the mask is 1.
    matches = tf.cast(tf.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
    accuracy = tf.reduce_sum(matches) / tf.maximum(tf.reduce_sum(ignore_mask), 1)
    return accuracy

In [51]:
# Readable names for the tag strings used by the English tagger.
# Keys are the lower-case tag strings; values are the labels that get printed.
tagdictenglish = {
    'conj': 'Conjunction',
    '.': 'Punctuation',
    'propn': 'Proper Noun',
    'num': 'Numeral',
    'adv': 'Adverb',
    'verb': 'Verb',
    'noun': 'Noun',
    'pron': 'Pronoun',
    'adj': 'Adjective',
    'part': 'Particle',
    'det': 'Determiner',
    'x': 'Other',
    'adp': 'Adposition',
}

# Readable names for the tag strings used by the Latin tagger.
# Keys are the lower-case tag strings; values are the labels that get printed.
tagdictlatin = {
    'propn': 'Proper Noun',
    'adv': 'Adverb',
    'x': 'Other',
    'intj': 'Interjection',
    'cconj': 'Coordinating Conjunction',
    'punct': 'Punctuation',
    'det': 'Determiner',
    'adj': 'Adjective',
    'pron': 'Pronoun',
    'sconj': 'Subordinating Conjunction',
    'num': 'Numeral',
    'aux': 'Auxiliary Verb',
    'noun': 'Noun',
    'adp': 'Adposition',
    'verb': 'Verb',
    'part': 'Particle',
}

________________________________
### Step 2: Run the next two cells to load up the models and their Tokenizers.

In [44]:
# Restore the two saved taggers. The custom metric is passed through
# ``custom_objects``, keyed by its function name.
loaded_english_model_tf = tf.keras.models.load_model(
    'englishModel',
    custom_objects={'accuracy_masked': accuracy_masked},
)
loaded_latin_model_tf = tf.keras.models.load_model(
    'latinModel',
    custom_objects={'accuracy_masked': accuracy_masked},
)

In [45]:
# Rebuild the word and tag tokenizers for both languages from their JSON files.
# Each file is parsed with json.load and the result is handed to
# tokenizer_from_json. The files are opened as UTF-8 explicitly so that loading
# does not depend on the platform's default text encoding (which can mangle or
# reject non-ASCII vocabulary entries).
with open('wtokenizer.json', encoding='utf-8') as f:
    data = json.load(f)
    word_tokenizer_english = keras.preprocessing.text.tokenizer_from_json(data)

with open('ttokenizer.json', encoding='utf-8') as f:
    data = json.load(f)
    tag_tokenizer_english = keras.preprocessing.text.tokenizer_from_json(data)

with open('wtokenizerlatin.json', encoding='utf-8') as f:
    data = json.load(f)
    word_tokenizer_latin = keras.preprocessing.text.tokenizer_from_json(data)

with open('ttokenizerlatin.json', encoding='utf-8') as f:
    data = json.load(f)
    tag_tokenizer_latin = keras.preprocessing.text.tokenizer_from_json(data)

______________________________
### Step 3: Run the next two cells to prepare the Part-of-Speech-Tagging Functions.

In [52]:
def output_prediction_english(text):
    """Print the predicted part-of-speech tag for each token of an English sentence.

    Args:
        text: Sentence as a plain string. It is tokenised with NLTK's
            ``word_tokenize`` and encoded with ``word_tokenizer_english``.

    Returns:
        None. The token list and the readable tag names are printed.
    """
    max_len = 50  # sequence length the input is padded/truncated to

    text = [word_tokenize(text)]
    text_encoded = word_tokenizer_english.texts_to_sequences(text)
    text_padded = pad_sequences(text_encoded, maxlen=max_len, padding='pre', truncating='post')
    ynew = np.argmax(loaded_english_model_tf.predict(text_padded), axis=-1)
    prediction = ynew[0]

    # Padding is added on the left, so the encoded tokens sit in the last
    # n_tokens positions. Slicing by that count keeps one tag per encoded
    # token; np.trim_zeros would instead depend on which positions the model
    # happens to label 0 and could drop real tokens or keep pad positions.
    n_tokens = min(len(text_encoded[0]), max_len)
    prediction = prediction[max(len(prediction) - n_tokens, 0):]

    decoded = []
    for tag_id in prediction:
        # Look each id up on its own so a missing entry (e.g. id 0, which a
        # Keras Tokenizer never assigns to a word) cannot shift later tags.
        tag = tag_tokenizer_english.index_word.get(int(tag_id))
        if tag is None:
            decoded.append("Unknown")
        else:
            # Fall back to the raw tag string when it has no readable name.
            decoded.append(tagdictenglish.get(tag, tag))
    print("      Sentence= %s\nPredicted Tags= %s" % (text, decoded))

In [53]:
def output_prediction_latin(text):
    """Print the predicted part-of-speech tag for each token of a Latin sentence.

    Args:
        text: Sentence as a plain string. It is tokenised with NLTK's
            ``word_tokenize`` and encoded with ``word_tokenizer_latin``.

    Returns:
        None. The token list and the readable tag names are printed.
    """
    max_len = 50  # sequence length the input is padded/truncated to

    text = [word_tokenize(text)]
    text_encoded = word_tokenizer_latin.texts_to_sequences(text)
    text_padded = pad_sequences(text_encoded, maxlen=max_len, padding='pre', truncating='post')
    ynew = np.argmax(loaded_latin_model_tf.predict(text_padded), axis=-1)
    prediction = ynew[0]

    # Padding is added on the left, so the encoded tokens sit in the last
    # n_tokens positions. Slicing by that count keeps one tag per encoded
    # token; np.trim_zeros would instead depend on which positions the model
    # happens to label 0 and could drop real tokens or keep pad positions.
    n_tokens = min(len(text_encoded[0]), max_len)
    prediction = prediction[max(len(prediction) - n_tokens, 0):]

    decoded = []
    for tag_id in prediction:
        # Look each id up on its own so a missing entry (e.g. id 0, which a
        # Keras Tokenizer never assigns to a word) cannot shift later tags.
        tag = tag_tokenizer_latin.index_word.get(int(tag_id))
        if tag is None:
            decoded.append("Unknown")
        else:
            # Fall back to the raw tag string when it has no readable name.
            decoded.append(tagdictlatin.get(tag, tag))
    print("      Sentence= %s\nPredicted Tags= %s" % (text, decoded))

___________________________
### Step 4: Create your own sentence in the variable named `english` and run the cell to see its parts of speech. Look up your favorite Latin phrase or create your own (if you know Latin, that is), store it in the variable named `latin`, and run the cell to see its parts of speech.

In [54]:
# Sentence to tag: edit the string, then re-run this cell.
english = "The red cat took a very long walk along the winding river." 
# Prints the tokenised sentence and the predicted tag names.
output_prediction_english(english)

      Sentence= [['The', 'red', 'cat', 'took', 'a', 'very', 'long', 'walk', 'along', 'the', 'winding', 'river', '.']]
Predicted Tags= ['Pronoun', 'Pronoun', 'Pronoun', 'Pronoun', 'Adposition', 'Pronoun', 'Pronoun', 'Pronoun', 'Pronoun', 'Pronoun', 'Pronoun', 'Pronoun', 'Adposition']


In [55]:
# Latin phrase to tag: edit the string, then re-run this cell.
latin = "Forsan et haec olim meminisse iuvabit."
# Prints the tokenised sentence and the predicted tag names.
output_prediction_latin(latin)

      Sentence= [['Forsan', 'et', 'haec', 'olim', 'meminisse', 'iuvabit', '.']]
Predicted Tags= ['Adverb', 'Coordinating Conjunction', 'Adjective', 'Adverb', 'Verb', 'Adjective', 'Punctuation']
