<a href="https://colab.research.google.com/github/Elena-Lebedeva/drafts/blob/main/Codecademy_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TEXT PREPROCESSING
Noise Removal

The .sub() method has three required arguments:

pattern – a regular expression that is searched for in the input string. There must be an r preceding the string to indicate it is a raw string, which treats backslashes as literal characters.
replacement_text – text that replaces all matches in the input string
input – the input string that will be edited by the .sub() method
The method returns a string with all instances of the pattern replaced by the replacement_text.

In [None]:
import re 
text = "<p>    This is a paragraph</p>"
# Strip the opening and closing paragraph tags. `/?` makes only the slash
# optional, so exactly <p> and </p> are removed; a `.` in that position would
# accept any character and also strip unrelated tags such as <ap>.
result = re.sub(r'</?p>', '', text)
print(result)
#    This is a paragraph

    This is a paragraph


In [None]:
# \s -  indicates a single whitespace character.
# \w stands for “word character”. It always matches the ASCII characters [A-Za-z0-9_]. 
# Notice the inclusion of the underscore and digits.

In [None]:
text = "    This is a paragraph" 
result = re.sub(r'\s{4}', '', text)
print(result) 
# This is a paragraph

In [None]:
headline_one = '<h1>Nation\'s Top Pseudoscientists Harness High-Energy Quartz Crystal Capable Of Reversing Effects Of Being Gemini</h1>'
tweet = '@fat_meats, veggies are better than you think.'
headline_no_tag = re.sub(r'</?h1>', '', headline_one)
tweet_no_at = re.sub(r'@', '', tweet)

In [None]:
# Tokenization
from nltk.tokenize import word_tokenize
 
text = "Tokenize this text"
tokenized = word_tokenize(text)
 
print(tokenized)
# ["Tokenize", "this", "text"]

In [None]:
# To tokenize at the sentence level, we can use sent_tokenize() from the same module.
from nltk.tokenize import sent_tokenize
 
text = "Tokenize this sentence. Also, tokenize this sentence."
tokenized = sent_tokenize(text)
 
print(tokenized)
# ['Tokenize this sentence.', 'Also, tokenize this sentence.']

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

ecg_text = 'An electrocardiogram is used to record the electrical conduction through a person\'s heart. The readings can be used to diagnose cardiac arrhythmias.'

tokenized_by_word = word_tokenize(ecg_text)
tokenized_by_sentence = sent_tokenize(ecg_text)

Normalization

In [None]:
my_string = 'tHiS HaS a MiX oF cAsEs'
 
print(my_string.upper())
# 'THIS HAS A MIX OF CASES'
 
print(my_string.lower())
# 'this has a mix of cases'

In [None]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 

In [None]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 
nbc_statement = "NBC was founded in 1926 making it the oldest major broadcast network in the USA"
 
word_tokens = word_tokenize(nbc_statement) 
# tokenize nbc_statement
 
statement_no_stop = [word for word in word_tokens if word not in stop_words]
 
print(statement_no_stop)
# ['NBC', 'founded', '1926', 'making', 'oldest', 'major', 'broadcast', 'network', 'USA']

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

survey_text = 'A YouGov study found that American\'s like Italian food more than any other country\'s cuisine.'
tokenized_survey = word_tokenize(survey_text)
text_no_stops = [word for word in tokenized_survey if word not in stop_words]

Stemming
Stemming is the text preprocessing normalization task concerned with bluntly removing word affixes (prefixes and suffixes). For example, stemming would cast the word “going” to “go”. This is a common method used by search engines to improve matching between user input and website hits.

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
tokenized = ['NBC', 'was', 'founded', 'in', '1926', '.', 'This', 'makes', 'NBC', 'the', 'oldest', 'major', 'broadcast', 'network', '.']
 
stemmed = [stemmer.stem(token) for token in tokenized]
 
print(stemmed)
# ['nbc', 'wa', 'found', 'in', '1926', '.', 'thi', 'make', 'nbc', 'the', 'oldest', 'major', 'broadcast', 'network', '.']

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

populated_island = 'Java is an Indonesian island in the Pacific Ocean. It is the most populated island in the world, with over 140 million people.'
island_tokenized = word_tokenize (populated_island)
stemmed = [stemmer.stem(token) for token in island_tokenized]

Lemmatization is a method for casting words to their root forms. This is a more involved process than stemming, because it requires the method to know the part of speech for each word. Since lemmatization requires the part of speech, it is a less efficient approach than stemming.

In [None]:
# Create the lemmatizer in this cell so it runs on its own: no earlier cell
# defines `lemmatizer`, and using it undefined raises NameError.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

tokenized = ["NBC", "was", "founded", "in", "1926"]

# No part of speech is passed, so lemmatize() falls back to its default
# (noun); that is why the verb "was" comes out as "wa" below.
lemmatized = [lemmatizer.lemmatize(token) for token in tokenized]

print(lemmatized)
# ["NBC", "wa", "founded", "in", "1926"]

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'

tokenized_string = word_tokenize(populated_island)

lemmatized_words = [lemmatizer.lemmatize(token) for token in tokenized_string]

Part-of-Speech Tagging
To improve the performance of lemmatization, we need to find the part of speech for each word in our string. Below, we create a part-of-speech tagging function. The function accepts a word, then returns the most common part of speech for that word.

In [None]:
from nltk.corpus import wordnet
from collections import Counter
# Inside of our function, we use the wordnet.synsets() function to get a set of synonyms for the word:
# The returned synonyms come with their part of speech.
def get_part_of_speech(word):
  """Return the part of speech most common among the WordNet synsets of `word`.

  The result is one of "n" (noun), "v" (verb), "a" (adjective) or "r"
  (adverb). Ties go to the first key inserted, so a word with no synsets
  yields "n".
  """
  probable_part_of_speech = wordnet.synsets(word)

  # Next, we create a Counter() object and set each value to the count of the
  # number of synonyms that fall into each part of speech:
  pos_counts = Counter()
  # This line counts the number of nouns in the synonym set.
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  # The remaining parts of speech are counted the same way.
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )

  # Now that we have a count for each part of speech,
  # we can use the .most_common() counter method to find and return the most likely part of speech:
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

# Now that we can find the most probable part of speech for a given word,
# we can pass this into our lemmatizer when we find the root for each word.
tokenized = ["How", "old", "is", "the", "country", "Indonesia"]

lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]

print(lemmatized)
# ['How', 'old', 'be', 'the', 'country', 'Indonesia']
# Previously: ['How', 'old', 'is', 'the', 'country', 'Indonesia']

In [None]:
import nltk
from nltk.corpus import wordnet
from collections import Counter

def get_part_of_speech(word):
  """Return the WordNet part of speech ("n", "v", "a" or "r") that occurs
  most often among the synsets of `word`.

  The tally is seeded in n, v, a, r order, so when counts tie (including
  when the word has no synsets at all) the earliest of those tags wins.
  """
  tally = Counter({"n": 0, "v": 0, "a": 0, "r": 0})

  for synset in wordnet.synsets(word):
    tag = synset.pos()
    # Only the four seeded tags are counted; any other tag is ignored.
    if tag in tally:
      tally[tag] += 1

  return tally.most_common(1)[0][0]

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from part_of_speech import get_part_of_speech

lemmatizer = WordNetLemmatizer()

populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'

tokenized_string = word_tokenize(populated_island)
# Lemmatize each token using its most likely part of speech.
lemmatized_pos = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized_string]


# Only a missing `lemmatized_pos` variable is an expected failure here, so
# catch NameError specifically instead of hiding every possible exception.
try:
  print(f'The lemmatized words are: {lemmatized_pos}')
except NameError:
  print('Expected a variable called `lemmatized_pos`')

PARSING WITH REGULAR EXPRESSIONS

Compiling and Matching
The re module's .compile() method takes a regular expression pattern as an argument and compiles the pattern into a regular expression object, which you can later use to find matching text. The regular expression object below will exactly match 4 upper or lower case letters.

Regular expression objects have a .match() method that takes a string of text as an argument and looks for a single match to the regular expression that starts at the beginning of the string.

If .match() finds a match that starts at the beginning of the string, it will return a match object. The match object lets you know what piece of text the regular expression matched, and at what index the match begins and ends. If there is no match, .match() will return None.

With the match object stored in result, you can access the matched text by calling result.group(0). If you use a regex containing capture groups, you can access these groups by calling .group() with the appropriately numbered capture group as an argument.

Instead of compiling the regular expression first and then looking for a match in separate lines of code, you can simplify your match to one line:

In [None]:
regular_expression_object = re.compile("[A-Za-z]{4}")
result = regular_expression_object.match("Toto")
result = re.match("[A-Za-z]{4}","Toto")
# With this syntax, re's .match() method takes a regular expression pattern as the first argument and a string as the second argument

In [None]:
import re

# characters are defined
character_1 = "Dorothy"
character_2 = "Henry"

# compile your regular expression here
regular_expression = re.compile("[A-Za-z]{7}")

# check for a match to character_1 here
result_1 = regular_expression.match(character_1)
print(result_1)

# store and print the matched text here
match_1 = result_1.group(0)
print(match_1)

# compile a regular expression to match a 7 character string of word characters and check for a match to character_2 here
result_2 = re.match("[A-Za-z]{7}",character_2)
print(result_2)

The .search() method: unlike .match(), which will only find matches at the start of a string, .search() will look left to right through an entire piece of text and return a match object for the first match to the regular expression given. If no match is found, .search() will return None.
Given a regular expression as its first argument and a string as its second argument, .findall() will return a list of all non-overlapping matches of the regular expression in the string. 

In [None]:
# Raw string so the backslash in \w reaches the regex engine untouched; in a
# normal string literal "\w" is an invalid escape sequence.
result = re.search(r"\w{8}","Are you a Munchkin?")

In [None]:
import re

# import L. Frank Baum's The Wonderful Wizard of Oz
# The with-block closes the file handle as soon as the text has been read.
with open("the_wizard_of_oz_text.txt",encoding='utf-8') as oz_file:
  oz_text = oz_file.read().lower()

# search oz_text for an occurrence of 'wizard' here
found_wizard = re.search("wizard",oz_text)
print(found_wizard)
# collect every non-overlapping occurrence of 'lion', then count them
all_lions = re.findall("lion", oz_text)
print(all_lions)
number_lions = len(all_lions)
print(number_lions)

Noun: the name of a person (Ramona, class), place, thing (textbook), or idea (NLP)
Pronoun: a word used in place of a noun (her, she)
Determiner: a word that introduces, or “determines”, a noun (the)
Verb: expresses action (studying) or being (are, has)
Adjective: modifies or describes a noun or pronoun (new)
Adverb: modifies or describes a verb, an adjective, or another adverb (happily)
Preposition: a word placed before a noun or pronoun to form a phrase modifying another word in the sentence (on)
Conjunction: a word that joins words, phrases, or clauses (and)
Interjection: a word used to express emotion (Wow).

You can automate the part-of-speech tagging process with nltk's pos_tag() function! The function takes one argument, a list of words in the order they appear in a sentence, and returns a list of tuples, where the first entry in the tuple is a word and the second is the part-of-speech tag.

In [None]:
import nltk
from nltk import pos_tag
from word_tokenized_oz import word_tokenized_oz

# save and print the sentence stored at index 100 in word_tokenized_oz here
witches_fate = word_tokenized_oz[100]
print(witches_fate)

# create a list to hold part-of-speech tagged sentences here
pos_tagged_oz = []

# create a for loop through each word tokenized sentence in word_tokenized_oz here
for word_tokenized_sentence in word_tokenized_oz:
    part_of_speech = pos_tag(word_tokenized_sentence)
    pos_tagged_oz.append(part_of_speech)

  # part-of-speech tag each sentence and append to pos_tagged_oz here
  

# store and print the 101st part-of-speech tagged sentence here
witches_fate_pos = pos_tagged_oz[100]
print(witches_fate_pos)

This technique of grouping words by their part-of-speech tag is called chunking.
With chunking in nltk, you can define a pattern of parts-of-speech tags using a modified notation of regular expressions. You can then find non-overlapping matches, or chunks of words, in the part-of-speech tagged sentences of a text.

The regular expression you build to find chunks is called chunk grammar. A piece of chunk grammar can be written as follows:

In [None]:
chunk_grammar = "AN: {<JJ><NN>}"

AN is a user-defined name for the kind of chunk you are searching for. You can use whatever name makes sense given your chunk grammar. In this case AN stands for adjective-noun.

A pair of curly braces {} surround the actual chunk grammar
<JJ> operates similarly to a regex character class, matching any adjective
<NN> matches any noun, singular or plural.

The chunk grammar above will thus match any adjective that is followed by a noun.

To use the chunk grammar defined, you must create a nltk RegexpParser object and give it a piece of chunk grammar as an argument.

In [None]:
# RegexpParser must be imported before its first use; no earlier cell brings
# it into scope.
from nltk import RegexpParser

chunk_parser = RegexpParser(chunk_grammar)

You can then use the RegexpParser object’s .parse() method, which takes a list of part-of-speech tagged words as an argument, and identifies where such chunks occur in the sentence!

Consider the part-of-speech tagged sentence below:

In [None]:
pos_tagged_sentence = [('where', 'WRB'), ('is', 'VBZ'), ('the', 'DT'), ('emerald', 'JJ'), ('city', 'NN'), ('?', '.')]

In [None]:
chunked = chunk_parser.parse(pos_tagged_sentence)

In [None]:
from nltk import RegexpParser, Tree
from pos_tagged_oz import pos_tagged_oz

# define adjective-noun chunk grammar here
chunk_grammar = "AN: {<JJ><NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk the pos-tagged sentence at index 282 in pos_tagged_oz here

scaredy_cat = chunk_parser.parse(pos_tagged_oz[282])
print(scaredy_cat)
# pretty_print the chunked sentence here
Tree.fromstring(str(scaredy_cat)).pretty_print()
print(pos_tagged_oz[282])

Chunking Noun Phrases or NP-chunking

In [None]:
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

NP is the user-defined name of the chunk you are searching for. In this case NP stands for noun phrase
<DT> matches any determiner
? is an optional quantifier, matching either 0 or 1 determiners
<JJ> matches any adjective
"*" is the Kleene star quantifier, matching 0 or more occurrences of an adjective
<NN> matches any noun, singular or plural

In [None]:
from nltk import RegexpParser
from pos_tagged_oz import pos_tagged_oz
from np_chunk_counter import np_chunk_counter

# define noun-phrase chunk grammar here
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk every pos-tagged sentence in pos_tagged_oz, building the list of
# noun-phrase chunked sentences in a single pass
np_chunked_oz = [chunk_parser.parse(sentence) for sentence in pos_tagged_oz]

# store and print the most common np-chunks here
# NOTE: np_chunk_counter function was already built
most_common_np_chunks = np_chunk_counter(np_chunked_oz)
print(most_common_np_chunks)

Chunking Verb Phrases
The first structure begins with a verb VB of any tense, followed by a noun phrase, and ends with an optional adverb RB of any form. The second structure switches the order of the verb and the noun phrase, but also ends with an optional adverb.

In [None]:
chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"

VP is the user-defined name of the chunk you are searching for. In this case VP stands for verb phrase
<VB.*> matches any verb using the . as a wildcard and the * quantifier to match 0 or more occurrences of any character. This ensures matching verbs of any tense (ex. VB for present tense, VBD for past tense, or VBN for past participle)
<DT>?<JJ>*<NN> matches any noun phrase
<RB.?> matches any adverb using the . as a wildcard and the optional quantifier to match 0 or 1 occurrence of any character. This ensures matching any form of adverb (regular RB, comparative RBR, or superlative RBS)
? is an optional quantifier, matching either 0 or 1 adverbs

In [None]:
chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
# The chunk grammar above is for the second form of verb phrase (noun phrase first, then the verb).

In [None]:
from nltk import RegexpParser
from pos_tagged_oz import pos_tagged_oz
from vp_chunk_counter import vp_chunk_counter

# define verb phrase chunk grammar here
chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# create a list to hold verb-phrase chunked sentences
vp_chunked_oz = list()

# create for loop through each pos-tagged sentence in pos_tagged_oz here
for sentence in pos_tagged_oz:
  result = chunk_parser.parse(sentence)
  # chunk each sentence and append to vp_chunked_oz here
  vp_chunked_oz.append(result)
  
# store and print the most common vp-chunks here
most_common_vp_chunks = vp_chunk_counter(vp_chunked_oz)
print(most_common_vp_chunks)

Chunk Filtering
Chunk filtering lets you define what parts of speech you do not want in a chunk and remove them.

A popular method for performing chunk filtering is to chunk an entire sentence together and then indicate which parts of speech are to be filtered out. If the filtered parts of speech are in the middle of a chunk, it will split the chunk into two separate chunks! 

In [None]:
chunk_grammar = """NP: {<.*>+}
                       }<VB.?|IN>+{"""

NP is the user-defined name of the chunk you are searching for. In this case NP stands for noun phrase
The brackets {} indicate what parts of speech you are chunking. <.*>+ matches every part of speech in the sentence
The inverted brackets }{ indicate which parts of speech you want to filter from the chunk. <VB.?|IN>+ will filter out any verbs or prepositions

In [None]:
from nltk import RegexpParser, Tree
from pos_tagged_oz import pos_tagged_oz

# define chunk grammar to chunk an entire sentence together
grammar = "Chunk: {<.*>+}"

# create RegexpParser object
parser = RegexpParser(grammar)

# chunk the pos-tagged sentence at index 230 in pos_tagged_oz
chunked_dancers = parser.parse(pos_tagged_oz[230])
print(chunked_dancers)

# define noun phrase chunk grammar using chunk filtering here
chunk_grammar = """NP: {<.*>+}
                       }<VB.?|IN>+{"""


# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk and filter the pos-tagged sentence at index 230 in pos_tagged_oz here
filtered_dancers = chunk_parser.parse(pos_tagged_oz[230])
print(filtered_dancers)

# pretty_print the chunked and filtered sentence here
Tree.fromstring(str(filtered_dancers)).pretty_print()

In [None]:
Chunk
    then/RB
    she/PRP
    sat/VBD
    upon/IN
    a/DT
    settee/NN
    and/CC
    watched/VBD
    the/DT
    people/NNS
    dance/NN

In [None]:
# instead of finding NP-chunks or VP-chunks, define your own chunk grammar using regular expressions in between the curly braces {}. 
# Feel free to add any chunk filtering in between the inverted braces }{ if you so desire!

from nltk import RegexpParser
from pos_tagged_oz import pos_tagged_oz
from chunk_counter import chunk_counter

# define your own chunk grammar here
chunk_grammar = '''Chunk: {<VB.*><DT>?<JJ>*<NN><RB.?>?}
													}<DT.?|IN|RB|CC>+{'''

# create RegexpParser object
chunk_parser = RegexpParser(chunk_grammar)

# create a list to hold chunked sentences
chunked_oz = list()

# create a for loop through each pos-tagged sentence in pos_tagged_oz
for pos_tagged_sentence in pos_tagged_oz:
  # chunk each sentence and append to chunked_oz
  chunked_oz.append(chunk_parser.parse(pos_tagged_sentence))

# store and print the most common chunks
most_common_chunks = chunk_counter(chunked_oz)
print(most_common_chunks)

Bag of words language model

from spam_data import training_spam_docs, training_doc_tokens, training_labels
from sklearn.naive_bayes import MultinomialNB
from preprocessing import preprocess_text

# Add your email text to test_text between the triple quotes:
test_text = """
Ja és tradició començar l’any amb el bon propòsit de la música. Et convidem a compondre el teu 2023 amb una selecció de grans compositors que protagonitzen la programació dels propers mesos. Energies renovades per endinsar-te en els genis, que no n’entenen, del pas del temps, i també en els compositors convidats, que obren nous camins.
"""
test_tokens = preprocess_text(test_text)

def create_features_dictionary(document_tokens):
  """Assign each distinct token a consecutive integer index.

  Indices start at 0 and follow the order in which tokens are first seen
  in `document_tokens`. Returns the token -> index dictionary.
  """
  token_to_index = {}
  for tok in document_tokens:
    # setdefault stores only when tok is new; the dictionary's current size
    # is then exactly the next unused index.
    token_to_index.setdefault(tok, len(token_to_index))
  return token_to_index

def tokens_to_bow_vector(document_tokens, features_dictionary):
  """Count the tokens of one document into a bag-of-words vector.

  The returned list has one slot per entry of `features_dictionary`; slot i
  holds how many times the token mapped to index i occurs in
  `document_tokens`. Tokens missing from the dictionary are ignored.
  """
  vector = [0 for _ in range(len(features_dictionary))]
  for tok in document_tokens:
    if tok not in features_dictionary:
      continue  # out-of-vocabulary tokens contribute nothing
    vector[features_dictionary[tok]] += 1
  return vector

bow_sms_dictionary = create_features_dictionary(training_doc_tokens)
training_vectors = [tokens_to_bow_vector(training_doc, bow_sms_dictionary) for training_doc in training_spam_docs]
test_vectors = [tokens_to_bow_vector(test_tokens, bow_sms_dictionary)]

spam_classifier = MultinomialNB()
spam_classifier.fit(training_vectors, training_labels)

predictions = spam_classifier.predict(test_vectors)

print("Looks like a normal email!" if predictions[0] == 0 else "You've got spam!")

Discover Insights into Classic Texts

In [None]:
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

# import text of choice here
# The with-block closes the file handle once its contents have been read.
with open("dorian_gray.txt", encoding = 'utf-8') as text_file:
  text = text_file.read().lower()

# sentence and word tokenize text here
word_tokenized_text = word_sentence_tokenize(text)

# store and print any word tokenized sentence here
single_word_tokenized_sentence = word_tokenized_text[1]
# print(single_word_tokenized_sentence)

# create a list to hold part-of-speech tagged sentences here
pos_tagged_text = []

# create a for loop through each word tokenized sentence here
for sentence in word_tokenized_text:
  # part-of-speech tag each sentence and append to list of pos-tagged sentences here
  pos_tagged_text.append(pos_tag(sentence))

# store and print any part-of-speech tagged sentence here
single_pos_sentence = pos_tagged_text[1]
# print(single_pos_sentence)

# define noun phrase chunk grammar here
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)

# define verb phrase chunk grammar here
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"

# create verb phrase RegexpParser object here
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# create a list to hold noun phrase chunked sentences and a list to hold verb phrase chunked sentences here
np_chunked_text = []
vp_chunked_text = []

# create a for loop through each pos-tagged sentence here
for sentence in pos_tagged_text:
  # chunk each sentence and append to lists here
  np_chunked_text.append(np_chunk_parser.parse(sentence))
  vp_chunked_text.append(vp_chunk_parser.parse(sentence))
  

# store and print the most common NP-chunks here
most_common_np_chunk = np_chunk_counter(np_chunked_text)
print(most_common_np_chunk)
print("***")
# store and print the most common VP-chunks here
most_common_vp_chunk = vp_chunk_counter(vp_chunked_text)
print(most_common_vp_chunk)

Bag of Words

Preprocessing function

In [None]:
# Preprocessing function
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  """Pick the most frequent WordNet part of speech for `word`.

  Counts how many of the word's synsets are nouns ("n"), verbs ("v"),
  adjectives ("a") and adverbs ("r") and returns the tag with the highest
  count. Tags are tallied in that order, so ties go to the earliest one.
  """
  synset_tags = [synset.pos() for synset in wordnet.synsets(word)]
  pos_counts = Counter()
  for tag in ("n", "v", "a", "r"):
    pos_counts[tag] = synset_tags.count(tag)
  return pos_counts.most_common(1)[0][0]

def preprocess_text(text):
  """Clean, tokenize and lemmatize `text`, returning the list of lemmas.

  Runs of non-word characters are replaced by a single space and the text
  is lower-cased before tokenizing; each token is then lemmatized with the
  part of speech reported by get_part_of_speech().
  """
  no_punctuation = re.sub(r'\W+', ' ', text)
  words = word_tokenize(no_punctuation.lower())
  lemmas = []
  for word in words:
    lemmas.append(normalizer.lemmatize(word, get_part_of_speech(word)))
  return lemmas

In [None]:
from preprocessing import preprocess_text
# Define text_to_bow() below:
def text_to_bow(some_text):
  """Return a dictionary mapping each preprocessed token of `some_text`
  to the number of times it occurs."""
  counts = {}
  for word in preprocess_text(some_text):
    # .get() supplies 0 the first time a word is seen.
    counts[word] = counts.get(word, 0) + 1
  return counts


print(text_to_bow("I love fantastic flying fish. These flying fish are just ok, so maybe I will find another few fantastic fish..."))

Vectors and Building a Features Dictionary

In [None]:
# Building a Features Dictionary
from preprocessing import preprocess_text
# Define create_features_dictionary() below:


training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]

def create_features_dictionary(documents):
  """Build a token -> index dictionary from a list of document strings.

  The documents are joined with spaces and preprocessed as one text; each
  distinct token gets the next free index in order of first appearance.
  Returns a tuple: (features dictionary, list of all preprocessed tokens).
  """
  all_tokens = preprocess_text(" ".join(documents))
  token_to_index = {}
  for tok in all_tokens:
    # The dictionary's size is the next unused index whenever tok is new.
    token_to_index.setdefault(tok, len(token_to_index))
  return token_to_index, all_tokens

print(create_features_dictionary(training_documents)[0])

In [None]:
# output is the dictionary with all the words and their respective indexes
# {'five': 0, 'fantastic': 1, 'fish': 2, 'fly': 3, 'off': 4, 'to': 5, 'find': 6, 'faraway': 7, 'function': 8, 'maybe': 9, 'another': 10, 'my': 11, 'with': 12, 'a': 13, 'please': 14}

In [None]:
# Building a BoW Vector
from preprocessing import preprocess_text
# Define text_to_bow_vector() below:
def text_to_bow_vector(some_text, features_dictionary):
  """Convert `some_text` into a bag-of-words count vector.

  The text is run through preprocess_text(); each resulting token that
  appears in `features_dictionary` increments the slot at its index.

  Returns a tuple: (count vector with one slot per dictionary entry,
  list of preprocessed tokens).
  """
  bow_vector = [0] * len(features_dictionary)
  tokens = preprocess_text(some_text)
  for token in tokens:
    # Skip out-of-vocabulary tokens instead of raising KeyError, the same
    # way tokens_to_bow_vector() handles them: unseen text will normally
    # contain words that were not in the training vocabulary.
    if token in features_dictionary:
      bow_vector[features_dictionary[token]] += 1
  return bow_vector, tokens

features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}

text = "Another five fish find another faraway fish."

print(text_to_bow_vector(text, features_dictionary)[0])

In [None]:
from spam_data import training_spam_docs, training_doc_tokens, training_labels, test_labels, test_spam_docs, training_docs, test_docs
from sklearn.naive_bayes import MultinomialNB

def create_features_dictionary(document_tokens):
  """Return a dictionary that numbers the distinct tokens of
  `document_tokens` 0, 1, 2, ... in order of first appearance."""
  # dict.fromkeys() drops repeats while keeping first-seen order, so
  # enumerating its keys yields each token's index directly.
  distinct_tokens = dict.fromkeys(document_tokens)
  return {token: position for position, token in enumerate(distinct_tokens)}

def tokens_to_bow_vector(document_tokens, features_dictionary):
  """Return the bag-of-words vector for one tokenized document.

  Slot i of the result counts the occurrences of the token that
  `features_dictionary` maps to i; tokens absent from the dictionary are
  skipped.
  """
  bow_vector = [0] * len(features_dictionary)
  # Lazily translate known tokens into their vector positions.
  positions = (
    features_dictionary[token]
    for token in document_tokens
    if token in features_dictionary
  )
  for position in positions:
    bow_vector[position] += 1
  return bow_vector

# Define bow_sms_dictionary:
bow_sms_dictionary = create_features_dictionary(training_doc_tokens)

# Define training_vectors:
training_vectors = [tokens_to_bow_vector(training_doc, bow_sms_dictionary) for training_doc in training_spam_docs]
# Define test_vectors:
test_vectors = [tokens_to_bow_vector(test_doc, bow_sms_dictionary) for test_doc in test_spam_docs]

spam_classifier = MultinomialNB()

def spam_or_not(label):
  """Return "spam" for a truthy label and "not spam" otherwise."""
  if label:
    return "spam"
  return "not spam"

# Uncomment the code below when you're done:
spam_classifier.fit(training_vectors, training_labels)

predictions = spam_classifier.score(test_vectors, test_labels)

'''print("The predictions for the test data were {0}% accurate.\n\nFor example, '{1}' 
was classified as {2}.\n\nMeanwhile, '{3}' was classified as {4}.".format(predictions * 100,
test_docs[0], spam_or_not(test_labels[0]), test_docs[10], spam_or_not(test_labels[10])))
'''

For text_to_bow(), you can approximate the functionality with the collections module’s Counter() function:

In [None]:
from collections import Counter
 
tokens = ['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish']
print(Counter(tokens))
 
# Counter({'fish': 2, 'another': 2, 'find': 1, 'five': 1, 'faraway': 1})

For vectorization, you can use CountVectorizer from the machine learning library scikit-learn. You can use fit() to train the features dictionary and then transform() to transform text into a vector:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
 
training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]
test_text = ["Another five fish find another faraway fish."]
bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(training_documents)
bow_vector = bow_vectorizer.transform(test_text)
print(bow_vector.toarray())
# [[2 0 1 1 2 1 0 0 0 0 0 0 0 0 0]]

In [None]:
from spam_data import training_spam_docs, training_doc_tokens, training_labels, test_labels, test_spam_docs, training_docs, test_docs
from sklearn.naive_bayes import MultinomialNB
# Import CountVectorizer from sklearn:
from sklearn.feature_extraction.text import CountVectorizer

# Define bow_vectorizer:
bow_vectorizer = CountVectorizer()

# Define training_vectors:
training_vectors = bow_vectorizer.fit_transform(training_docs)
# Define test_vectors:
test_vectors = bow_vectorizer.transform(test_docs)

spam_classifier = MultinomialNB()

def spam_or_not(label):
  """Translate a label into text: truthy -> "spam", falsy -> "not spam"."""
  # bool(label) is False (0) or True (1), which indexes the pair below.
  return ("not spam", "spam")[bool(label)]

# Uncomment the code below when you're done:
spam_classifier.fit(training_vectors, training_labels)

predictions = spam_classifier.score(test_vectors, test_labels)

'''print("The predictions for the test data were {0}% accurate.\n\nFor example, 
'{1}' was classified as {2}.\n\nMeanwhile, '{3}' was classified as {4}.".format(
  predictions * 100, test_docs[7], spam_or_not(test_labels[7]), test_docs[15], spam_or_not(test_labels[15])))
'''

Because bag-of-words relies on single words, rather than sequences of words, there are more examples of each unit of language in the training corpus. More examples means the model has less data sparsity (i.e., it has more training knowledge to draw from) than other statistical models.

Overfitting (adapting a model too strongly to training data, akin to our highly tailored shirt) is a common problem for statistical language models. While BoW still suffers from overfitting in terms of vocabulary, it overfits less than other statistical models, allowing for more flexibility in grammar and word choice.

The combination of low data sparsity and less overfitting makes the bag-of-words model more reliable with smaller training data sets than other statistical models.

In [None]:
from preprocessing import preprocess_text
from nltk.util import ngrams
from collections import Counter

text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
tokens = preprocess_text(text)

# Bigram approach:
# ngrams(tokens, 2) is called to produce the two-token sequences that Counter tallies.
bigrams_prepped = ngrams(tokens, 2)
bigrams = Counter(bigrams_prepped)
print("Three most frequent word sequences and the number of occurrences according to Bigrams:")
print(bigrams.most_common(3))

# Bag-of-Words approach:
# Define bag_of_words here:
# Counter over the individual tokens: word order is ignored, only counts are kept.
bag_of_words = Counter(tokens)
most_common_three = bag_of_words.most_common(3)
print("\nThree most frequent words and number of occurrences according to Bag-of-Words:")
print(most_common_three)

BoW is NOT a great primary model for text prediction. If the sort of “sentence” it generates isn’t your bag, that’s because bag-of-words has high perplexity, meaning that it’s not a very accurate model for language prediction. The predicted next word is always just one of the most frequently used words, regardless of context.
 
Like all statistical models, BoW suffers from overfitting when it comes to vocabulary.

There are several ways that NLP developers have tackled this issue. A common approach is through language smoothing in which some probability is siphoned from the known words and given to unknown words.

In [None]:
import nltk, re, random
from nltk.tokenize import word_tokenize
from collections import defaultdict, deque, Counter
from document import oscar_wilde_thoughts

# Change sequence_length:
sequence_length = 1

class MarkovChain:
  """Toy text generator built from word-to-next-word transitions.

  Behaviour depends on the module-level ``sequence_length``: with a value
  of 1, ``generate_text`` draws every word independently from the most
  common words (a unigram / bag-of-words model); with a larger value it
  walks the word-to-next-word transitions collected by ``add_document``.
  """

  def __init__(self):
    # Maps each word to the list of words observed directly after it.
    # Duplicates are kept, so frequent successors are drawn more often.
    self.lookup_dict = defaultdict(list)
    # (word, count) pairs for the 50 most frequent words seen so far.
    self.most_common = []
    # Running word counts over every document added to this chain.
    self._word_counts = Counter()
    self._seeded = False
    self.__seed_me()

  def __seed_me(self, rand_seed=None):
    """Seed the ``random`` module once per instance.

    Calls made after the first successful seeding are no-ops, so the
    ``rand_seed`` later passed by ``generate_text`` never takes effect:
    the constructor has already seeded without an explicit value.
    """
    if self._seeded is not True:
      try:
        if rand_seed is not None:
          random.seed(rand_seed)
        else:
          random.seed()
        self._seeded = True
      except NotImplementedError:
        self._seeded = False

  def add_document(self, str):
    """Tokenize a document and fold its words into the chain.

    Word counts accumulate across calls, so ``most_common`` reflects every
    document added so far (not only the latest one), in line with
    ``lookup_dict``, which also accumulates.
    """
    preprocessed_list = self._preprocess(str)
    self._word_counts.update(preprocessed_list)
    self.most_common = self._word_counts.most_common(50)
    for current_word, following_word in self.__generate_tuple_keys(preprocessed_list):
      self.lookup_dict[current_word].append(following_word)

  def _preprocess(self, str):
    """Replace runs of non-word characters with a space, lowercase, and tokenize."""
    cleaned = re.sub(r'\W+', ' ', str).lower()
    return word_tokenize(cleaned)

  def __generate_tuple_keys(self, data):
    """Yield every pair of adjacent tokens in ``data`` as ``[current, next]``."""
    if len(data) < sequence_length:
      return

    for current_word, following_word in zip(data, data[1:]):
      yield [current_word, following_word]

  def generate_text(self, max_length=50):
    """Return generated words joined by single spaces.

    Returns an empty string when no document with at least one word pair
    has been added yet.
    """
    context = deque()
    output = []
    if not self.lookup_dict:
      return " ".join(output)

    # No-op in practice: the instance was already seeded in __init__.
    self.__seed_me(rand_seed=len(self.lookup_dict))
    # Start from the first word that was ever added to the chain.
    context.append(list(self.lookup_dict)[0])
    if sequence_length > 1:
      while len(output) < (max_length - 1):
        next_choices = self.lookup_dict[context[-1]]
        if not next_choices:
          # Dead end: the current word was never followed by anything.
          break
        context.append(random.choice(next_choices))
        output.append(context.popleft())
      output.extend(context)
    else:
      # Unigram mode: the candidate list never changes, so build it once.
      next_choices = [word for word, _count in self.most_common]
      while len(output) < (max_length - 1):
        output.append(random.choice(next_choices))
    return " ".join(output)

my_markov = MarkovChain()
my_markov.add_document(oscar_wilde_thoughts)
random_oscar_wilde = my_markov.generate_text()
print(random_oscar_wilde)

Review of Bag-of-Words
You made it! And you’ve learned plenty about the bag-of-words language model along the way:

Bag-of-words (BoW) — also referred to as the unigram model — is a statistical language model based on word count.
There are loads of real-world applications for BoW.
BoW can be implemented as a Python dictionary with each key set to a word and each value set to the number of times that word appears in a text.
For BoW, training data is the text that is used to build a BoW model.
BoW test data is the new text that is converted to a BoW vector using a trained features dictionary.
A feature vector is a numeric depiction of an item’s salient features.
Feature extraction (or vectorization) is the process of turning text into a BoW vector.
A features dictionary is a mapping of each unique word in the training data to a unique index. This is used to build out BoW vectors.
BoW has less data sparsity than other statistical models. It also suffers less from overfitting.
BoW has higher perplexity than other models, making it less ideal for language prediction.
One solution to overfitting is language smoothing, in which a bit of probability is taken from known words and allotted to unknown words.

In [None]:
# APPLY NATURAL LANGUAGE PROCESSING WITH PYTHON
# Mystery Friend

from goldman_emma_raw import goldman_docs
from henson_matthew_raw import henson_docs
from wu_tingfang_raw import wu_docs
# import sklearn modules here:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Setting up the combined list of friends' writing samples
friends_docs = goldman_docs + henson_docs + wu_docs
# Setting up labels for your three friends
friends_labels = [1] *len(goldman_docs) + [2] *len(henson_docs) + [3] *len(wu_docs)
# Print out a document from each friend:


mystery_postcard = """
I do not pretend to write a history. Removed by fifty or a hundred years from the events he is describing, the historian may seem to be objective. But real history is not a compilation of mere data. It is valueless without the human[Pg vii] element which the historian necessarily gets from the writings of the contemporaries of the events in question. It is the personal reactions of the participants and observers which lend vitality to all history and make it vivid and alive. Thus, numerous histories have been written of the French Revolution; yet there are only a very few that stand out true and convincing, illuminative in the degree in which the historian has felt his subject through the medium of human documents left by the contemporaries of the period.
"""

# Create bow_vectorizer:
bow_vectorizer = CountVectorizer()

# Define friends_vectors:
friends_vectors = bow_vectorizer.fit_transform(friends_docs)

# Define mystery_vector:
# mystery_postcard is a single string, while the vectorizer expects a list of documents
mystery_vector = bow_vectorizer.transform([mystery_postcard])

# Define friends_classifier:
friends_classifier = MultinomialNB()

# Train the classifier:
friends_classifier.fit(friends_vectors, friends_labels)

# Predict which friend wrote the postcard:
predictions = friends_classifier.predict(mystery_vector)

# Labels follow the order used to build friends_labels:
# 1 = goldman_docs, 2 = henson_docs, 3 = wu_docs.
friend_names = {1: "Emma Goldman", 2: "Matthew Henson", 3: "Wu Tingfang"}
mystery_friend = friend_names.get(predictions[0], "someone else")

print("The postcard was from {}!".format(mystery_friend))
print(predictions)

In [None]:
# What does mystery_bow_function() do?
# The dictionary is comprised of words with their corresponding counts.

def mystery_bow_function(training_data):
  """Count how many times each preprocessed token occurs in ``training_data``."""
  counts = {}
  for word in preprocess_text(training_data):
    counts[word] = counts.get(word, 0) + 1
  return counts
 
print(mystery_bow_function("Squealing suitcase squids are not like regular squids."))
 
# {'regular': 1, 'squeal': 1, 'squid': 2, 'be': 1, 'like': 1, 'suitcase': 1, 'not': 1}

TERM FREQUENCY–INVERSE DOCUMENT FREQUENCY

Tf-idf is another powerful tool in your NLP toolkit that has a variety of use cases, including:

ranking results in a search engine
text summarization
building smarter chatbots

Term frequency-inverse document frequency is a numerical statistic used to indicate how important a word is to each document in a collection of documents, or a corpus.

Tf-idf relies on two different metrics in order to come up with an overall score:

term frequency, or how often a word appears in a document. This is the same as bag-of-words’ word count.
inverse document frequency, which is a measure of how often a word appears in the overall corpus. By penalizing the score of words that appear throughout a corpus, tf-idf can give better insight into how important a word is to a particular document of a corpus.
We will dig into each component of tf-idf in the next two exercises.

In [None]:
import codecademylib3_seaborn
from preprocessing import preprocess_text
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# sample documents
document_1 = "This is a sample sentence!"
document_2 = "This is my second sentence."
document_3 = "Is this my third sentence?"

# corpus of documents
corpus = [document_1, document_2, document_3]

# preprocess documents
processed_corpus = [preprocess_text(doc) for doc in corpus]

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tf_idf_scores = vectorizer.fit_transform(processed_corpus)

# get vocabulary of terms
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()
# each column is labelled with the preprocessed document it was computed from
corpus_index = list(processed_corpus)

# create pandas DataFrame with tf-idf scores (terms as rows, documents as columns)
df_tf_idf = pd.DataFrame(tf_idf_scores.T.todense(), index=feature_names, columns=corpus_index)
print(df_tf_idf)

In [None]:
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing import preprocess_text

poem = '''
Success is counted sweetest
By those who ne'er succeed.
To comprehend a nectar
Requires sorest need.

Not one of all the purple host
Who took the flag to-day
Can tell the definition,
So clear, of victory,

As he, defeated, dying,
On whose forbidden ear
The distant strains of triumph
Break, agonized and clear!'''

# define clear_count:
clear_count = 2

# preprocess text
processed_poem = preprocess_text(poem)

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform([processed_poem])

# get vocabulary of terms
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()

# create pandas DataFrame with term frequencies
try:
  df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=['Term Frequency'])
  print(df_term_frequencies)
except ValueError as error:
  # pandas raises ValueError when the labels do not match the matrix shape;
  # keep the cell runnable but say why the table is missing
  print(f"Could not build the term-frequency table: {error}")

Inverse document frequency can be calculated on a group of documents using scikit-learn’s TfidfTransformer:

In [None]:
transformer = TfidfTransformer(norm=None)
transformer.fit(term_frequencies)
inverse_doc_frequency = transformer.idf_

a TfidfTransformer object is initialized. Don’t worry about the norm=None keyword argument for now, we will dig into this in the next exercise
the TfidfTransformer is fit (trained) on a term-document matrix of term frequencies
the .idf_ attribute of the TfidfTransformer stores the inverse document frequencies of the terms as a NumPy array

In [None]:
# term_frequency
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing import preprocess_text
from poems import poems

# preprocess text
processed_poems = [preprocess_text(poem) for poem in poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(processed_poems)

# get vocabulary of terms
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()

# get corpus index: one 1-based label per poem
corpus_index = [f"Poem {number}" for number in range(1, len(poems) + 1)]

# create pandas DataFrame with term frequencies (terms as rows, poems as columns)
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

In [None]:
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from term_frequency import term_frequencies, feature_names, df_term_frequencies

# display term-document matrix of term frequencies
print(df_term_frequencies)

# initialize and fit TfidfTransformer
transformer = TfidfTransformer(norm=None)
transformer.fit(term_frequencies)
# one inverse-document-frequency value per term
idf_values = transformer.idf_

# create pandas DataFrame with inverse document frequencies
try:
  df_idf = pd.DataFrame(idf_values, index = feature_names, columns=['Inverse Document Frequency'])
  print(df_idf)
except ValueError as error:
  # pandas raises ValueError when the labels do not match the data shape;
  # keep the cell runnable but say why the table is missing
  print(f"Could not build the inverse-document-frequency table: {error}")

We can easily calculate the tf-idf values for each term-document pair in our corpus using scikit-learn’s TfidfVectorizer:

In [None]:
vectorizer = TfidfVectorizer(norm=None)
tfidf_vectorizer = vectorizer.fit_transform(corpus)

a TfidfVectorizer object is initialized. The norm=None keyword argument prevents scikit-learn from modifying the multiplication of term frequency and inverse document frequency
the TfidfVectorizer object is fit and transformed on the corpus of data, returning the tf-idf scores for each term-document pair

Converting Bag-of-Words to Tf-idf
In addition to directly calculating the tf-idf scores for a set of terms across a corpus, you can also convert a bag-of-words model you have already created into tf-idf scores. Scikit-learn’s TfidfTransformer is up to the task of converting your bag-of-words model to tf-idf. You begin by initializing a TfidfTransformer object.

In [None]:
# norm accepts 'l1', 'l2' or None; None (not False) disables normalization
tf_idf_transformer = TfidfTransformer(norm=None)

Given a bag-of-words matrix count_matrix, you can now multiply the term frequencies by their inverse document frequency to get the tf-idf scores as follows:

In [None]:
# fit the transformer initialized above (tf_idf_transformer) and transform in one step
tf_idf_scores = tf_idf_transformer.fit_transform(count_matrix)

This is very similar to how we calculated inverse document frequency, except this time we are fitting and transforming the TfidfTransformer to the term frequencies/bag-of-words vectors rather than just fitting the TfidfTransformer to them.

In [None]:
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from term_frequency import bow_matrix, feature_names, df_bag_of_words, corpus_index

# display term-document matrix of term frequencies (bag-of-words)
print(df_bag_of_words)

# initialize TfidfTransformer, then fit it to and transform the bag-of-words matrix
# (norm accepts 'l1', 'l2' or None; None, not False, disables normalization)
transformer = TfidfTransformer(norm=None)
tfidf_scores = transformer.fit_transform(bow_matrix)

# create pandas DataFrame with tf-idf scores
try:
  df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index = feature_names, columns=corpus_index)
  print(df_tf_idf)
except ValueError as error:
  # pandas raises ValueError when the labels do not match the matrix shape;
  # keep the cell runnable but say why the table is missing
  print(f"Could not build the tf-idf table: {error}")

In [None]:
the_raven_stanzas = the_raven.split('.')

In [None]:
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from raven import the_raven_stanzas
from preprocessing import preprocess_text

# view first stanza
print(the_raven_stanzas[0])

# preprocess documents
processed_stanzas = [preprocess_text(stanza) for stanza in the_raven_stanzas]

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_stanzas)

# get vocabulary of terms
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()

# get stanza index: one 1-based label per stanza
stanza_index = [f"Stanza {number}" for number in range(1, len(the_raven_stanzas) + 1)]

# create pandas DataFrame with tf-idf scores
try:
  df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=stanza_index)
  print(df_tf_idf)
except ValueError as error:
  # pandas raises ValueError when the labels do not match the matrix shape;
  # keep the cell runnable but say why the table is missing
  print(f"Could not build the tf-idf table: {error}")

Working with Text Data | scikit-learn | From Occurrences to Frequencies
Working with Text Data | scikit-learn | From Occurrences to Frequencies
In this documentation, you will learn how to use scikit-learn to conduct tf-idf. This is helpful if you are trying to determine topics or themes within text data.
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#from-occurrences-to-frequencies

In [None]:
import codecademylib3_seaborn
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text
# import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer

# view article
# print(articles[1])

# preprocess articles
processed_articles = [preprocess_text(article) for article in articles]

# initialize and fit CountVectorizer
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(processed_articles)

# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

# check if tf-idf scores are equal:
# confirm that TfidfTransformer and TfidfVectorizer give the same tf-idf scores
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
  print(pd.DataFrame({'Are the tf-idf scores the same?':['YES']}))
else:
  print(pd.DataFrame({'Are the tf-idf scores the same?':['No, something is wrong :(']}))

# get vocabulary of terms
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
feature_names = vectorizer.get_feature_names_out()

# get article index: one 1-based label per article
article_index = [f"Article {number}" for number in range(1, len(articles) + 1)]

# The tables are built without a catch-all except: a silently swallowed
# failure would leave df_tf_idf undefined (or stale from an earlier cell)
# for the loop at the end of this cell.

# create pandas DataFrame with word counts
df_word_counts = pd.DataFrame(counts.T.todense(), index=feature_names, columns=article_index)
print(df_word_counts)

# create pandas DataFrame(s) with tf-idf scores
df_tf_idf_transformed = pd.DataFrame(tfidf_scores_transformed.T.todense(), index=feature_names, columns=article_index)
print(df_tf_idf_transformed)

df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=article_index)
print(df_tf_idf)

# get highest scoring tf-idf term for each article
# .idxmax() returns the index (here: the term) of the highest value in each column.
# Iterating over article_index covers every article instead of assuming exactly ten.
for article_label in article_index:
  print(df_tf_idf[[article_label]].idxmax())


WORD EMBEDDINGS

In [None]:
import spacy
from scipy.spatial.distance import cosine

# load model
# ('en' was a spaCy 2 shortcut link; spaCy 3 removed shortcuts, so the package name is used)
nlp = spacy.load('en_core_web_sm')

# define vectors
summer_vec = nlp("summer").vector
winter_vec = nlp("winter").vector

# compare similarity
print(f"The cosine distance between the word embeddings for 'summer' and 'winter' is: {cosine(summer_vec, winter_vec)}\n")

# define vectors
mustard_vec = nlp("mustard").vector
amazing_vec = nlp("amazing").vector

# compare similarity
print(f"The cosine distance between the word embeddings for 'mustard' and 'amazing' is: {cosine(mustard_vec, amazing_vec)}\n")

# display word embeddings
print(f"'summer' in vector form: {summer_vec}")
print(f"'winter' in vector form: {winter_vec}")
# print(f"'mustard' in vector form: {mustard_vec}")
# print(f"'amazing' in vector form: {amazing_vec}")

We can easily represent vectors in Python using NumPy arrays. To create a vector containing the odd numbers from 1 to 9, we can use NumPy’s .array() method:

odd_vector = np.array([1, 3, 5, 7, 9])

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import codecademylib3_seaborn

# define score vectors
scores_xavier = np.array([88, 92])
scores_niko = np.array([94, 87])
scores_alena = np.array([90, 48])

# plot each score vector as an arrow from the origin
# (one loop replaces three copies of a try/except that hid every error)
for scores, color in (
  (scores_xavier, 'blue'),
  (scores_niko, 'orange'),
  (scores_alena, 'purple'),
):
  plt.arrow(0, 0, scores[0], scores[1], width=1, color=color)
plt.axis([0, 100, 0, 100])
plt.show()

Word embeddings are vector representations of a word.

They allow us to take all the information that is stored in a word, like its meaning and its part of speech, and convert it into a numeric form that is more understandable to a computer.

We can load a basic English word embedding model using spaCy as follows:

nlp = spacy.load('en')

Note: the convention is to load spaCy models into a variable named nlp.

To get the vector representation of a word, we call the model with the desired word as an argument and can use the .vector attribute.

nlp('love').vector

In [None]:
import spacy
# load word embedding model
# ('en' was a spaCy 2 shortcut link; spaCy 3 removed shortcuts, so the package name is used)
nlp = spacy.load('en_core_web_sm')

# define word embedding vectors
happy_vec = nlp('happy').vector
sad_vec = nlp('sad').vector
angry_vec = nlp('angry').vector

# find vector length here
vector_length = len(happy_vec)
print(vector_length)

The key at the heart of word embeddings is distance. We can easily calculate the Manhattan, Euclidean, and cosine distances between vectors using helper functions from SciPy. When working with vectors that have a large number of dimensions, such as word embeddings, the distances calculated by Manhattan and Euclidean distance can become rather large. Thus, calculations using cosine distance are preferred!

In Manhattan distance, also known as city block distance, distance is defined as the sum of the differences across each individual dimension of the vectors.

Another common distance metric is called the Euclidean distance, also known as straight line distance. With this distance metric, we take the square root of the sum of the squares of the differences in each dimension.

The final distance we will consider is the cosine distance. Cosine distance is concerned with the angle between two vectors, rather than with the distance between the points, or ends, of the vectors. Two vectors that point in the same direction have no angle between them, and have a cosine distance of 0. Two vectors that are perpendicular have a cosine distance of 1, and two vectors that point in opposite directions have a cosine distance of 2.

In [None]:
from scipy.spatial.distance import cityblock, euclidean, cosine
 
# two example vectors; vector_b is vector_a scaled by 2, so they point the same way
vector_a, vector_b = np.array([1, 2, 3]), np.array([2, 4, 6])

# Manhattan (6), Euclidean (about 3.74) and cosine (0.0) distance, in that order
manhattan_d, euclidean_d, cosine_d = (
  metric(vector_a, vector_b) for metric in (cityblock, euclidean, cosine)
)

In [None]:
import numpy as np
from scipy.spatial.distance import cityblock, euclidean, cosine
import spacy

# load word embedding model
# ('en' was a spaCy 2 shortcut link; spaCy 3 removed shortcuts, so the package name is used)
nlp = spacy.load('en_core_web_sm')

# define word embedding vectors
happy_vec = nlp('happy').vector
sad_vec = nlp('sad').vector
angry_vec = nlp('angry').vector

# calculate Manhattan distance
man_happy_sad = cityblock(happy_vec,sad_vec)
man_sad_angry = cityblock(sad_vec, angry_vec)
print(man_happy_sad)
print(man_sad_angry)


# calculate Euclidean distance
euc_happy_sad = euclidean(happy_vec,sad_vec)
euc_sad_angry = euclidean(sad_vec, angry_vec)
print(euc_happy_sad)
print(euc_sad_angry)

# calculate cosine distance
cos_happy_sad = cosine(happy_vec,sad_vec)
cos_sad_angry = cosine(sad_vec, angry_vec)
print(cos_happy_sad)
print(cos_sad_angry)

The cosine distance values, however, remain low (cosine distance is always bounded between 0 and 2, and typically falls between 0 and 1 for word embeddings), whereas the Manhattan and Euclidean distances are rather large (and continue to grow as more dimensions are added to a vector).

The idea behind word embeddings is a theory known as the distributional hypothesis. This hypothesis states that words that co-occur in the same contexts tend to have similar meanings. With word embeddings, we map words that exist with the same context to similar places in our vector space (math-speak for the area in which our vectors exist).

In [None]:
import spacy
from scipy.spatial.distance import cosine
from processing import most_common_words, vector_list

# print word and vector representation at index 347
# print(most_common_words[347])
# print(vector_list[347])

# define find_closest_words
def find_closest_words(word_list, vector_list, word_to_check, top_n=10):
    """Return the words whose vectors are closest to that of ``word_to_check``.

    Args:
        word_list: words, aligned index-by-index with ``vector_list``.
        vector_list: one vector per entry of ``word_list``.
        word_to_check: the word to compare every other word against.
        top_n: how many of the closest words to return (default 10).

    Returns:
        Up to ``top_n`` words from ``word_list``, ordered by increasing cosine
        distance to ``word_to_check`` (which normally comes first, at
        distance 0). An empty ``word_list`` gives an empty list.

    Raises:
        ValueError: if ``word_to_check`` is not in a non-empty ``word_list``.
    """
    if not word_list:
        return []

    # Record the first position of every word once, instead of calling
    # word_list.index() (a linear scan) for each element inside the sort key.
    first_position = {}
    for position, word in enumerate(word_list):
        first_position.setdefault(word, position)

    if word_to_check not in first_position:
        raise ValueError(f"{word_to_check!r} is not in word_list")

    target_vector = vector_list[first_position[word_to_check]]
    return sorted(
        word_list,
        key=lambda word: cosine(target_vector, vector_list[first_position[word]]),
    )[:top_n]

# find closest words to food
close_to_food = find_closest_words(most_common_words, vector_list,"food")
print(close_to_food)

# find closest words to summer

close_to_summer = find_closest_words(most_common_words, vector_list,"summer")
print(close_to_summer)

Word2vec

Step in word2vec! Word2vec is a statistical learning algorithm that develops word embeddings from a corpus of text. Word2vec uses one of two different model architectures to come up with the values that define a collection of word embeddings.

One method is to use the continuous bag-of-words (CBOW) representation of a piece of text. The word2vec model goes through each word in the training corpus, in order, and tries to predict what word comes at each position based on applying bag-of-words to the words that surround the word in question. In this approach, the order of the words does not matter!

The other method word2vec can use to create word embeddings is continuous skip-grams. Skip-grams function similarly to n-grams, except instead of looking at groupings of n-consecutive words in a text, we can look at sequences of words that are separated by some specified distance between them.

When using continuous skip-grams, the order of context is taken into consideration! Because of this, the time it takes to train the word embeddings is slower than when using continuous bag-of-words. The results, however, are often much better!

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

sentence = "It was the best of times, it was the worst of times."
print(sentence)

# preprocessing
sentence_lst = [word.lower().strip(".") for word in sentence.split()]

# set context_length
# This indicates that when finding our bag-of-words and skip-gram representations, we look context_length words to the left and context_length words to the right of the word we are focusing on (3 on each side here).
# Increasing this number means that when word2vec trains our model, a larger context will be taken into consideration!
context_length = 3

# function to get cbows
def get_cbows(sentence_lst, context_length):
  """Pair each word with the unordered vocabulary of its surrounding words.

  Only words that have ``context_length`` neighbours on both sides are
  included. Returns a list of ``(word, context_words)`` tuples, where
  ``context_words`` is the list of feature names CountVectorizer extracts
  from the neighbouring words, so the original word order is discarded.
  """
  cbows = []
  last_start = len(sentence_lst) - context_length
  for i, val in enumerate(sentence_lst):
    if not context_length <= i < last_start:
      # not enough neighbours on the left or on the right
      continue
    context = sentence_lst[i-context_length:i] + sentence_lst[i+1:i+context_length+1]
    vectorizer = CountVectorizer()
    vectorizer.fit(context)
    # get_feature_names() was removed in scikit-learn 1.2; list() keeps the
    # plain-list return type the old method had
    context_no_order = list(vectorizer.get_feature_names_out())
    cbows.append((val, context_no_order))
  return cbows

# define cbows here:
cbows = get_cbows(sentence_lst, context_length)


# function to get skip-grams
def get_skip_grams(sentence_lst, context_length):
  """Pair each word with its neighbours, keeping the neighbours in order.

  Only words with ``context_length`` neighbours on both sides are included.
  Returns a list of ``(word, context)`` tuples, where ``context`` holds the
  words before the target followed by the words after it.
  """
  upper = len(sentence_lst) - context_length
  return [
    (word, sentence_lst[pos-context_length:pos] + sentence_lst[pos+1:pos+context_length+1])
    for pos, word in enumerate(sentence_lst)
    if context_length <= pos < upper
  ]

# define skip_grams here:
skip_grams = get_skip_grams(sentence_lst,context_length)

# cbows and skip_grams are both assigned unconditionally earlier in this cell,
# so nothing here needs a catch-all except that would hide real errors.
print('\nContinuous Bag of Words')
for cbow in cbows:
  print(cbow)

print('\nSkip Grams')
for skip_gram in skip_grams:
  print(skip_gram)

While the words themselves do not vary between the output of get_cbows() and get_skip_grams(), the order is different! With continuous skip-grams we do care about word order, so the order is preserved in our output.

Gensim

When we want to train our own word2vec model on a corpus of text, we can use the gensim package! Rather than relying on pre-trained vectors, with gensim we are able to build our own word embeddings on any corpus of text we like.

To easily train a word2vec model on our own corpus of text, we can use gensim’s Word2Vec() function.

In [None]:
model = gensim.models.Word2Vec(corpus, size=100, window=5, min_count=1, workers=2, sg=1)

corpus is a list of lists, where each inner list is a document in the corpus and each element in the inner lists is a word token
size determines how many dimensions our word embeddings will include. Word embeddings often have upwards of 1,000 dimensions! Here we will create vectors of 100-dimensions to keep things simple.

To view the entire vocabulary used to train the word embedding model, we can use the .wv.vocab.items() method.

In [None]:
vocabulary_of_model = list(model.wv.vocab.items())

To easily find which vectors gensim placed close together in its word embedding model, we can use the .most_similar() method.

In [None]:
# similarity queries live on the model's KeyedVectors (model.wv); the
# model-level method was deprecated and is gone in gensim 4
model.wv.most_similar("my_word_here", topn=100)

"my_word_here" is the target word token we want to find most similar words to
topn is a keyword argument that indicates how many similar word vectors we want returned

In [None]:
# similarity queries live on the model's KeyedVectors (model.wv); the
# model-level method was deprecated and is gone in gensim 4
model.wv.doesnt_match(["asia", "mars", "pluto"])

when given a list of terms in the vocabulary as an argument, .doesnt_match() returns which term is furthest from the others.

In [None]:
import gensim
from nltk.corpus import stopwords
from romeo_juliet import romeo_and_juliet

# load stop words
stop_words = stopwords.words('english')
# set copy for the filter below: one hash lookup per word instead of a list scan
stop_word_set = set(stop_words)

# preprocess text: a corpus holding a single document (one list of word tokens)
romeo_and_juliet_processed = [[word for word in romeo_and_juliet.lower().split() if word not in stop_word_set]]

# view inner list of romeo_and_juliet_processed
print(romeo_and_juliet_processed[0][:20])

# train word embeddings model
# NOTE(review): gensim 4 renamed `size` to `vector_size` and replaced
# `wv.vocab` with `wv.key_to_index`; both are left as written for gensim 3.
model = gensim.models.Word2Vec(romeo_and_juliet_processed, size=100, window=5, min_count=1, workers=2, sg=1)

# view vocabulary
vocabulary = list(model.wv.vocab.items())
#print(vocabulary)

# similar to romeo
# (queries go through model.wv; the model-level methods are deprecated)
similar_to_romeo = model.wv.most_similar("romeo", topn=20)
print(similar_to_romeo)


# one is not like the others
not_star_crossed_lover = model.wv.doesnt_match(["romeo", "juliet", "mercutio"])
print(not_star_crossed_lover)

U.S.A. Presidential Vocabulary project

helper

In [None]:
import os
from nltk.tokenize import PunktSentenceTokenizer
from collections import Counter

def read_file(file_name):
  """Return the entire contents of ``file_name``, decoded as UTF-8."""
  # Plain read mode: 'r+' would also request write access, which fails on
  # read-only files even though nothing is ever written.
  with open(file_name, 'r', encoding='utf-8') as file:
    return file.read()

def process_speeches(speeches):
  """Split each speech into sentences and each sentence into lowercase words.

  Returns a list with one entry per speech; each entry is a list of
  sentences, and each sentence is a list of word strings.
  """
  # One tokenizer serves every speech; there is no need to rebuild it per speech.
  sentence_tokenizer = PunktSentenceTokenizer()
  word_tokenized_speeches = []
  for speech in speeches:
    word_tokenized_sentences = []
    for sentence in sentence_tokenizer.tokenize(speech):
      # commas and colons are dropped; hyphens become word breaks
      cleaned = sentence.replace(",", "").replace("-", " ").replace(":", "")
      # Strip any mix of '.', '?' and '!' in a single pass: chained strip()
      # calls are order-dependent and leave e.g. "what?!" as "what?".
      words = [word.lower().strip('.?!') for word in cleaned.split()]
      # tokens that were pure punctuation are now empty strings; drop them
      word_tokenized_sentences.append([word for word in words if word])
    word_tokenized_speeches.append(word_tokenized_sentences)
  return word_tokenized_speeches

def merge_speeches(speeches):
  """Flatten a list of speeches (each a list of sentences) into one list of sentences."""
  return [sentence for speech in speeches for sentence in speech]

def get_president_sentences(president):
  """Collect the tokenized sentences of every speech file matching ``president``.

  A file matches when its name in the current working directory contains
  ``president`` (case-insensitive); matching files are read in sorted order.
  """
  needle = president.lower()
  matching_files = sorted(name for name in os.listdir() if needle in name.lower())
  speeches = [read_file(name) for name in matching_files]
  return merge_speeches(process_speeches(speeches))

def get_presidents_sentences(presidents):
  """Collect the tokenized sentences of several presidents into one list.

  Sentences are concatenated in the order the presidents are given; each
  president's speeches are looked up by ``get_president_sentences``.
  """
  all_sentences = []
  for president in presidents:
    # reuse the single-president helper instead of repeating its body
    all_sentences.extend(get_president_sentences(president))
  return all_sentences

def most_frequent_words(list_of_sentences):
  """Return ``(word, count)`` pairs for every word, most frequent first."""
  tally = Counter()
  for sentence in list_of_sentences:
    tally.update(sentence)
  return tally.most_common()

After calling process_speeches() with speeches as an argument, the data will be formatted such that:

processed_speeches[0] represents the first inaugural address in processed_speeches.
processed_speeches[0][0] represents the first sentence in the first inaugural address in processed_speeches.
processed_speeches[0][0][0] represents the first word in the first sentence in the first inaugural address in processed_speeches.

In [None]:
import os
import gensim
import spacy
from president_helper import read_file, process_speeches, merge_speeches, get_president_sentences, get_presidents_sentences, most_frequent_words

# Train skip-gram word embeddings on inaugural addresses and compare which
# words sit closest to "freedom" and "problems" for different presidents.
# NOTE(review): `size=` is the gensim 3.x spelling of the vector dimension;
# gensim 4.x renamed it to `vector_size` - confirm the installed version.

# get list of all speech files
files = sorted([file for file in os.listdir() if file.endswith('.txt')])
# print(files)

# read each speech file
speeches = [read_file(file) for file in files]

# preprocess each speech
processed_speeches = process_speeches(speeches)
# merge speeches
all_sentences = merge_speeches(processed_speeches)

# view most frequently used words
most_freq_words = most_frequent_words(all_sentences)
# print(most_freq_words)

# create gensim model of all speeches
all_prez_embeddings = gensim.models.Word2Vec(all_sentences, size=96, window=5, min_count=1, workers=2, sg=1)

# view words similar to freedom
# (similarity queries go through the model's KeyedVectors, `.wv`)
similar_to_freedom = all_prez_embeddings.wv.most_similar("freedom", topn=20)
# print(similar_to_freedom)

similar_to_problems = all_prez_embeddings.wv.most_similar("problems", topn=20)
# print(similar_to_problems)

# get President Roosevelt sentences
roosevelt_sentences = get_president_sentences("franklin-d-roosevelt")

# view most frequently used words of Roosevelt
roosevelt_most_freq_words = most_frequent_words(roosevelt_sentences)
# print(roosevelt_most_freq_words)

# create gensim model for Roosevelt
roosevelt_embeddings = gensim.models.Word2Vec(roosevelt_sentences, size=96, window=5, min_count=1, workers=2, sg=1)

# view words similar to freedom for Roosevelt
# (this previously queried "problems", contradicting the variable name)
roosevelt_similar_to_freedom = roosevelt_embeddings.wv.most_similar("freedom", topn=20)
# print(roosevelt_similar_to_freedom)

# get sentences of multiple presidents
rushmore_prez_sentences = get_presidents_sentences(["washington", "jefferson", "lincoln", "theodore-roosevelt"])

# view most frequently used words of presidents
rushmore_most_freq_words = most_frequent_words(rushmore_prez_sentences)
# print(rushmore_most_freq_words)

# create gensim model for the presidents
rushmore_embeddings = gensim.models.Word2Vec(rushmore_prez_sentences, size=96, window=5, min_count=1, workers=2, sg=1)

# view words similar to freedom for presidents
rushmore_similar_to_freedom = rushmore_embeddings.wv.most_similar("freedom", topn=20)
print(rushmore_similar_to_freedom)

rushmore_similar_to_problems = rushmore_embeddings.wv.most_similar("problems", topn=20)
print(rushmore_similar_to_problems)

GENERATING TEXT WITH DEEP LEARNING

Preprocessing for seq2seq

There are a few neural network libraries to choose from; we’ll be using TensorFlow with the Keras API to build a pretty limited English-to-Spanish translator.

In [None]:
from tensorflow import keras

We’ll need the following for our Keras implementation:

vocabulary sets for both our input (English) and target (Spanish) data
the total number of unique word tokens we have for each set
the maximum sentence length we’re using for each language

In [None]:
from tensorflow import keras
import re
# Build the seq2seq training corpus from a tab-separated translation file:
# sentence lists, sorted vocabularies, vocabulary sizes and maximum sentence
# lengths for the input (English) and target (Spanish) sides.

# Importing our translations
data_path = "span-eng.txt"
# Defining lines as a list of each line
with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read().split('\n')

# A token is either a run of word characters/apostrophes
# or a single punctuation mark
token_pattern = re.compile(r"[\w']+|[^\s\w]")

# Building empty lists to hold sentences
input_docs = []
target_docs = []
# Building empty vocabulary sets
input_tokens = set()
target_tokens = set()

for line in lines:
  # Skip blank lines: a file ending in a newline leaves a trailing empty
  # string that cannot be unpacked into an (input, target) pair
  if not line.strip():
    continue
  # Input and target sentences are separated by tabs
  input_doc, target_doc = line.split('\t')
  # Appending each input sentence to input_docs
  input_docs.append(input_doc)
  # Splitting words from punctuation
  target_doc = " ".join(token_pattern.findall(target_doc))
  # Wrap the target in start/end markers and append it to target_docs
  target_doc = "<START> " + target_doc + " <END>"
  target_docs.append(target_doc)

  # Add each unique word to our vocabulary sets
  # (a set ignores duplicates, so no membership test is needed)
  input_tokens.update(token_pattern.findall(input_doc))
  target_tokens.update(target_doc.split())

input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)

# Create num_encoder_tokens and num_decoder_tokens:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# Longest sentence on each side, in tokens; 0 when there are no sentences.
# Target sentences are already space-separated, so they are measured with
# split() - the regex would break "<START>" and "<END>" into three tokens each.
max_encoder_seq_length = max(
    (len(token_pattern.findall(doc)) for doc in input_docs), default=0)
max_decoder_seq_length = max(
    (len(doc.split()) for doc in target_docs), default=0)

Training Setup (part 1)
 In a one-hot vector, every token in our set is represented by a 0 except for the current token which is represented by a 1. For example given the vocabulary ["the", "dog", "licked", "me"], a one-hot vector for “dog” would look like [0, 1, 0, 0].
 Because each matrix is almost all zeros, we’ll use numpy.zeros() from the NumPy library to build them out.

In [None]:
import numpy as np
 
# All-zero 3-D matrix: one row per input sentence, one column per timestep,
# and one slot per unique input token (filled with one-hot values later)
encoder_input_data = np.zeros(
    shape=(len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)

In the call above, np.zeros() takes two arguments:

the shape of the matrix — in our case the number of documents (or sentences) by the maximum token sequence length (the longest sentence we want to see) by the number of unique tokens (or words)
the data type we want — in our case NumPy’s float32, which can speed up our processing a bit

In [None]:
from tensorflow import keras
import re
# Build the seq2seq training corpus from a tab-separated translation file:
# sentence lists, sorted vocabularies, vocabulary sizes and maximum sentence
# lengths for the input (English) and target (Spanish) sides.

# Importing our translations
data_path = "span-eng.txt"
# Defining lines as a list of each line
with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read().split('\n')

# A token is either a run of word characters/apostrophes
# or a single punctuation mark
token_pattern = re.compile(r"[\w']+|[^\s\w]")

# Building empty lists to hold sentences
input_docs = []
target_docs = []
# Building empty vocabulary sets
input_tokens = set()
target_tokens = set()

for line in lines:
  # Skip blank lines: a file ending in a newline leaves a trailing empty
  # string that cannot be unpacked into an (input, target) pair
  if not line.strip():
    continue
  # Input and target sentences are separated by tabs
  input_doc, target_doc = line.split('\t')
  # Appending each input sentence to input_docs
  input_docs.append(input_doc)
  # Splitting words from punctuation
  target_doc = " ".join(token_pattern.findall(target_doc))
  # Wrap the target in start/end markers and append it to target_docs
  target_doc = '<START> ' + target_doc + ' <END>'
  target_docs.append(target_doc)

  # Add each unique word to our vocabulary sets
  # (a set ignores duplicates, so no membership test is needed)
  input_tokens.update(token_pattern.findall(input_doc))
  target_tokens.update(target_doc.split())

input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)

# Create num_encoder_tokens and num_decoder_tokens:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# Longest sentence on each side, in tokens; 0 when there are no sentences.
# Target sentences are already space-separated, so they are measured with
# split() - the regex would break "<START>" and "<END>" into three tokens each.
max_encoder_seq_length = max(
    (len(token_pattern.findall(doc)) for doc in input_docs), default=0)
max_decoder_seq_length = max(
    (len(doc.split()) for doc in target_docs), default=0)


In [None]:
from tensorflow import keras
import numpy as np
from preprocessing import input_docs, target_docs, input_tokens, target_tokens, num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length

# Report the size of the corpus and of both vocabularies
print('Number of samples:', len(input_docs))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

# token -> position lookups, following the order of the token lists
input_features_dict = {
    token: index for index, token in enumerate(input_tokens)}
target_features_dict = {
    token: index for index, token in enumerate(target_tokens)}

# position -> token lookups, used to decode sequences back to
# something readable
reverse_input_features_dict = {
    index: token for token, index in input_features_dict.items()}
reverse_target_features_dict = {
    index: token for token, index in target_features_dict.items()}

# All-zero matrix: sentences x encoder timesteps x input vocabulary
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)
print("\nHere's the first item in the encoder input matrix:\n", encoder_input_data[0], "\n\nThe number of columns should match the number of unique input tokens and the number of rows should match the maximum sequence length for input sentences.")

# All-zero matrices: sentences x decoder timesteps x target vocabulary
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)

Training Setup (part 2)
To build out a three-dimensional NumPy matrix of one-hot vectors, we can assign a value of 1 for a given word at a given timestep in a given line:

In [None]:
# Template: set the one-hot entry for `token` at position (line, timestep) to 1
matrix_name[line, timestep, features_dict[token]] = 1.

Keras will fit — or train — the seq2seq model using these matrices of one-hot vectors:

the encoder input data
the decoder input data
the decoder target data

In [None]:
from tensorflow import keras
import numpy as np
import re
from preprocessing import input_docs, target_docs, input_tokens, target_tokens, num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length, max_decoder_seq_length

# token -> position lookups, following the order of the token lists
input_features_dict = {
    token: index for index, token in enumerate(input_tokens)}
target_features_dict = {
    token: index for index, token in enumerate(target_tokens)}

# position -> token lookups
reverse_input_features_dict = {
    index: token for token, index in input_features_dict.items()}
reverse_target_features_dict = {
    index: token for token, index in target_features_dict.items()}

# All-zero one-hot containers: sentences x timesteps x vocabulary size
encoder_input_data = np.zeros(
    (len(input_docs), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)
decoder_input_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)
decoder_target_data = np.zeros(
    (len(input_docs), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)

for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):

  # Encoder side: mark each token of the input sentence at its timestep
  input_words = re.findall(r"[\w']+|[^\s\w]", input_doc)
  for timestep, token in enumerate(input_words):
    print("Encoder input timestep & token:", timestep, token)
    feature_index = input_features_dict[token]
    print(feature_index)
    encoder_input_data[line, timestep, feature_index] = 1

  # Decoder side: the target sentence is already space-separated
  for timestep, token in enumerate(target_doc.split()):
    print("Decoder input timestep & token:", timestep, token)
    feature_index = target_features_dict[token]
    decoder_input_data[line, timestep, feature_index] = 1
    if timestep == 0:
      # The first token has no slot in the target data
      continue
    # decoder_target_data is ahead of decoder_input_data by one timestep,
    # so the same token is written one position earlier
    print("Decoder target timestep:", timestep)
    decoder_target_data[line, timestep - 1, feature_index] = 1

Encoder Training Setup

Our encoder requires two layer types from Keras:

An input layer, which defines a matrix to hold all the one-hot vectors that we’ll feed to the model.
An LSTM layer, with some output dimensionality.
We can import these layers as well as the model we need like so:

In [None]:
from keras.layers import Input, LSTM
from keras.models import Model

Next, we set up the input layer, which requires the shape of the data we’re providing. In Keras, the shape does not include the batch size, so we only describe a single sample: a sequence of one-hot vectors, each as wide as the number of encoder tokens. We don’t know in advance how many timesteps (tokens) a sentence will have. Fortunately, we can say None for that dimension, and the layer will accept sequences of varying length.

In [None]:
# the shape specifies the input matrix sizes:
# None leaves that dimension unspecified; the last one is num_encoder_tokens
encoder_inputs = Input(shape=(None, num_encoder_tokens))

For the LSTM layer, we need to select the dimensionality (the size of the LSTM’s hidden states, which helps determine how closely the model molds itself to the training data — something we can play around with) and whether to return the state (in this case we do):

In [None]:
# Encoder LSTM; return_state=True makes it return its states as well
encoder_lstm = LSTM(100, return_state=True)
# we're using a dimensionality of 100
# so any LSTM output matrix will have
# shape [batch_size, 100]

Remember, the only thing we want from the encoder is its final states. We can get these by linking our LSTM layer with our input layer:

In [None]:
# Link the LSTM to the input layer; it returns its output plus two states
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)

encoder_outputs isn’t really important for us, so we can just discard it. However, the states, we’ll save in a list:

In [None]:
# Keep only the two states, stored together in a list
encoder_states = [state_hidden, state_cell]

In [None]:
# We’ve moved the code from the previous exercises into another file

from prep import num_encoder_tokens

from tensorflow import keras
from keras.layers import Input, LSTM
from keras.models import Model

# Encoder: an input layer feeding an LSTM whose two states are kept.

# Create the input layer:
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# Create the LSTM layer (256 units, also returning its states):
encoder_lstm = LSTM(256, return_state=True)

# Retrieve the outputs and states:
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
# Put the states together in a list:
encoder_states = [state_hidden, state_cell]

The decoder looks a lot like the encoder (phew!), with an input layer and an LSTM layer that we use together:

In [None]:
# Decoder input layer: the last dimension is num_decoder_tokens
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(100, return_sequences=True, return_state=True)
# This time we care about full return sequences

However, with our decoder, we pass in the state data from the encoder, along with the decoder inputs. This time, we’ll keep the output instead of the states:

In [None]:
# The two states will be discarded for now.
# The call is wrapped inside its own parentheses so the statement can span
# two lines; ending a line on a bare "=" is a syntax error.
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(
    decoder_inputs, initial_state=encoder_states)

We also need to run the output through a final activation layer, using the Softmax function, that will give us the probability distribution — where all probabilities sum to one — for each token. The final layer also transforms our LSTM output from a dimensionality of whatever we gave it (in our case, 100) to the number of unique words within the hidden layer’s vocabulary (i.e., the number of unique target tokens, which will usually be a different number!).

In [None]:
# Softmax layer with one output per unique target token
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# Pass the LSTM output through the dense layer
decoder_outputs = decoder_dense(decoder_outputs)

Keras’s implementation could work with several layer types, but Dense is the least complex, so we’ll go with that. We also need to modify our import statement to include it before running the code:

In [None]:
from keras.layers import Input, LSTM, Dense

In [None]:
# we’ve already set up the decoder input and LSTM layers
from prep import num_encoder_tokens, num_decoder_tokens

from tensorflow import keras
# Add Dense to the imported layers
from keras.layers import Input, LSTM, Dense
from keras.models import Model


# Encoder training setup
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

# The decoder input and LSTM layers:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)

# Retrieve the LSTM outputs and states
# (the decoder starts from the encoder's states):
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Build a final Dense layer (softmax, one output per unique target token):
decoder_dense =  Dense(num_decoder_tokens, activation='softmax')

# Filter outputs through the Dense layer:
decoder_outputs = decoder_dense(decoder_outputs)

Build and Train seq2seq

First, we define the seq2seq model using the Model() function we imported from Keras. To make it a seq2seq model, we feed it the encoder and decoder inputs, as well as the decoder output:

In [None]:
# Training model: inputs are the encoder and decoder input layers,
# output is the decoder's dense-layer output
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

Finally, our model is ready to train. First, we compile everything. Keras models demand two arguments to compile:

An optimizer (we’re using RMSprop, which is a fancy version of the widely-used gradient descent) to help minimize our error rate (how bad the model is at guessing the true next word given the previous words in a sentence).
A loss function (we’re using the logarithm-based cross-entropy function) to determine the error rate.
Because we care about accuracy, we’re adding that into the metrics to pay attention to while training. Here’s what the compiling code looks like:

In [None]:
# Compile with the RMSprop optimizer and categorical cross-entropy loss,
# tracking accuracy during training
model.compile(optimizer='rmsprop', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Next we need to fit the compiled model. To do this, we give the .fit() method the encoder and decoder input data (what we pass into the model), the decoder target data (what we expect the model to return given the data we passed in), and some numbers we can adjust as needed:

batch size (smaller batch sizes mean more time, and for some problems, smaller batch sizes will be better, while for other problems, larger batch sizes are better)
the number of epochs or cycles of training (more epochs mean a model that is more trained on the dataset, and that the process will take more time)
validation split (what percentage of the data should be set aside for validating — and determining when to stop training your model — rather than training)
Keras will take it from here to get you a (hopefully) nicely trained seq2seq model:

In [None]:
# Train on the encoder/decoder input matrices against the decoder targets,
# with validation_split=0.2 setting aside a share of the data for validation
model.fit([encoder_input_data, decoder_input_data], 
          decoder_target_data,
          batch_size=10,
          epochs=100,
          validation_split=0.2)

In [None]:
from prep import num_encoder_tokens, num_decoder_tokens, decoder_target_data, encoder_input_data, decoder_input_data, decoder_target_data

from tensorflow import keras
# Add Dense to the imported layers
from keras.layers import Input, LSTM, Dense
from keras.models import Model

# Build, compile and train the full encoder-decoder training model.

# Encoder training setup
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
encoder_states = [state_hidden, state_cell]

# Decoder training setup:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Building the training model:
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

print("Model summary:\n")
training_model.summary()
print("\n\n")

# Compile the model:
training_model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

# Choose the batch size
# and number of epochs:
# Because we don't want to crash this exercise, we'll make the batch size
# large and the number of epochs very small. (Note that small batch sizes
# are more prone to crashing a deep learning program in general, but in our
# case we care about time.)
# NOTE(review): epochs is set to 50 below, which does not look "very small"
# - confirm the intended value.
batch_size = 50
epochs = 50

print("Training the model:\n")
# Train the model:
training_model.fit([encoder_input_data, decoder_input_data],decoder_target_data, batch_size=batch_size, epochs=epochs,validation_split=0.2)

Setup for Testing

However, to generate some original output text, we need to redefine the seq2seq architecture in pieces. The model we used for training our network only works when we already know the target sequence. This time, we have no idea what the Spanish should be for the English we pass in! So we need a model that will decode step-by-step instead of using teacher forcing. To do this, we need a seq2seq network in individual pieces.

To start, we’ll build an encoder model with our encoder inputs and the placeholders for the encoder’s output states:

encoder_model = Model(encoder_inputs, encoder_states)
Next up, we need placeholders for the decoder’s input states, which we can build as input layers and store together. Why? We don’t know what we want to decode yet or what hidden state we’re going to end up with, so we need to do everything step-by-step. We need to pass the encoder’s final hidden state to the decoder, sample a token, and get the updated hidden state back. Then we’ll be able to (manually) pass the updated hidden state back into the network:

latent_dim = 256
decoder_state_input_hidden = Input(shape=(latent_dim,))
 
decoder_state_input_cell = Input(shape=(latent_dim,))
 
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
Using the decoder LSTM and decoder dense layer (with the activation function) that we trained earlier, we’ll create new decoder states and outputs:

# Run the trained decoder LSTM from the placeholder input states. The call is
# wrapped inside its own parentheses; ending a line on a bare "=" is a syntax error.
decoder_outputs, state_hidden, state_cell = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs)
 
# Saving the new LSTM output states:
decoder_states = [state_hidden, state_cell]
 
# Below, we redefine the decoder output
# by passing it through the dense layer:
decoder_outputs = decoder_dense(decoder_outputs)
Finally, we can set up the decoder model. This is where we bring together:

the decoder inputs (the decoder input layer)
the decoder input states (the final states from the encoder)
the decoder outputs (the NumPy matrix we get from the final output layer of the decoder)
the decoder output states (the memory throughout the network from one word to the next)
decoder_model = Model(
  [decoder_inputs] + decoder_states_inputs,
  [decoder_outputs] + decoder_states)

In [None]:
from training import encoder_inputs, decoder_inputs, encoder_states, decoder_lstm, decoder_dense

from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model

# Rebuild separate encoder and decoder models for step-by-step inference.

training_model = load_model('training_model.h5')
# These next lines are only necessary
# because we're using a saved model:
encoder_inputs = training_model.input[0]
# NOTE(review): layers[2] is presumably the encoder LSTM - this relies on the
# layer order of the saved model; confirm
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]

# Building the encoder test model:
encoder_model = Model(encoder_inputs, encoder_states)

# Must match the number of units used for the LSTM layers
latent_dim = 256
# Building the two decoder state input layers:
decoder_state_input_hidden = Input(shape=(latent_dim,))

decoder_state_input_cell = Input(shape=(latent_dim,))

# Put the state input layers into a list:
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

# Call the decoder LSTM:
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs,initial_state=decoder_states_inputs)
# Saving the new LSTM output states:
decoder_states = [state_hidden, state_cell]

# Redefine the decoder outputs:
decoder_outputs = decoder_dense(decoder_outputs)

# Build the decoder test model:
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

The Test Function

The Test Function
Finally, we can get to testing our model! To do this, we need to build a function that:

accepts a NumPy matrix representing the test English sentence input
uses the encoder and decoder we’ve created to generate Spanish output
Inside the test function, we’ll run our new English sentence through the encoder model. The .predict() method takes in new input (as a NumPy matrix) and gives us output states that we can pass on to the decoder:

# test_input is a NumPy matrix
# representing an English sentence.
# The encoder test model built earlier is named encoder_model.
states = encoder_model.predict(test_input)
Next, we’ll build an empty NumPy array for our Spanish translation, giving it three dimensions:

# batch size: 1
# number of tokens to start with: 1
# number of tokens in our target vocabulary
target_sequence = np.zeros((1, 1, num_decoder_tokens))
Luckily, we already know the first value in our Spanish sentence — "<START>"! So we can give "<START>" a value of 1 at the first timestep:

target_sequence[0, 0, target_features_dict['<START>']] = 1.
Before we get decoding, we’ll need a string where we can add our translation to, word by word:

decoded_sentence = ''
This is the variable that we will ultimately return from the function.

In [None]:
from training import encoder_inputs, decoder_inputs, encoder_states, decoder_lstm, decoder_dense, encoder_input_data, num_decoder_tokens

from prep import target_features_dict, reverse_target_features_dict, max_decoder_seq_length, input_docs, target_docs, target_tokens

from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
import numpy as np

# Rebuild separate encoder and decoder models for step-by-step inference.
training_model = load_model('training_model.h5')
encoder_inputs = training_model.input[0]
# NOTE(review): layers[2] is presumably the encoder LSTM - this relies on the
# layer order of the saved model; confirm
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholder inputs for the decoder's hidden and cell states
latent_dim = 256
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder test model: takes the decoder input plus states,
# returns the dense output plus the new states
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def decode_sequence(test_input):
  """Prepare the starting point for decoding one input sentence.

  This version only runs the encoder and builds the one-hot '<START>'
  vector; no tokens are decoded yet, so the result is always ''.
  """
  # The encoder's predicted states are the decoder's starting states
  decoder_states_value = encoder_model.predict(test_input)

  # Target sequence of length 1 with only the start token switched on
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  start_position = target_features_dict['<START>']
  target_seq[0, 0, start_position] = 1.

  return ''

# Decode the first ten encoded sentences and print each next to its source
for seq_index in range(10):
  # Slicing (rather than indexing) keeps the leading dimension of the matrix
  test_input = encoder_input_data[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(test_input)
  print('-')
  print('Input sentence:', input_docs[seq_index])
  print('Decoded sentence:', decoded_sentence)

Test Function (part 2)
At long last, it’s translation time. Inside the test function, we’ll decode the sentence word by word using the output state that we retrieved from the encoder (which becomes our decoder’s initial hidden state). We’ll also update the decoder hidden state after each word so that we use previously decoded words to help decode new ones.

To tackle one word at a time, we need a while loop that will run until one of two things happens (we don’t want the model generating words forever):

The current token is "<END>".
The decoded Spanish sentence length hits the maximum target sentence length.
Inside the while loop, the decoder model can use the current target sequence (beginning with the "<START>" token) and the current state (initially passed to us from the encoder model) to get a bunch of possible next words and their corresponding probabilities. In Keras, it looks something like this:

# One decoder step: token probabilities plus the updated states. The call is
# wrapped inside its own parentheses; ending a line on a bare "=" is a syntax error.
output_tokens, new_decoder_hidden_state, new_decoder_cell_state = decoder_model.predict(
    [target_seq] + decoder_states_value)
Next, we can use NumPy’s .argmax() method to determine the token (word) with the highest probability and add it to the decoded sentence:

# slicing [0, -1, :] gives us a
# specific token vector within the
# 3d NumPy matrix
sampled_token_index = np.argmax(
    output_tokens[0, -1, :])
 
# The reverse features dictionary
# translates back from index to Spanish
sampled_token = reverse_target_features_dict[
    sampled_token_index]
 
decoded_sentence += " " + sampled_token
Our final step is to update a few values for the next word in the sequence:

# Move to the next timestep 
# of the target sequence:
target_seq = np.zeros((1, 1, num_decoder_tokens))
target_seq[0, 0, sampled_token_index] = 1.
 
# Update the states with values from
# the most recent decoder prediction:
decoder_states_value = [
    new_decoder_hidden_state,
    new_decoder_cell_state]
And now we can test it all out!

You may recall that, because of platform constraints here, we’re using very little data. As a result, we can only expect our model to translate a handful of sentences coherently. Luckily, you will have an opportunity to try this out on your own computer with far more data to see some much more impressive results.

In [None]:
from training import encoder_inputs, decoder_inputs, encoder_states, decoder_lstm, decoder_dense, encoder_input_data, num_decoder_tokens

from prep import target_features_dict, reverse_target_features_dict, max_decoder_seq_length, input_docs, target_docs, target_tokens

from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model
import numpy as np

# Rebuild separate encoder and decoder models for step-by-step inference.
training_model = load_model('training_model.h5')
encoder_inputs = training_model.input[0]
# NOTE(review): layers[2] is presumably the encoder LSTM - this relies on the
# layer order of the saved model; confirm
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

# Placeholder inputs for the decoder's hidden and cell states
latent_dim = 256
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
# Decoder test model: takes the decoder input plus states,
# returns the dense output plus the new states
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def decode_sequence(test_input):
  """Greedily translate one encoded input sentence, one token at a time.

  Args:
    test_input: encoder input for a single sentence, passed straight to
      encoder_model.predict (assumed to be a one-sentence slice of
      encoder_input_data - TODO confirm).

  Returns:
    The decoded sentence as a string in which every token, including a
    final '<END>' when one is produced, is preceded by a single space.
  """
  # The encoder's final states seed the decoder
  decoder_states_value = encoder_model.predict(test_input)

  # Target sequence of length 1 holding only the start token
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_features_dict['<START>']] = 1.

  decoded_tokens = []
  while True:
    # Run the decoder model to get possible
    # output tokens (with probabilities) & states
    output_tokens, new_decoder_hidden_state, new_decoder_cell_state = decoder_model.predict(
        [target_seq] + decoder_states_value)

    # Choose token with highest probability
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_tokens.append(sampled_token)

    # Exit condition: either find the stop token or hit max length.
    # The limit is a number of tokens, so count tokens rather than the
    # characters of the sentence built so far.
    if sampled_token == '<END>' or len(decoded_tokens) > max_decoder_seq_length:
      break

    # Feed the sampled token back in as the next input (sequence of length 1)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    # Carry the updated states into the next step
    decoder_states_value = [new_decoder_hidden_state, new_decoder_cell_state]

  # The loop always decodes at least one token
  return ' ' + ' '.join(decoded_tokens)

# Decode the first ten encoded sentences and print each next to its source
for seq_index in range(10):
  # Slicing (rather than indexing) keeps the leading dimension of the matrix
  test_input = encoder_input_data[seq_index: seq_index + 1]
  decoded_sentence = decode_sequence(test_input)
  print('-')
  print('Input sentence:', input_docs[seq_index])
  print('Decoded sentence:', decoded_sentence)

Coursera

In [None]:
# UNQ_C1 GRADED FUNCTION: sigmoid
def sigmoid(z):
    '''
    Compute the logistic sigmoid, 1 / (1 + e^(-z)).
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''

    ### START CODE HERE ###
    # calculate the sigmoid of z
    exp_of_negative_z = np.exp(-z)
    h = 1 / (1 + exp_of_negative_z)
    ### END CODE HERE ###

    return h

In [None]:
# UNQ_C2 GRADED FUNCTION: gradientDescent
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Train logistic-regression weights with batch gradient descent.
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost as a Python float. It is the cost evaluated just
           before the last weight update, or the cost of the initial
           theta when num_iters is 0.
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    ### START CODE HERE ###
    # get 'm', the number of rows in matrix x
    m = len(x)

    def predict_and_cost(weights):
        # sigmoid of the dot product of x and the weights
        h = 1 / (1 + np.exp(-np.dot(x, weights)))
        # log-loss cost function
        cost = -1 / m * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))
        return h, cost

    J = None
    for _ in range(num_iters):
        h, J = predict_and_cost(theta)

        # update the weights theta
        theta = theta - (alpha / m) * np.dot(x.T, (h - y))

    if J is None:
        # No iteration ran: report the cost of the unchanged weights instead
        # of failing on an unassigned J
        _, J = predict_and_cost(theta)
    ### END CODE HERE ###

    # J is a 1x1 array; squeeze it to 0-d before converting, because
    # float() on an array with dimensions is deprecated in recent NumPy
    J = float(np.squeeze(J))
    return J, theta