In [3]:
import nltk # https://www.nltk.org/install.html  
import numpy # https://www.scipy.org/install.html  
import matplotlib.pyplot # https://matplotlib.org/downloads.html  

#You also need to run nltk.download() in order to download NLTK before proceeding:
#nltk.download()
from nltk.book import *
# could also be
# from nltk.corpus import brown
# brown.words()

# I run the code underneath to get output for all code and not just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


* Part I:

☼ Try using the Python interpreter as a calculator, and typing expressions like 12 / (4 + 1).

☼ Given an alphabet of 26 letters, there are 26 to the power 10, or 26 ** 10, ten-letter strings we can form. That works out to 141167095653376. How many hundred-letter strings are possible?

☼ The Python multiplication operation can be applied to lists. What happens when you type ['Monty', 'Python'] * 20, or 3 * sent1?


In [None]:
# 1.1) 
12 / (4 + 1)
# 1.2) 
26**10
# 1.2) 
26**100
# 1.3) 
['Monty', 'Python'] * 20
LOL = ('Mony', 'Python')
3 * LOL

* Part II:

☼ Review 1 on computing with language. How many words are there in text2? How many distinct words are there?

☼ Compare the lexical diversity scores for humor and romance fiction in 1.1. Which genre is more lexically diverse?

☼ Produce a dispersion plot of the four main protagonists in Sense and Sensibility: Elinor, Marianne, Edward, and Willoughby. What can you observe about the different roles played by the males and females in this novel? Can you identify the couples?

☼ Find the collocations in text5.

☼ Consider the following Python expression: len(set(text4)). State the purpose of this expression. Describe the two steps involved in performing this computation.

In [None]:
# 2.1, first total number of tokens, then the total number of words, and then the number of unique words:
len(text2) # 141576 tokens in total (signs, words, entities of strings in the text, e.g. "mom", "?", ",", ".", "--")
words_t2 = [w for w in text2 if w.isalpha() == True] # looping though the list to get the number of words - that is without any signs 
len(words_t2) # 120733 total number of words (sings excluded)
# list(words_t2) # list of all the words in the text (excluding signs)

#set(words_t2) # list/presentation of distrinct words / vocabulary
#sorted(set(words_t2)) # the alphabetically sorted different words / vocabulary 
len(set(words_t2))# 6833 unique words - length/number of unique words 

#Now, let's calculate a measure of the lexical richness of the text. 
# The next example shows us that the number of distinct words is just 6% of the total number of words, 
# or equivalently that each word is used 16 times on average 
len(set(words_t2)) / len(words_t2) # 0.055602030927749665



In [None]:
# 2.2 lexical diversity
# lexical diversity 
# To repeat calculations on several texts, without retyping a formula, you can come up with
# your own name for a task, like "lexical_diversity" or "percentage", 
# and associate it with a block of code. Now you only have to type a short name instead of
# one or more complete lines of Python code, and you can re-use it as often as you like. 
# The block of code that does a task for us is called a function, 
# and we define a short name for our function with the keyword def. 

# function that calculates lexical diversity 
def lexical_diversity(text):
    return len(set(text)) / len(text)
# Calculates what percentage of the text is taken up by a specific word
def percentage(count, total):
    return 100 * count / total
    # e.g. percentage(text4.count('a'), len(text4))
    
# compare lexical diversity between text 2 and text 6 because text 2 is more romance 
# and text 6 is more humour 
words_t6 = [w for w in text6 if w.isalpha() == True] # looping though the list to get the number of words - that is without any signs 

print(lexical_diversity(words_t2)) # 0.055602030927749665
print(lexical_diversity(words_t6)) # 0.18427947598253275
# this means that the number of distinct words is only used 6% of the total number of words in text 2,
# while the number of distinct words is 18%  of the total number of words in thext 6. 
# Therefore the lexical diversity seems to be grater in the humor text



In [None]:
#2.3 Produce a dispersion plot of the four main protagonists in Sense and Sensibility (text 2): 
# Elinor, Marianne, Edward, and Willoughby. What can you observe about the different roles
# played by the males and females in this novel? Can you identify the couples?
text2.dispersion_plot(["Elinor", "Marianne", "Edward", "Willoughby"])
# example: this function let us see the different contexts in which the word occur
#text2.concordance("affection")
# this function lets us see the words that appear in a similar range of contexts
#text2.similar("Elinor")
# The term common_contexts allows us to examine just the contexts that are shared by two 
# or more words:
text2.common_contexts(["Elinor", "Willoughby"])
text2.common_contexts(["Elinor", "Edward"])
text2.common_contexts(["Marianne", "Willoughby"])
text2.common_contexts(["Marianne", "Edward"])
#text2_collocations()
fdist2 = FreqDist(text2)
fdist2.most_common(50)
#new = text2-fdist2
# wtf!! new = text2[-fdist2]
#[1:9] = ['Second', 'Third'] [2]



In [4]:
# 2.4 Find the Collocations in text 5
# A collocation is a sequence of words that occur together unusually often. Thus red wine is 
# a collocation, whereas the wine is not. A characteristic of collocations is that they are
# resistant to substitution with words that have similar senses. (bigrams = list of word pairs)
# collocations are essentially just frequent bigrams, except that we want to pay more attention 
# to the cases that involve rare words. In particular, we want to find bigrams that occur more
# often than we would expect based on the frequency of the individual words
#text4.collocations()
text5.collocation_list()

['wanna chat',
 'PART JOIN',
 'MODE #14-19teens',
 'JOIN PART',
 'PART PART',
 'cute.-ass MP3',
 'MP3 player',
 'JOIN JOIN',
 'times .. .',
 'ACTION watches',
 'guys wanna',
 'song lasts',
 'last night',
 'ACTION sits',
 '-...)...- S.M.R.',
 'Lime Player',
 'Player 12%',
 'dont know',
 'lez gurls',
 'long time']

In [None]:
# Consider the following Python expression: len(set(text4)). State the purpose of this 
# expression. Describe the two steps involved in performing this computation.
unique_words_t4 = set(text4) # # list/presentation of distrinct words / vocabulary
len(unique_words_t4)# 6833 unique words - length/number of unique words 
#sorted(set(text4)) # the alphabetically sorted different words / vocabulary 
# len(set(text4))



* Part III:

☼ Review 2 on lists and strings.

Define a string and assign it to a variable, e.g., my_string = 'My String' (but put something more interesting in the string). Print the contents of this variable in two ways, first by simply typing the variable name and pressing enter, then by using the print statement.
Try adding the string to itself using my_string + my_string, or multiplying it by a number, e.g., my_string * 3. Notice that the strings are joined together without any spaces. How could you fix this?    

☼ Define a variable my_sent to be a list of words, using the syntax my_sent = ["My", "sent"] (but with your own words, or a favorite saying).
Use ' '.join(my_sent) to convert this into a string.
Use split() to split the string back into the list form you had to start with.    

☼ Define several variables containing lists of words, e.g., phrase1, phrase2, and so on. Join them together in various combinations (using the plus operator) to form whole sentences. What is the relationship between len(phrase1 + phrase2) and len(phrase1) + len(phrase2)?

☼ Consider the following two expressions, which have the same value. Which one will typically be more relevant in NLP? Why?  
- "Monty Python"[6:12]
- ["Monty", "Python"][1]

☼ We have seen how to represent a sentence as a list of words, where each word is a sequence of characters. What does sent1[2][2] do? Why? Experiment with other index values.

☼ The first sentence of text3 is provided to you in the variable sent3. The index of the in sent3 is 1, because sent3[1] gives us 'the'. What are the indexes of the two other occurrences of this word in sent3?

In [None]:
# 3.1 string
my_string = "birthday"
my_string
print(my_string)
my_string + my_string
my_string*3
# how can we make space between the birthday words? 
# adding a space in the orignial string?
my_string = my_string + " "
my_string*3

In [None]:
# 3.2 variable 
#my_sent = ["That", "was", "an", "amazing", "weekend"]
# Use ' '.join(my_sent) to convert this into a string
new_my_sent = ' '.join(my_sent)
# Use split() to split the string back into the list form you had to start with
new_my_sent.split()


In [None]:
# 3.3 
phrase1 = "This is a good view "
phrase2 = "and the weather is so nice"
phrase1 + phrase2
len(phrase1 + phrase2) 
len(phrase1) + len(phrase2)
# same result 

In [None]:
# 3.4 
# Consider the following two expressions which have the same value
# Which one will typically be more relevant in NLP? Why?
"Monty Python"[6:12]
["Monty", "Python"][1]
# second? beacuse we mostly work with lists or sets of words and acces them through indexing, 
# instead of working with strings??? I don't know :D 

In [None]:
# 3.5
# We have seen how to represent a sentence as a list of words, where each word is a sequence of characters. 
# What does sent1[2][2] do? Why? Experiment with other index values.
sent1[2][2] # this indexing accesses the 3rd word in the list, and the 3rd letter in that word
sent1[2]
sent1

In [5]:
# 3.6
# The first sentence of text3 is provided to you in the variable sent3.
# The index of the in sent3 is 1, because sent3[1] gives us 'the'. 
# What are the indexes of the two other occurrences of this word in sent3?
sent3

sent3.index('the') # GIVER KUN DEN FØRSTE OCCURENCE
sent3_the = [i for i, x in enumerate(sent3) if x == "the"]
sent3_the

['In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

1

[1, 5, 8]

* Part V:

◑ Review the discussion of looping with conditions in 4. Use a combination of for and if statements to loop over the words of the movie script for Monty Python and the Holy Grail (text6) and print all the uppercase words, one per line.

◑ Write expressions for finding all words in text6 that meet the conditions listed below. The result should be in the form of a list of words: ['word1', 'word2', ...].

Ending in ise
Containing the letter z
Containing the sequence of letters pt
Having all lowercase letters except for an initial capital (i.e., titlecase)  
◑ Define sent to be the list of words ['she', 'sells', 'sea', 'shells', 'by', 'the', 'sea', 'shore']. Now write code to perform the following tasks:

Print all words beginning with sh
Print all words longer than four characters  
◑ What does the following Python code do?  sum(len(w) for w in text1) Can you use it to work out the average word length of a text?

◑ Define a function called vocab_size(text) that has a single parameter for the text, and which returns the vocabulary size of the text.

◑ Define a function percent(word, text) that calculates how often a given word occurs in a text, and expresses the result as a percentage.

◑ We have been using sets to store vocabularies. Try the following Python expression: set(sent3) < set(text1). Experiment with this using different arguments to set(). What does it do? Can you think of a practical application for this?

In [13]:
# 5.1 
# Review the discussion of looping with conditions in 4. Use a combination of for and if 
# statements to loop over he words of the movie script for Monty Python and the 
# Holy Grail (text6) and print all the uppercase words, one per line.
# from the book ch. 4.1

upper_words = [w for w in text6 if w.isupper() == True]
len(upper_words)

#sent7
#[w for w in sent7 if len(w) < 4]
#[w for w in sent7 if len(w) <= 4]
#[w for w in sent7 if len(w) == 4]
#[w for w in sent7 if len(w) != 4]

1772

In [105]:
# 5.2
# Write expressions for finding all words in text6 that meet the conditions listed below. 
# The result should be in the form of a list of words: ['word1', 'word2', ...].
# Ending in ise 
# Containing the letter z 
# Containing the sequence of letters pt 
# Having all lowercase letters except for an initial capital (i.e., titlecase)
"""
for w in text6:
    if w.endswith('ise'):
        print(w, end = ", ")
    elif 'z' in w: 
        print(w, end = ", ")
    elif 'pt' in w: 
        print(w, end = ", ")
    elif w.istitle():
        print(w, end = ", ")
"""
"""
words_52 = []
for w in text6:
    if w.endswith('ise'):
        words_52.append(w)
    elif 'z' in w: 
        words_52.append(w)
    elif 'pt' in w: 
        words_52.append(w)
    elif w.istitle():
        words_52.append(w)
#print(words_52)
"""
wor = []
for w in text6:
    if w.endswith('ise') and 'z' in w and 'pt' in w and w.istitle():
        wor.append(w)
print(wor)
len(wor)


'\nfor w in text6:\n    if w.endswith(\'ise\'):\n        print(w, end = ", ")\n    elif \'z\' in w: \n        print(w, end = ", ")\n    elif \'pt\' in w: \n        print(w, end = ", ")\n    elif w.istitle():\n        print(w, end = ", ")\n'

"\nwords_52 = []\nfor w in text6:\n    if w.endswith('ise'):\n        words_52.append(w)\n    elif 'z' in w: \n        words_52.append(w)\n    elif 'pt' in w: \n        words_52.append(w)\n    elif w.istitle():\n        words_52.append(w)\n#print(words_52)\n"

[]


0

In [45]:
# 5.3
sent = ['she', 'sells', 'sea', 'shells', 'by', 'the', 'sea', 'shore']
# Print all words beginning with sh 
# Print all words longer than four characters
words_53 = []
for w in sent:
    if w.startswith('sh'):
        words_53.append(w)
    elif len(w) > 4: 
        words_53.append(w)   
print(words_53)

# 5.4 - what does the code do
sum(len(w) for w in text1) # makes a list with the length of each word in text1 and 
# sums the total of all these numbers AKA gives the total number of characters 
# (signs and letters) in text1
# Can you use it to work out the average word length of a text?
sum(len(w) for w in text1)/len(text1) # average word length is 3.8 characters



['she', 'sells', 'shells', 'shore']


999044

3.830411128023649

In [91]:
# 5.5
# Define a function called vocab_size(text) 
#that has a single parameter for the text, 
#and which returns the vocabulary size of the text.

def vocab_size(text):
  return len(set(text))

#vocab_size(text1)

# 5.6  Define a function percent(word, text) that calculates how often a given word occurs
# in a text, and expresses the result as a percentage.
def percent(word, text):
  return text.count(word) / len(text)*100

percent('Hello', text6)

# 5.7
# We have been using sets to store vocabularies. 
# Try the following Python expression: set(sent3) < set(text1). 
# Experiment with this using different arguments to set(). 
# What does it do? Can you think of a practical application for this?
set(sent3) < set(text1) # checks if the statement is true 
# Can compare the vocabulary of different texts???


0.15323863971238286

True

11

260819


* Extra:

If you've been through all the exercises, here are 2 retro-engineering tasks:

- Create a *homemade_bigrams* function that takes as input a tokenized text and outputs a list of bigrams (2 words combinations in text). Can you extend it to n-grams?

- Create a *FrequencyDistribution* class similar to the FreqDist class object in the NLTK package. (Don't check the original code!)

In [None]:
def homemade_bigrams(tokenized_text):
    return list(bigrams(tokenized_text))

token_text2 = [w for w in text2 if w.isalpha() == True]

homemade_bigrams(token_text2)


In [None]:
def FrequencyDistribution(text, number): 
    

In [None]:
# cleantext2 =[w for w in text2 if w.isaphla()]
#cleantext2 = []
#for w in text2: 
  #  if w.isaplha():
   #     cleantext2.append(w)
        

# you can index from strings
# string1 = "nice"
#string1[0] = "?"


