text_preprocessing

For this exercise you must use some raw text. Choose a text from https://www.gutenberg.org

Ex 1
Download it through python (inside the code, so you don't have to upload the file too when you send the solution for this exercise) with urlopen() from module urllib and read the entire text in one single string. If the download takes too much time at each running, download the file, but leave the former instructions in a comment (to show that you know how to access an online file)

In [None]:
import urllib.request

url = 'http://www.gutenberg.org/files/73115/73115-0.txt'

with urllib.request.urlopen(url) as file:
    text = file.read().decode('utf-8')



In [None]:
text



In [None]:
cleaned_string = text.replace('\n', '').replace('\r', '').replace('\\', '')
cleaned_string



Ex 2 Remove the header (keep only the text starting from the title)

In [None]:
start_index = cleaned_string.find('THE HOUSE OF BONDAGE')

cleaned_string = cleaned_string[start_index:]
cleaned_string



Ex 3 Print the number of sentences in the text. Print the average length (number of words) of a sentence.

In [None]:
abbr = ['etc.', 'Dr.', 'Mr.', 'Mrs.']
for substring in abbr:
    cleaned_string = cleaned_string.replace(substring, '')


In [None]:
import re

def sentences_words(text):
    sentences = re.split(r'[.!?]', text)
    num_sentences = len(sentences)
    total_words = sum(len(sentence.split()) for sentence in sentences)
    avg_sentence_length = total_words / num_sentences
    return num_sentences, avg_sentence_length

num_sentences, num_words = sentences_words(cleaned_string)
print(f"Number of sentences: {num_sentences}")
print(f"Number of words: {num_words:.2f} words")


Number of sentences: 7449
Number of words: 14.95 words


Ex 4 Find the collocations in the text (bigram and trigram). Use the nltk.collocations module You will print them only once not each time they appear.

In [None]:
import string
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.tokenize import word_tokenize

nltk.download('punkt')

translator = str.maketrans('', '', string.punctuation)
cleaned_string = cleaned_string.translate(translator)

tokens = word_tokenize(cleaned_string)

bigram_finder = BigramCollocationFinder.from_words(tokens)
trigram_finder = TrigramCollocationFinder.from_words(tokens)

bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
trigrams = trigram_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 10)

print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Bigrams: [('did', 'not'), ('had', 'been'), ('of', 'the'), ('she', 'had'), ('in', 'the'), ('New', 'York'), ('to', 'be'), ('he', 'said'), ('she', 'said'), ('at', 'last')]
Trigrams: [('the', 'other', 'of'), ('the', 'door', 'of'), ('the', 'girl', 'of'), ('the', 'sort', 'of'), ('the', 'street', 'of'), ('the', 'first', 'of'), ('the', 'girls', 'of'), ('the', 'world', 'of'), ('the', 'room', 'of'), ('the', 'portion', 'of')]


Ex 5 Create a list of all the words (in lower case) from the text, without the punctuation.

In [None]:
import string

words = cleaned_string.lower().split()

print(words)




Ex 6 Print the first N most frequent words (alphanumeric strings) together with their number of appearances.

In [None]:
import string
from collections import Counter

def frequent_words(words, N):
    word_counts = Counter(words)
    most_common = word_counts.most_common(N)
    return most_common

common_words = frequent_words(words, 5)
common_words

[('the', 5294), ('to', 2840), ('of', 2767), ('and', 2652), ('a', 2289)]

Ex 7 Remove stopwords and assign the result to variable lws

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

lws = remove_stopwords(words)
print(lws)



Ex 8 Apply stemming (Porter) on the list of words (lws). Print the first 200 words. Do you see any words that don't appear in the dictionary?

In [None]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

porter = PorterStemmer()

stemmed_words = [porter.stem(word) for word in lws]

print(stemmed_words[:200])

['hous', 'bondageia', 'spring', 'ownth', 'local', 'weatherprophetsth', 'capeco', 'mennonit', 'thebeard', 'amishmen', 'came', 'town', 'markethad', 'said', 'withchor', 'unanim', 'spring', 'would', 'brief', 'sudden', 'thesumm', 'parch', 'intensealreadi', 'though', 'april', 'dawn', 'pink', 'arbutu', 'bloomedand', 'wither', 'pale', 'first', 'violet', 'peep', 'purpl', 'andfragr', 'among', 'lush', 'grass', 'front', 'yard', 'second', 'streetth', 'annual', 'oriol', 'full', 'fortnight', 'ahead', 'time', 'openinghi', 'summerhous', 'hickorytre', 'southwark', 'lawn', 'upin', 'drone', 'studyroom', 'highschool', 'windowswer', 'wide', 'lazi', 'sunlight', 'miss', 'england', 'begun', 'week', 'todirect', 'thought', 'dwindl', 'seniorclass', 'toward', 'subjectsof', 'graduat', 'essayssway', 'easi', 'languid', 'grace', 'unstudi', 'young', 'animalmari', 'denbigh', 'morningsess', 'end', 'turn', 'graveledwalk', 'schoolground', 'littl', 'town', 'chiefthoroughfarenobodi', 'ever', 'call', 'pretti', 'light', 'serg'

Ex 9 Print a table of three columns (of size N, where N is the maximum length for the words in the text). The columns will be separated with the character "|". The head of the table will be:
Porter    |Lancaster |Snowball
The table will contain only the words that give different stemming results for the three stemmers (for example, suppose that we have both "runs" and "being" inside the text. The word "runs" should not appear in the list, as all three results are "run"; however "being" should appear in the table). The stemming result for the word for each stemmer will appear in the table according to the head of the table. The table will contain the results for the first NW words from the text (the number of rows will obviously be less than NW, as not all words match the requirements). For example, NW=500. Try to print only distinct results inside the table (for example, if a word has two occurnces inside the text, and matches the requirments for appearing in the table, it should have only one corresponding row).

In [None]:
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')


no_dup_words = []
for word in words:
    if word not in no_dup_words:
        no_dup_words.append(word)

stemmed_words = {
    'Porter': [porter.stem(word) for word in no_dup_words],
    'Lancaster': [lancaster.stem(word) for word in no_dup_words],
    'Snowball': [snowball.stem(word) for word in no_dup_words],
}

table_rows = []
for i, word in enumerate(no_dup_words):
    if len(set([stemmed_words['Porter'][i], stemmed_words['Lancaster'][i], stemmed_words['Snowball'][i]])) > 1:
        table_rows.append(f"{stemmed_words['Porter'][i]} | {stemmed_words['Lancaster'][i]} | {stemmed_words['Snowball'][i]}")

print("Porter | Lancaster | Snowball")
print("\n".join(table_rows))

Porter | Lancaster | Snowball
were | wer | were
all | al | all
your | yo | your
local | loc | local
amishmen | amishm | amishmen
came | cam | came
withchor | withch | withchor
sudden | sud | sudden
thesumm | thesum | thesumm
intensealreadi | intensealready | intensealreadi
arbutu | arbut | arbutus
wither | with | wither
pale | pal | pale
streetth | streetthe | streetth
annual | an | annual
wa | was | was
full | ful | full
hi | his | his
time | tim | time
openinghi | openingh | openinghi
summerhous | summerh | summerhous
hickorytre | hickoryt | hickorytre
drone | dron | drone
where | wher | where
windowswer | windowsw | windowswer
wide | wid | wide
lazi | lazy | lazi
thi | thi | this
graduat | gradu | graduat
easi | easy | easi
grace | grac | grace
unstudi | unstudy | unstudi
animalmari | animalm | animalmari
befor | bef | befor
schoolground | schoolgrounds | schoolground
chiefthoroughfarenobodi | chiefthoroughfarenobody | chiefthoroughfarenobodi
ever | ev | ever
call | cal | call
prett

Ex 10 Print a table of two columns, simillar to the one above, that will compare the results of stemming and lemmatization. The head of the table will contain the values: "Snowball" and "WordNetLemmatizer". The table must contain only words that give different results in the process of stemming and lemmatization (for example, the word "running"). The table will contain the results for the first NW words from the text (the number of rows will obviously be less than NW, as not all words match the requirements). For example, NW=500. Try to print only distinct results inside the table (for example, if a word has two occurnces inside the text, and matches the requirments for appearing in the table, it should have only one corresponding row).

In [None]:
!pip install nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in no_dup_words]

diff_stem_lemma = [
    (stem, lemma)
    for stem, lemma in zip(stemmed_words['Snowball'], lemmatized_words)
    if stem != lemma
]

print('Snowball | WordNetLemmatizer')

for i, (stem, lemma) in enumerate(diff_stem_lemma):
    if i < 500:
        print(f'{stem} | {lemma}')
    else:
        break

Snowball | WordNetLemmatizer
hous | house
bondageia | bondageias
ownth | ownthe
weatherprophetsth | weatherprophetsthe
capeco | capecoated
mennonit | mennonite
thebeard | thebearded
withchor | withchoral
unanim | unanimity
thesumm | thesummer
parch | parching
intensealreadi | intensealready
dawn | dawned
wither | withered
peep | peeping
purpl | purple
andfragr | andfragrant
streetth | streetthe
oriol | oriole
was | wa
openinghi | openinghis
summerhous | summerhouse
hickorytre | hickorytree
southwark | southwarks
drone | droning
windowswer | windowswere
lazi | lazy
dwindl | dwindling
graduat | graduation
essayssway | essaysswaying
easi | easy
unstudi | unstudied
animalmari | animalmary
morningsess | morningsession
end | ended
turn | turned
befor | before
schoolground | schoolgrounds
littl | little
chiefthoroughfarenobodi | chiefthoroughfarenobody
call | called
pretti | pretty
serg | serge
lengthen | lengthened
ankl | ankle
mari | mary
wholli | wholly
ofth | ofthe
lith | lithe
stronglimb

In [None]:
from nltk import pos_tag
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

tagged_words = pos_tag(no_dup_words)

lemmatized_words = [wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos)) for word, pos in tagged_words]

diff_stem_lemma = [
    (stem, lemma)
    for stem, lemma in zip(stemmed_words['Snowball'], lemmatized_words)
    if stem != lemma
]

print('Snowball | WordNetLemmatizer')

for i, (stem, lemma) in enumerate(diff_stem_lemma):
    if i < 500:
        print(f'{stem} | {lemma}')
    else:
        break

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Snowball | WordNetLemmatizer
hous | house
bondageia | bondageias
were | be
ownth | ownthe
weatherprophetsth | weatherprophetsthe
capeco | capecoated
mennonit | mennonite
thebeard | thebearded
came | come
said | say
withchor | withchoral
unanim | unanimity
thesumm | thesummer
intensealreadi | intensealready
had | have
purpl | purple
andfragr | andfragrant
streetth | streetthe
oriol | oriole
was | be
openinghi | openinghis
summerhous | summerhouse
hickorytre | hickorytree
southwark | southwarks
drone | droning
windowswer | windowswere
lazi | lazy
begun | begin
dwindl | dwindle
graduat | graduation
essayssway | essaysswaying
easi | easy
unstudi | unstudied
animalmari | animalmary
morningsess | morningsession
befor | before
schoolground | schoolgrounds
littl | little
chiefthoroughfarenobodi | chiefthoroughfarenobody
pretti | pretty
serg | serge
been | be
ankl | ankle
mari | mary
wholli | wholly
ofth | ofthe
lith | lithe
stronglimb | stronglimbed
opencountri | opencountry
suffici | sufficie

Ex 11 Print the first N most frequent lemmas (after the removal of stopwords) together with their number of appearances.

In [None]:
lemma_frequency = Counter(lemmatized_words)

most_common_lemmas = lemma_frequency.most_common(10)

for lemma, count in most_common_lemmas:
    print(f"{lemma}: {count}")

be: 8
know: 6
grow: 6
go: 5
do: 5
show: 5
work: 5
take: 5
give: 5
bar: 5


Ex 12 Change all the numbers from lws into words. Print the number of changes, and also the portion of list that contains first N changes (for example N=10).

In [None]:
pip install inflect




In [None]:
import inflect

p = inflect.engine()

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

num_changes = 0

changes_indices = []

converted_lws = []
for i, item in enumerate(lws):
    if is_number(item):
        word = p.number_to_words(item)
        converted_lws.append(word)
        changes_indices.append(i)
        num_changes += 1
    else:
        converted_lws.append(item)

N = 2

if num_changes >= N:
    end_index = changes_indices[N-1] + 1
else:
    end_index = len(converted_lws)
print(f"Number of changes: {num_changes}")
print(f"List: {converted_lws[:end_index]}")


Number of changes: 24


Ex 13 Create a function that receives an integer N and a word W as parameter (it can also receive the list of words from the text). We want to print the concordance data for that word. This means printing the window of text (words on consecutive positions) of length N, that has the givend word W in the middle. For example, for the text ""I have two dogs and a cat. Do you have pets too? My cat likes to chase mice. My dogs like to chase my cat." and a window of length 3, the concordance data for the word "cat" would be ["dogs", "cat", "pets"] and ["pets","cat", "likes"] (we consider the text without stopwords and punctuation). However, as you can see, the window of text may contain words from different sentences. Create a second function that prints windows of texts that contain words only from the phrase containing word W. We want to print concordance data for all the inflexions of word W.

In [None]:
def concordance(N, W, words):
    W = wordnet_lemmatizer.lemmatize(W)
    for index, word in enumerate(words):
        if word == W:
            start = max(index - N, 0)
            end = min(index + N + 1, len(words))
            neig = words[start:end]
            concordance = ' '.join(neig)
            print(concordance)

concordance(1, "gives", lemmatized_words)

want give daughter
coal give look
senior give drudgery
ofgame give welldivide
costigankatie give parentage


In [None]:
def second_concordance(N, word, text):

    def split_into_sentences(text):
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s', text)
        return [sentence.strip() for sentence in sentences if sentence]
    def get_words(sentence, word, N):
        words = sentence.split()
        length = len(words)
        target_indices = [i for i, w in enumerate(words) if re.fullmatch(word, w, re.IGNORECASE)]
        pairs = []
        for index in target_indices:
            half_N = N // 2
            start = index - half_N
            end = index + half_N + 1
            if start >= 0 and end <= length:
                pairs.append(' '.join(words[start:end]))
        return pairs

    sentences = split_into_sentences(text)

    result = []
    for sentence in sentences:
        result.extend(get_words(sentence, word, N))

    return result

text = "I have two dogs and a cat. Do you have pets too? My cat likes to chase mice. My dogs like to chase my cat"
N = 3
word = "chase"
print(second_concordance(N, word, text))


['to chase mice.', 'to chase my']
