### Load the book

In [1]:
# Read the whole book into memory as one string; text-read mode is the default
# for open(), and the context manager closes the file once the read is done.
with open("miracle_in_the_andes.txt", encoding="utf8") as file:
    book = file.read()

### How many chapters are there in the book?

##### With string method

In [2]:
# Plain substring count: not accurate, because it also catches cases where the word is used in the text
book.count("Chapter")

11

##### With regex

In [3]:
import re

In [4]:
# [0-9] matches exactly one digit, so a two-digit heading such as "Chapter 10" is only captured as "Chapter 1"
pattern = re.compile("Chapter [0-9]")

In [5]:
# List every non-overlapping match of the compiled pattern in the book
pattern.findall(book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 1']

In [6]:
# The + quantifier means "one or more digits", so multi-digit chapter numbers are captured in full
pattern = re.compile("Chapter [0-9]+")

In [7]:
# Collect all chapter headings matched by the compiled pattern
findings = pattern.findall(book)

In [8]:
# Display the list of matched chapter headings
findings

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [9]:
# Number of matched headings = number of chapters
len(findings)

10

### Which are the sentences where "love" is used?

In [10]:
# Pattern anatomy:
#   [A-Z]    exactly one capital letter (an explicit {1} quantifier would be redundant)
#   [^.]*    zero or more characters that are not a period; this includes newlines
#   [^a-zA-Z]love[^a-zA-Z]
#            "love" with a non-letter on each side, i.e. used as a whole word
#   \.       a literal period closing the sentence; an unescaped "." would match
#            any character instead of just the period
# Quantifier reminder: * is 0 or more times, + is 1 or more times, {1} is exactly one time.
# Only "." is treated as a sentence end here; "?" and "!" are not.
pattern = re.compile(r"[A-Z][^.]*[^a-zA-Z]love[^a-zA-Z][^.]*\.")
findings = re.findall(pattern, book)
len(findings)

67

### What are the most used words?

In [11]:
# A word is any unbroken run of ASCII letters. The text is lower-cased before
# matching so that capitalized and lower-case spellings count as the same word.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
len(findings)

86798

In [12]:
# Build a dict that maps each word to the number of times it appears in `findings`.
# dict.get supplies 0 the first time a word is seen, so no membership test is needed.

dictionary = {}
for word in findings:
    dictionary[word] = dictionary.get(word, 0) + 1

In [13]:
# Flip every (word, count) pair into (count, word) so that tuples compare by count first

dictionary_list = [(count, token) for token, count in dictionary.items()]

# Sort in descending order so the most frequent words come first, then keep the top 10

sorted(dictionary_list, reverse=True)[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

### Extract paragraphs where "love" is used

In [28]:
# Regex pattern using lookbehind and lookahead:
# (?<=\n\n)        - Positive lookbehind: asserts that the match is preceded by two newline characters,
#                    indicating the start of a new paragraph.
# (.*?\blove\b.*?) - Captures text that contains the word "love" as a whole word.
#                    No re.DOTALL flag is passed, so "." does NOT match newlines and the
#                    captured text always stays on a single line.
# (?=\n\n)         - Positive lookahead: asserts that the match is followed by two newline characters,
#                    indicating the end of a paragraph.
# NOTE(review): this presumably relies on every paragraph being stored on one line - confirm against the text file.
# A paragraph at the very start of the text cannot match, because nothing precedes it for the lookbehind.
pattern = re.compile(r"(?<=\n\n)(.*?\blove\b.*?)(?=\n\n)")
findings = re.findall(pattern, book)
len(findings)
#findings

40

### Extract chapter titles

In [35]:
# "." matches any single character except a newline and "+" repeats it one or more times,
# so the group (.+) captures the first line that follows the blank line after each "Chapter N" heading.

pattern = re.compile(r"Chapter [0-9]+\n\n(.+)")
findings = pattern.findall(book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

### Function that counts the occurrences of any word

In [49]:
def finder(word, text=None):
    """Print how many times ``word`` occurs as a whole word, ignoring case.

    Args:
        word: The word to look for. It is treated as literal text, so regex
            metacharacters inside it carry no special meaning.
        text: The text to search. When omitted, the notebook-level ``book``
            string is searched.

    Returns:
        None. The number of matches is printed, or a message saying that
        the word was not found.
    """
    if text is None:
        text = book
    # The text is lower-cased before matching, so the word has to be lower-cased
    # too; otherwise a capitalized query such as "Love" could never match.
    # re.escape stops characters like "." or "(" from being read as regex syntax.
    pattern = re.compile(r"\b" + re.escape(word.lower()) + r"\b")
    findings = pattern.findall(text.lower())
    if findings:
        print(len(findings))
    else:
        print(f'The book doesn\'t contain the word "{word}"')

# An f-string + raw-string combo works for building the pattern as well:
# pattern = re.compile(fr'\b{re.escape(word.lower())}\b')

### Call the function

In [47]:
# Count the whole-word occurrences of "love" in the book
finder("love")

83


In [48]:
# A word with no match makes finder print its "doesn't contain" message instead of a count
finder("hate")

The book doesn't contain the word "hate"
