## Loading the book

In [1]:
# Read the whole text file into memory as one string; the handle is closed
# automatically when the `with` block exits.
with open("miracle_in_the_andes.txt", mode="r") as book_file:
    book = book_file.read()

## Find the most used words

In [65]:
import re
# Tokenise the lower-cased book into runs of ASCII letters; anything else
# (digits, punctuation, whitespace) acts as a separator.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())

# Tally how many times each word occurs.
word_dict = {}
for word in findings:
    word_dict[word] = word_dict.get(word, 0) + 1

# Build (count, word) pairs and order them from most to least frequent.
# Tuples compare by count first, so equal counts fall back to reverse
# alphabetical order of the word.
sorted_word_list = sorted(
    ((count, token) for token, count in word_dict.items()),
    reverse=True,
)

# Most frequent word and its count.
sorted_word_list[0]

(5346, 'the')

## Find the most used words that are not stopwords

In [56]:
import nltk
from nltk.corpus import stopwords
# NOTE: You may need to uncomment this line if you get a lookup error
# nltk.download('stopwords')

# Stopwords are very common function words (pronouns, articles, auxiliaries
# such as "as", "am", "the") that say little about what the text is about.
en_stopwords = stopwords.words("english")
# A set copy gives constant-time membership tests; en_stopwords is left unchanged.
en_stopword_set = set(en_stopwords)

# Keep only the words that are not stopwords. The original author noted that
# filtered_word_dict ends up with 143 fewer words than word_dict for this book.
filtered_word_dict = {
    word: count
    for word, count in word_dict.items()
    if word not in en_stopword_set
}

# (count, word) pairs ordered from most to least frequent.
# This sorts the pairs built from filtered_word_dict; the previous code passed
# `sorted_filtered_word_dict` to sorted(), a name that is never defined, which
# raises NameError when the notebook is run on a fresh kernel.
sorted_filtered_word_list = sorted(
    ((count, word) for word, count in filtered_word_dict.items()),
    reverse=True,
)

print(f"The most used non-article word is \"{sorted_filtered_word_list[0][1]}\". It is used {sorted_filtered_word_list[0][0]} times.")

The most used non-article word is "would". It is used 575 times.


## Find the most positive and most negative chapters

In [189]:
import re
from nltk.sentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# analyzer.polarity_scores(text) returns a dictionary estimating how negative,
# neutral and positive the input is, plus an overall "compound" value, e.g.
# {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
# compound ranges from -1 to +1: above 0 leans positive, below 0 leans negative.

# Split the book on its "Chapter <number>" headings. The first piece is dropped
# because it is just the (blank) text that precedes the first heading.
pattern = re.compile("Chapter [0-9]+")
chapters = pattern.split(book)[1:]

# Score every chapter; chapters are labelled by their position, starting at 1.
chapter_scores = {
    f"Chapter {number}": analyzer.polarity_scores(text)
    for number, text in enumerate(chapters, start=1)
}

# Running "best so far" records, seeded with all-zero scores so that only a
# chapter with a strictly greater pos / neg share replaces the placeholder.
score_fields = ("neg", "neu", "pos", "compound")
most_pos_chap = {"name": "chapter_name", "score": dict.fromkeys(score_fields, 0)}
most_neg_chap = {"name": "chapter_name", "score": dict.fromkeys(score_fields, 0)}

for chapter_name, scores in chapter_scores.items():
    # Highest share of positive text wins; on a tie the earlier chapter is kept.
    if scores["pos"] > most_pos_chap["score"]["pos"]:
        most_pos_chap.update(name=chapter_name, score=scores)

    # Highest share of negative text wins; on a tie the earlier chapter is kept.
    if scores["neg"] > most_neg_chap["score"]["neg"]:
        most_neg_chap.update(name=chapter_name, score=scores)

print("Most Positive: ", most_pos_chap)  # Chapter 10
print("Most Negitive: ", most_neg_chap)  # Chapter 3

Most Positive:  {'name': 'Chapter 10', 'score': {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}}
Most Negitive:  {'name': 'Chapter 3', 'score': {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}}


## Find the mood of each chapter (positive or negative)

In [194]:
# Label each chapter by comparing its positive share with its negative share.
# A chapter is "positive" only when pos is strictly greater than neg, so a tie
# is labelled "negative".
new_chapter_scores = {
    chapter_name: ("positive" if scores["pos"] > scores["neg"] else "negative")
    for chapter_name, scores in chapter_scores.items()
}

new_chapter_scores

{'Chapter 1': 'positive',
 'Chapter 2': 'positive',
 'Chapter 3': 'negative',
 'Chapter 4': 'negative',
 'Chapter 5': 'positive',
 'Chapter 6': 'negative',
 'Chapter 7': 'negative',
 'Chapter 8': 'negative',
 'Chapter 9': 'negative',
 'Chapter 10': 'positive'}