In [5]:
import re
from collections import Counter

# Load the book

In [6]:
# Read the whole book into memory as a single UTF-8 decoded string.
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book_content = file.read()

# The most used words (excluding articles and other stopwords)

In [8]:
# Tokenise on runs of ASCII letters. The text is lower-cased first, so the
# pattern needs no re.IGNORECASE flag (on str patterns that flag would also
# let [a-z] match a few non-ASCII letters such as U+0131 and U+017F).
# Note: apostrophes are not part of the pattern, so "don't" yields "don", "t".
pattern = r"[a-z]+"
all_words = re.findall(pattern, book_content.lower())
word_counts = Counter(all_words)
# The 10 most common words, as (word, count) pairs.
most_common = word_counts.most_common(10)
# Every distinct word with its count, most frequent first.
# (Name is misspelt but kept as-is because later cells reference it.)
words_and_their_frequecy = word_counts.most_common()

In [10]:
# Display the ten most frequent words (stopwords still included at this point).
most_common

[('the', 5346),
 ('and', 2795),
 ('i', 2729),
 ('to', 2400),
 ('of', 2060),
 ('a', 1566),
 ('was', 1430),
 ('in', 1419),
 ('we', 1226),
 ('my', 1169)]

In [11]:
# Record the Python interpreter version this notebook was run with.
from  platform import python_version
python_version()

'3.10.6'

In [2]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK "stopwords" corpus so stopwords.words() can be used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
# NLTK's list of English stopwords, used to filter the word counts.
english_stopwords = stopwords.words("english")

In [9]:
# Number of distinct words found in the book.
len(words_and_their_frequecy)

6992

In [11]:
# Preview the first four (word, count) pairs.
words_and_their_frequecy[0:4]

[('the', 5346), ('and', 2795), ('i', 2729), ('to', 2400)]

In [21]:
def frequency(item):
    """Return the count from a (word, count) pair, for use as a sort key."""
    return item[1]


# Order the pairs by count, highest first, then preview the first four.
# The list came from Counter.most_common(), which already orders by count
# descending, so this sort is expected to leave the order unchanged.
words_and_their_frequecy = sorted(words_and_their_frequecy, key=frequency, reverse=True)
words_and_their_frequecy[:4]

[('the', 5346), ('and', 2795), ('i', 2729), ('to', 2400)]

In [24]:
## Explicit-loop version of the stopword filter (commented out; the list comprehension in the next cell is what runs)
# filtered_word = []
# for word , count in words_and_their_frequecy:
#     if word not in english_stopwords:
#         filtered_word.append((word, count))

In [29]:
# Drop NLTK's English stopwords, preserving the (word, count) order.
# A set makes each membership test O(1) instead of scanning the whole list.
stopword_set = set(english_stopwords)
filtered_words = [
    (word, count)
    for word, count in words_and_their_frequecy
    if word not in stopword_set
]

In [32]:
# Number of distinct words left after removing stopwords.
# (Previously referenced `filtered_word`, a name no executed cell defines.)
len(filtered_words)

6849

In [34]:
# The five most frequent non-stopword words.
# (Previously referenced `filtered_word`, a name no executed cell defines.)
filtered_words[:5]

[('would', 575), ('us', 519), ('said', 292), ('roberto', 284), ('could', 252)]

## Sentiment analysis: Which chapters are the most positive and the most negative?

In [41]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Download the VADER lexicon used by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


True

In [42]:
# Single shared analyzer instance, reused by the sentiment cells below.
analyzer = SentimentIntensityAnalyzer()
analyzer

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x2826241b310>

In [44]:
# Exploration only: confirm the analyzer's class.
type(analyzer)

nltk.sentiment.vader.SentimentIntensityAnalyzer

In [43]:
# Exploration only: list the analyzer's attributes and methods.
dir(analyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_amplify_ep',
 '_amplify_qm',
 '_but_check',
 '_idioms_check',
 '_least_check',
 '_never_check',
 '_punctuation_emphasis',
 '_sift_sentiment_scores',
 'constants',
 'lexicon',
 'lexicon_file',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [45]:
# Exploration only: show the built-in help for polarity_scores.
help(analyzer.polarity_scores)

Help on method polarity_scores in module nltk.sentiment.vader:

polarity_scores(text) method of nltk.sentiment.vader.SentimentIntensityAnalyzer instance
    Return a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative
    valence.
    
    :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
        are interested in processing the text in the hashtags too, then we recommend
        preprocessing your data to remove the #, after which the hashtag text may be
        matched as if it was a normal word in the sentence.



In [72]:
# Fixed sample text so this cell runs unattended (Restart & Run All) and the
# scores below are reproducible; it was previously typed in via input().
a = "I love dogs, I love everyone, I am happy and rejoicing. I do not love cats."

 I love dogs, I love everyone, I am happy and rejoicing. I do not love cats.


In [73]:
# Score the sample text; the result is a dict with 'neg', 'neu', 'pos' and
# 'compound' entries (see the output below).
scores = analyzer.polarity_scores(a)
scores

{'neg': 0.0, 'neu': 0.258, 'pos': 0.742, 'compound': 0.9686}

In [101]:
def text_analyzer(text, sentiment_analyzer=None):
    """Label a text as positive, negative or neutral from its polarity scores.

    The label is chosen by comparing the ``"pos"`` and ``"neg"`` entries of
    the score mapping; the ``"compound"`` entry is included in the returned
    string but is not used for the decision.

    Parameters
    ----------
    text : str
        The text to score (a whole chapter in this notebook).
    sentiment_analyzer : optional
        Any object with a ``polarity_scores(text)`` method returning a mapping
        that has ``"pos"`` and ``"neg"`` keys. When omitted, the
        notebook-level ``analyzer`` is used.

    Returns
    -------
    str
        ``"It is a <label> chapter, <scores>"``, where ``<label>`` is
        ``positive``, ``negative`` or ``neutral`` and ``<scores>`` is the
        score mapping as returned by the analyzer.
    """
    # Fall back to the notebook-level analyzer only when none is injected, so
    # the function can also be exercised without that global being defined.
    scorer = analyzer if sentiment_analyzer is None else sentiment_analyzer
    scores = scorer.polarity_scores(text)

    if scores["pos"] > scores["neg"]:
        label = "positive"
    elif scores["pos"] < scores["neg"]:
        label = "negative"
    else:
        label = "neutral"
    return f"It is a {label} chapter, {scores}"

# Chapter sentiment analysis

In [89]:
# Split the book text wherever a "Chapter <number>" heading occurs.
patterns = r"Chapter [0-9]+"
chapter_heading = re.compile(patterns)
chapters_paragraph = chapter_heading.split(book_content)

In [90]:
# Number of pieces produced by the split (includes the text before the first heading).
len(chapters_paragraph)

11

In [91]:
# Drop the first piece, i.e. whatever precedes the first "Chapter N" heading,
# so that each remaining entry is the text following one heading.
all_chapters_paragraph = chapters_paragraph[1:]

In [92]:
# Number of chapter texts to analyse.
len(all_chapters_paragraph)

10

In [103]:
# Print one sentiment line per chapter, numbering the chapters from 1.
for chapter_number, chapter_text in enumerate(all_chapters_paragraph, start=1):
    verdict = text_analyzer(chapter_text)
    print(f"chapter {chapter_number}, {verdict}")

chapter 1, It is a positive chapter, {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
chapter 2, It is a positive chapter, {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
chapter 3, It is a negative chapter, {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
chapter 4, It is a negative chapter, {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
chapter 5, It is a positive chapter, {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
chapter 6, It is a negative chapter, {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
chapter 7, It is a negative chapter, {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
chapter 8, It is a negative chapter, {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
chapter 9, It is a negative chapter, {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
chapter 10, It is a positive chapter, {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
