# Work with Nature Language Programming Book Analysis

## Load the book

In [1]:
from glob import glob
filepath: str = glob('*.txt')[0]
with open(filepath, encoding='utf-8') as output:
    book: str = output.read()

## The most used words(non-articles)

In [7]:
# Primeiro temos de descobrir as palavras mais usadas com articles, como fizemos no outro ficheiro:
import re
pattern = re.compile('[a-zA-Z]+')
findings: list = re.findall(pattern, book.lower())
d: dict = {}
for word in findings:
    if word in d:
        d[word] += 1
    else:
        d[word] = 1
d_list: list = sorted([(value, key) for key, value in d.items()], reverse=True)

In [11]:
# from platform import python_version
# python_version()

In [14]:
# !pip3.12 install nltk



In [25]:
# Vamos importar nlp library, para trabalhar com natural language programming
# Sobre o módulo nltk:
# O Natural Language Toolkit (nltk) é uma biblioteca de código aberto e popular para natural language Processing (PLN) em Python. Ele fornece uma ampla gama de ferramentas para analisar,
# entender e gerar linguagem humana.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cmmon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
filtered_words: list = []
for count, word in d_list:
    if word not in english_stopwords:
        filtered_words.append((count, word))
filtered_words[:5]

[(575, 'would'), (519, 'us'), (292, 'said'), (284, 'roberto'), (252, 'could')]

## Sentimental Analysis

### An example

In [37]:
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\cmmon\AppData\Roaming\nltk_data...


In [45]:
analyzer = SentimentIntensityAnalyzer()
# dir(analyzer)

In [46]:
analyzer.polarity_scores('Hey, look how beautiful the trees are.')

{'neg': 0.0, 'neu': 0.606, 'pos': 0.394, 'compound': 0.5994}

In [43]:
analyzer.polarity_scores('Hey, look how beautiful the trees are.I love them')

{'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'compound': 0.8442}

In [58]:
analyzer.polarity_scores('Hey, look how beautiful the trees are.I love them. I really love them. Its look like heaven')

{'neg': 0.0, 'neu': 0.378, 'pos': 0.622, 'compound': 0.959}

In [59]:
scrore = analyzer.polarity_scores('Hey, look how bad the trees are.I hate them. I really hate them. Its look like hell')

In [60]:
if scores['pos'] > scores['neg']:
    print('It is a positive text')
else:
    print('It is a negative text')

It is a positive text


In [61]:
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

### What is the most positive chapter and the most negative chapter

In [68]:
import re

pattern = re.compile('Chapter [0-9]+')
chapters = re.split(pattern, book)[1:]
# vai criar uma lista, onde cada elemento é um capítulo
analyzer.polarity_scores(book)
positive = {}
for number, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print(number + 1, scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
