<a href="https://colab.research.google.com/github/Cachipa/Topicos-3/blob/main/AS01_Pr%C3%A9_processamentoTextual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Normalização**

In [None]:
import re
import unicodedata
import gdown
# Download Shakespeare.txt from Google Drive into the working directory.
# NOTE(review): the URL is a share-style ('/file/d/<id>/view') link; fuzzy=True
# is presumably what lets gdown extract the file id from it -- confirm against
# the gdown documentation.
url = 'https://drive.google.com/file/d/1lBzL2rDT7ksw80ibzkW_1q5XcpdHJcC0/view?usp=drive_link'
output = 'Shakespeare.txt'
gdown.download( url, output, fuzzy=True)

# Load the downloaded Shakespeare.txt into memory as one string.
with open('Shakespeare.txt', 'r', encoding='utf-8') as file:
    shakespeare_text = file.read()

# Lower case reduction: fold the whole corpus to lower case so that later
# steps see a single casing for every word.
shakespeare_text = shakespeare_text.lower()

# Accent and diacritic removal
def remove_accents(input_str):
    """Return *input_str* with accents and other combining marks removed.

    The text is first decomposed with Unicode NFKD normalization, which
    splits an accented character into its base character followed by
    combining marks (and also expands compatibility characters such as
    ligatures). Every character whose combining class is non-zero is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_characters = filter(
        lambda char: unicodedata.combining(char) == 0,
        decomposed,
    )
    return "".join(base_characters)

# Strip accents from the lower-cased corpus before handling punctuation.
shakespeare_text = remove_accents(shakespeare_text)

# Canonicalizing of acronyms, currency, date and hyphenated words
# Punctuation removal (except currency and date).
# Special characters removal
#
# The patterns are written as raw strings so that regex escapes such as
# "\d" and "\." reach the re module untouched; in a normal string literal
# they are invalid Python escape sequences and trigger a warning on
# current interpreters.

# 1) Remove every "." unless it is immediately followed by a digit, or by
#    "$" and a character that is neither "." nor a space.
nospecial_text = re.sub(r'\.(?!(\$[^. ])|\d)', '', shakespeare_text)

# 2) Remove the listed punctuation characters unless a digit sits directly
#    before or after them (keeps things like "1,000", "3.5" or "10-12").
nospecial_text = re.sub(r"(?<!\d)[.,;!?'\(\)#:-](?!\d)", '', nospecial_text)

# 3) Collapse the runs of spaces left behind by the removals above.
nospecial_text = re.sub(' +', ' ', nospecial_text)

# Persist the normalized corpus for the tokenization step.
with open('Shakespeare_Normalized.txt', 'w', encoding='utf-8') as file:
    file.write(nospecial_text)


Downloading...
From: https://drive.google.com/uc?id=1lBzL2rDT7ksw80ibzkW_1q5XcpdHJcC0
To: /content/Shakespeare.txt
100%|██████████| 100k/100k [00:00<00:00, 26.6MB/s]


**Tokenização**

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer, MWETokenizer
from textblob import TextBlob
import spacy
import gensim
from keras.preprocessing.text import text_to_word_sequence

# Read back the normalized corpus written to Shakespeare_Normalized.txt.
with open('Shakespeare_Normalized.txt', 'r', encoding='utf-8') as file:
    normalized_text = file.read()

# The same text is tokenized in ten different ways below. Despite the
# "*_tokenizer" names, each variable holds the resulting sequence of tokens,
# not a tokenizer object.

# 01. White Space Tokenization
# str.split() with no argument splits on any run of whitespace.
tokenized_whitespace = normalized_text.split()

# 02. NLTK: Word Tokenizer
nltk_tokenizer = nltk.word_tokenize(normalized_text)

# 03. NLTK: Tree Bank Tokenizer
treebank_tokenizer = TreebankWordTokenizer().tokenize(normalized_text)

# 04. NLTK: Word Punctuation Tokenizer
word_punct_tokenizer = wordpunct_tokenize(normalized_text)

# 05. NLTK: Tweet Tokenizer
tweet_tokenizer = TweetTokenizer().tokenize(normalized_text)

# 06. NLTK: MWE Tokenizer
# The tokenizer is built without any multi-word expressions and is fed the
# whitespace-split token list rather than the raw string.
mwe_tokenizer = MWETokenizer().tokenize(normalized_text.split())

# 07. TextBlob Word Tokenizer
textblob_tokenizer = TextBlob(normalized_text).words

# 08. spaCy Tokenizer
# Loads the "en_core_web_sm" model, processes the text with it and keeps only
# the text of each resulting token.
nlp = spacy.load("en_core_web_sm")
doc = nlp(normalized_text)
spacy_tokenizer = [token.text for token in doc]

# 09. Gensim Word Tokenizer
# simple_tokenize is consumed through list() so the tokens are materialized.
gensim_tokenizer = list(gensim.utils.simple_tokenize(normalized_text))

# 10. Keras Tokenization
keras_tokenization = text_to_word_sequence(normalized_text)

# Save every tokenizer's output to its own file. The list order follows the
# numbered methods (01..10), so the N-th entry is written to
# Shakespeare_Normalized_TokenizedNN.txt with tokens separated by one space.
tokenization_methods = [
    tokenized_whitespace,
    nltk_tokenizer,
    treebank_tokenizer,
    word_punct_tokenizer,
    tweet_tokenizer,
    mwe_tokenizer,
    textblob_tokenizer,
    spacy_tokenizer,
    gensim_tokenizer,
    keras_tokenization,
]
file_names = [
    "Shakespeare_Normalized_Tokenized{:02d}.txt".format(number)
    for number, _ in enumerate(tokenization_methods, start=1)
]

for file_name, method in zip(file_names, tokenization_methods):
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(' '.join(method))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Stop-words Removal**

In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus used below.
nltk.download('stopwords')

# Start from tokenization output 02 (the NLTK word tokenizer result). The file
# stores the tokens separated by spaces, so split() recovers the token list.
with open('Shakespeare_Normalized_Tokenized02.txt', 'r', encoding='utf-8') as file:
    tokenized_text = file.read().split()

# Remove stop-words
# The English stop-word list is turned into a set so each membership test in
# the comprehension is a hash lookup; tokens are lower-cased for the comparison
# only, and the surviving tokens are kept as they were read.
stop_words = set(stopwords.words('english'))
filtered_text = [word for word in tokenized_text if word.lower() not in stop_words]

# Persist the filtered tokens, space-separated, for the lemmatization step.
with open('Shakespeare_Normalized_Tokenized_StopWord.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(filtered_text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Text Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet data that WordNetLemmatizer relies on.
nltk.download('wordnet')

# Load the stop-word-filtered tokens (space-separated on disk).
with open('Shakespeare_Normalized_Tokenized_StopWord.txt', 'r', encoding='utf-8') as file:
    stopword_removed_text = file.read().split()

# Lemmatization
# lemmatize() is called without a part-of-speech argument, so every token is
# lemmatized under the library's default part of speech.
# NOTE(review): presumably that default is "noun" -- confirm in the NLTK docs.
lemmatizer = WordNetLemmatizer()
lemmatized_text = [lemmatizer.lemmatize(word) for word in stopword_removed_text]

# Persist the lemmatized tokens, space-separated, for the stemming step.
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(lemmatized_text))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Text Stemming**

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer

# Load the lemmatized tokens (space-separated on disk); both stemmers below
# are applied to this same list so their outputs can be compared.
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'r', encoding='utf-8') as file:
    lemmatized_text = file.read().split()

# Apply stemming with the Porter stemmer
porter_stemmer = PorterStemmer()
stemmed_text_porter = [porter_stemmer.stem(word) for word in lemmatized_text]

# Apply stemming with the Snowball stemmer (English rules)
snowball_stemmer = SnowballStemmer('english')
stemmed_text_snowball = [snowball_stemmer.stem(word) for word in lemmatized_text]

# Save the results to separate files: Stemming01 = Porter, Stemming02 = Snowball
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(stemmed_text_porter))
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt', 'w', encoding='utf-8') as file:
    file.write(' '.join(stemmed_text_snowball))

**Análise do Vocabulário**

In [None]:
import csv
from collections import Counter

def analyze_vocabulary(tokens, filename):
    """Write a per-token frequency table for *tokens* to a CSV file.

    Each distinct token becomes one row, in order of first appearance, with
    three columns: 'Token' (the token itself), 'Ocorrências' (how many times
    it occurs in *tokens*) and 'Tamanho do Token' (its length in characters).

    Args:
        tokens: Iterable of token strings to count.
        filename: Path of the CSV file to create or overwrite.
    """
    frequencies = Counter(tokens)

    # Build every row before touching the file system.
    rows = [
        {'Token': entry, 'Ocorrências': occurrences, 'Tamanho do Token': len(entry)}
        for entry, occurrences in frequencies.items()
    ]

    # Write the results to a CSV file
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        table = csv.DictWriter(
            csvfile,
            fieldnames=['Token', 'Ocorrências', 'Tamanho do Token'],
        )
        table.writeheader()
        table.writerows(rows)

# Load the stemmed (Stemming01 = Porter, Stemming02 = Snowball) and the
# lemmatized token lists; each file stores its tokens separated by spaces.
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt', 'r', encoding='utf-8') as file:
    porter_stemmed_text = file.read().split()
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt', 'r', encoding='utf-8') as file:
    snowball_stemmed_text = file.read().split()
with open('Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'r', encoding='utf-8') as file:
    lemmatized_text = file.read().split()

# Run the vocabulary analysis: one CSV frequency table per token list.
analyze_vocabulary(porter_stemmed_text, 'Shakespeare_Vocabulary_Porter.csv')
analyze_vocabulary(snowball_stemmed_text, 'Shakespeare_Vocabulary_Snowball.csv')
analyze_vocabulary(lemmatized_text, 'Shakespeare_Vocabulary_Lemmatized.csv')

def analyze_csv(filename):
    """Summarize a vocabulary CSV with one row per distinct token.

    The file must have a header row containing the columns 'Ocorrências'
    (occurrence count) and 'Tamanho do Token' (token length), both holding
    integers.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(vocabulary_size, avg_occurrences, avg_token_length)``:
        the number of data rows, the mean occurrence count per row and the
        mean token length per row. A file with no data rows yields
        ``(0, 0.0, 0.0)`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a count or length cell is not an integer.
    """
    vocabulary_size = 0
    total_occurrences = 0
    total_length = 0

    # Accumulate running totals while streaming the rows; the individual
    # values are never needed again, so no per-column lists are kept.
    with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            vocabulary_size += 1
            total_occurrences += int(row['Ocorrências'])
            total_length += int(row['Tamanho do Token'])

    # An empty vocabulary has no meaningful averages; report zeros rather
    # than dividing by zero.
    if vocabulary_size == 0:
        return 0, 0.0, 0.0

    avg_occurrences = total_occurrences / vocabulary_size
    avg_token_length = total_length / vocabulary_size
    return vocabulary_size, avg_occurrences, avg_token_length

# Summarize each vocabulary table: (vocabulary size, mean occurrences per
# token, mean token length).
porter_total_tokens, porter_avg_occurrences, porter_avg_length = analyze_csv('Shakespeare_Vocabulary_Porter.csv')
snowball_total_tokens, snowball_avg_occurrences, snowball_avg_length = analyze_csv('Shakespeare_Vocabulary_Snowball.csv')
lemmatized_total_tokens, lemmatized_avg_occurrences, lemmatized_avg_length = analyze_csv('Shakespeare_Vocabulary_Lemmatized.csv')

# One entry per report section: heading line followed by the three statistics.
report_sections = [
    ("Método de Stemming: Porter\n",
     porter_total_tokens, porter_avg_occurrences, porter_avg_length),
    ("Método de Stemming: Snowball\n",
     snowball_total_tokens, snowball_avg_occurrences, snowball_avg_length),
    ("Método de Lematização\n",
     lemmatized_total_tokens, lemmatized_avg_occurrences, lemmatized_avg_length),
]

# Write the final comparison file. Every section ends with a newline and the
# sections are joined by one more newline, giving a blank line between
# sections and a single trailing newline at the end of the file.
with open('Shakespeare_Vocabulary_Analysis.txt', 'w', encoding='utf-8') as analysis_file:
    analysis_file.write("Comparação de Análise de Vocabulário\n\n")
    analysis_file.write("\n".join(
        heading
        + f"Tamanho do Vocabulário: {vocabulary_size}\n"
        + f"Número Médio de Ocorrências: {mean_occurrences:.2f}\n"
        + f"Tamanho Médio dos Tokens: {mean_length:.2f}\n"
        for heading, vocabulary_size, mean_occurrences, mean_length in report_sections
    ))