<a href="https://colab.research.google.com/github/0Augusto/Topicos_ComputacaoIII/blob/main/AS01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cells
A notebook is a list of cells. Cells contain either explanatory text or executable code and its output. Click a cell to select it.

## Code cells
Below is a **code cell**. Once the toolbar button indicates CONNECTED, click in the cell to select it and execute the contents in the following ways:

* Click the **Play icon** in the left gutter of the cell;
* Type **Cmd/Ctrl+Enter** to run the cell in place;
* Type **Shift+Enter** to run the cell and move focus to the next cell (adding one if none exists); or
* Type **Alt+Enter** to run the cell and insert a new code cell immediately below it.

There are additional options for running some or all cells in the **Runtime** menu.


In [19]:
#Normalizacao
import re
import unicodedata

def normalize_text(text):
    """Return a lowercase, accent-free, ASCII-alphanumeric version of *text*.

    Steps, in order:
      1. lowercase the text;
      2. decompose it (NFD) and drop every combining mark, so accented
         letters become their base letters;
      3. apply the regex substitutions listed below.

    The last substitution keeps only ASCII letters, digits and whitespace,
    so the '.', '-' and '$' characters tolerated by the step before it are
    removed in the end (a date such as 12/25/2020 comes out as 12252020).
    The 'USD' replacement is inserted after lowercasing and therefore stays
    uppercase.
    """
    decomposed = unicodedata.normalize('NFD', text.lower())
    result = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # (pattern, replacement) pairs; the order matters and is applied as listed
    substitutions = (
        (r'\b(\w+)\-(\w+)\b', r'\1\2'),  # join hyphenated words
        (r'\b(\d{1,2})/(\d{1,2})/(\d{2,4})\b', r'\1-\2-\3'),  # d/m/y -> d-m-y
        (r'\$', 'USD'),  # canonical currency symbol
        (r'[^\w\s\.\-\$]', ''),  # drop punctuation other than . - $
        (r'[^a-zA-Z0-9\s]', ''),  # drop anything that is not ASCII alphanumeric
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result

# Read the raw corpus and run it through the normalizer
with open('/content/sample_data/Shakespeare.txt', 'r') as source:
    text = source.read()
normalized_text = normalize_text(text)

# Persist the normalized text so the following cells can reload it
with open('/content/sample_data/Shakespeare_Normalized.txt', 'w') as target:
    target.write(normalized_text)


In [20]:
#Tokenização
import nltk
from nltk.tokenize import word_tokenize, TreebankWordTokenizer, WordPunctTokenizer, TweetTokenizer
from nltk.tokenize.mwe import MWETokenizer
from textblob import TextBlob
import spacy
from gensim.utils import tokenize as gensim_tokenize
from keras.preprocessing.text import Tokenizer

# Reload the normalized text produced by the normalization cell
with open('/content/sample_data/Shakespeare_Normalized.txt', 'r') as file:
    normalized_text = file.read()

# Every tokenizer below writes its tokens, space-separated, to
# /content/sample_data/Shakespeare_Normalized_TokenizedNN.txt

# 1. Whitespace tokenization
tokenized_text_01 = normalized_text.split()
with open('/content/sample_data/Shakespeare_Normalized_Tokenized01.txt', 'w') as file:
    file.write(' '.join(tokenized_text_01))

# 2. NLTK word tokenizer
tokenized_text_02 = word_tokenize(normalized_text)
with open('/content/sample_data/Shakespeare_Normalized_Tokenized02.txt', 'w') as file:
    file.write(' '.join(tokenized_text_02))

# 3. NLTK Treebank word tokenizer
treebank_tokenizer = TreebankWordTokenizer()
tokenized_text_03 = treebank_tokenizer.tokenize(normalized_text)
with open('/content/sample_data/Shakespeare_Normalized_Tokenized03.txt', 'w') as file:
    file.write(' '.join(tokenized_text_03))

# 4. NLTK word-punctuation tokenizer
word_punct_tokenizer = WordPunctTokenizer()
tokenized_text_04 = word_punct_tokenizer.tokenize(normalized_text)
with open('/content/sample_data/Shakespeare_Normalized_Tokenized04.txt', 'w') as file:
    file.write(' '.join(tokenized_text_04))

# 5. NLTK tweet tokenizer
tweet_tokenizer = TweetTokenizer()
tokenized_text_05 = tweet_tokenizer.tokenize(normalized_text)
with open('/content/sample_data/Shakespeare_Normalized_Tokenized05.txt', 'w') as file:
    file.write(' '.join(tokenized_text_05))

# 6. NLTK multi-word-expression tokenizer, applied to the whitespace tokens
mwe_tokenizer = MWETokenizer([('multi', 'word'), ('two', 'word')])
tokenized_text_06 = mwe_tokenizer.tokenize(normalized_text.split())
with open('/content/sample_data/Shakespeare_Normalized_Tokenized06.txt', 'w') as file:
    file.write(' '.join(tokenized_text_06))

# 7. TextBlob
# Written next to the other outputs; this step previously targeted
# /mnt/data, unlike every other file in the pipeline.
tokenized_text_07 = TextBlob(normalized_text).words
with open('/content/sample_data/Shakespeare_Normalized_Tokenized07.txt', 'w') as file:
    file.write(' '.join(tokenized_text_07))

# 8. spaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(normalized_text)
tokenized_text_08 = [token.text for token in doc]
with open('/content/sample_data/Shakespeare_Normalized_Tokenized08.txt', 'w') as file:
    file.write(' '.join(tokenized_text_08))

# 9. Gensim
tokenized_text_09 = list(gensim_tokenize(normalized_text))
with open('/content/sample_data/Shakespeare_Normalized_Tokenized09.txt', 'w') as file:
    file.write(' '.join(tokenized_text_09))

# 10. Keras Tokenizer: the output is the sequence of integer word indices,
# not the word strings, hence the str() conversion before joining.
# NOTE(review): the recorded output of this cell shows
# "ModuleNotFoundError: No module named 'keras.preprocessing.text'", so the
# Tokenizer import at the top of the cell must be adapted to the installed
# Keras version before any step of this cell can run.
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts([normalized_text])
tokenized_text_10 = keras_tokenizer.texts_to_sequences([normalized_text])[0]
with open('/content/sample_data/Shakespeare_Normalized_Tokenized10.txt', 'w') as file:
    file.write(' '.join(map(str, tokenized_text_10)))


ModuleNotFoundError: No module named 'keras.preprocessing.text'

In [None]:
#Remoção de Stop-words
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available
nltk.download('stopwords')

# Read back the tokens written by tokenizer 02 (NLTK word tokenizer)
with open('/content/sample_data/Shakespeare_Normalized_Tokenized02.txt', 'r') as source:
    tokenized_text = source.read().split()

# Keep only the tokens whose lowercase form is not an English stop word
stop_words = set(stopwords.words('english'))
filtered_text = [
    token
    for token in tokenized_text
    if token.lower() not in stop_words
]

# Store the surviving tokens, space-separated, for the next stage
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord.txt', 'w') as target:
    target.write(' '.join(filtered_text))


In [None]:
#Lematização
from nltk.stem import WordNetLemmatizer

# Make sure the WordNet data used by the lemmatizer is available
nltk.download('wordnet')

# Read back the stop-word-filtered tokens
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord.txt', 'r') as source:
    filtered_text = source.read().split()

# Lemmatize every token with the lemmatizer's default settings
lemmatizer = WordNetLemmatizer()
lemmatized_text = list(map(lemmatizer.lemmatize, filtered_text))

# Store the lemmas, space-separated, for the next stage
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'w') as target:
    target.write(' '.join(lemmatized_text))


In [None]:
#Stemming
from nltk.stem import PorterStemmer, SnowballStemmer

# Read back the lemmatized tokens
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt', 'r') as source:
    lemmatized_text = source.read().split()

# 1. Porter stemmer -> ..._Stemming01.txt
porter_stemmer = PorterStemmer()
stemmed_text_01 = list(map(porter_stemmer.stem, lemmatized_text))
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01.txt', 'w') as target:
    target.write(' '.join(stemmed_text_01))

# 2. Snowball (English) stemmer -> ..._Stemming02.txt
snowball_stemmer = SnowballStemmer("english")
stemmed_text_02 = list(map(snowball_stemmer.stem, lemmatized_text))
with open('/content/sample_data/Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02.txt', 'w') as target:
    target.write(' '.join(stemmed_text_02))


In [None]:
#Análise do Vocabulário
import csv
from collections import Counter

def analyze_vocabulary(text, output_file):
    """Write a per-token frequency table for *text* to a CSV file.

    Args:
        text: iterable of tokens (strings); each distinct token becomes a row.
        output_file: path of the CSV to create or overwrite.

    The CSV has the header ``Token,Occurrences,Length`` followed by one row
    per distinct token, in order of first appearance: the token, how many
    times it occurs in *text*, and its character length.
    """
    frequencies = Counter(text)
    with open(output_file, 'w', newline='') as out:
        sheet = csv.writer(out)
        sheet.writerow(['Token', 'Occurrences', 'Length'])
        sheet.writerows(
            (token, count, len(token)) for token, count in frequencies.items()
        )

# Write one vocabulary CSV per variant: lemmatized text, Porter-stemmed
# text and Snowball-stemmed text
for tokens, csv_path in (
    (lemmatized_text, '/content/sample_data/Shakespeare_Vocabulary_Lemmatized.csv'),
    (stemmed_text_01, '/content/sample_data/Shakespeare_Vocabulary_Porter.csv'),
    (stemmed_text_02, '/content/sample_data/Shakespeare_Vocabulary_Snowball.csv'),
):
    analyze_vocabulary(tokens, csv_path)


In [None]:
#Comparação do Vocabulário
def compare_vocabulary(lemmatized_file, porter_file, snowball_file, output_file):
    """Summarize three vocabulary CSVs into a plain-text report.

    Each input CSV must have ``Token``, ``Occurrences`` and ``Length``
    columns. For each one the report lists the vocabulary size (number of
    rows), the mean of ``Occurrences`` and the mean of ``Length``.

    Args:
        lemmatized_file: CSV for the lemmatized text.
        porter_file: CSV for the Porter-stemmed text.
        snowball_file: CSV for the Snowball-stemmed text.
        output_file: path of the report to create or overwrite.

    An input with no data rows is reported with size 0 and both averages
    0.00 instead of raising ZeroDivisionError.
    """
    sources = [
        ("Lemmatizer", lemmatized_file),
        ("Porter Stemmer", porter_file),
        ("Snowball Stemmer", snowball_file),
    ]
    # The report labels contain non-ASCII characters, so the encoding is
    # pinned rather than left to the platform default.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("Vocabulary Analysis\n\n")
        for method, filename in sources:
            with open(filename, 'r', newline='') as csvfile:
                # DictReader is a one-shot iterator: materialize the rows
                # once so both averages see all of them.
                rows = list(csv.DictReader(csvfile))

            vocabulary_size = len(rows)
            if vocabulary_size:
                avg_occurrences = sum(int(row['Occurrences']) for row in rows) / vocabulary_size
                avg_length = sum(int(row['Length']) for row in rows) / vocabulary_size
            else:
                avg_occurrences = avg_length = 0.0

            file.write(f"{method}:\n")
            file.write(f"Tamanho do Vocabulário: {vocabulary_size}\n")
            file.write(f"Média de Ocorrências: {avg_occurrences:.2f}\n")
            file.write(f"Tamanho Médio dos Tokens: {avg_length:.2f}\n\n")

# Build the text report comparing the three vocabulary CSVs
compare_vocabulary(
    lemmatized_file='/content/sample_data/Shakespeare_Vocabulary_Lemmatized.csv',
    porter_file='/content/sample_data/Shakespeare_Vocabulary_Porter.csv',
    snowball_file='/content/sample_data/Shakespeare_Vocabulary_Snowball.csv',
    output_file='/content/sample_data/Shakespeare_Vocabulary_Analysis.txt',
)
