## Install Libraries

- `nltk`: For natural language processing tasks, such as tokenization and stopword removal.
- `re`: For cleaning text using regular expressions.
- `string`: For the `string.punctuation` constant used when stripping punctuation.
- `collections.Counter`: For counting word frequencies.

In [34]:
import re
import string
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Ensure the necessary NLTK resources are downloaded:
#   - 'punkt'     : tokenizer data for word_tokenize / sent_tokenize
#   - 'stopwords' : the word lists read via stopwords.words('english')
# Packages that are already present are reported as up-to-date by NLTK.
# NOTE(review): newer NLTK releases may also need 'punkt_tab' — confirm
# against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/bampatra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bampatra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Cleaning the text

1. clean_text function:
Converts text to lowercase, removes punctuation, numbers, extra spaces, and stopwords, then joins the cleaned words back into a string.

2. tokenize_text_into_sentences_and_words function:
Splits text into sentences, then tokenizes each sentence into words.

3. Main workflow:
Reads alice29.txt, cleans the text, and writes the cleaned text to cleaned.txt.

In [35]:
def clean_text(text):
    """Normalize raw text and return it as one space-separated string.

    Steps, in order: lowercase, replace ASCII punctuation with spaces,
    remove digits, collapse whitespace, tokenize with NLTK, and drop
    English stopwords.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Lowercase first so tokens are compared to the stopword list in lowercase.
    text = text.lower()

    # Replace each punctuation character with a space rather than deleting it.
    # Deleting would fuse words that are separated only by punctuation
    # (e.g. "rabbit--hole" -> "rabbithole").
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Collapse runs of whitespace left behind by the steps above
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text (split into words)
    tokens = word_tokenize(text)

    # Remove stopwords; a set gives constant-time membership tests
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [word for word in tokens if word not in stop_words]

    # Join the cleaned tokens back into a string
    return ' '.join(cleaned_tokens)

def tokenize_text_into_sentences_and_words(text):
    """Split text into sentences and each sentence into word tokens.

    Args:
        text: The text to tokenize.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``;
        each entry is the list of tokens ``word_tokenize`` produced for
        that sentence.
    """
    tokenized = []
    for current_sentence in sent_tokenize(text):
        # One inner list of word tokens per sentence, in document order
        tokenized.append(word_tokenize(current_sentence))
    return tokenized

# Read the content of alice29.txt.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
with open('resource/alice29.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Clean the text
cleaned_text = clean_text(text)

# Write the cleaned text to cleaned.txt (same explicit encoding as the read)
with open('cleaned.txt', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_text)

print("Cleaning complete! The cleaned text has been written to 'cleaned.txt'.")

Cleaning complete! The cleaned text has been written to 'cleaned.txt'.


## Tokenizing it into sentences and words

1. Tokenize cleaned text: The code tokenizes the cleaned text into sentences and words using tokenize_text_into_sentences_and_words.

2. Write tokens to file: It writes each word from the tokenized sentences to word.txt, with each word on a new line.

3. Completion message: It prints a message confirming that the word tokenization is complete.

In [36]:
# Tokenize the cleaned text into sentences and words.
# NOTE(review): clean_text strips punctuation, so cleaned_text presumably has
# no sentence-ending marks left for sent_tokenize to split on — confirm
# whether sentence boundaries matter for this step.
sentence_words = tokenize_text_into_sentences_and_words(cleaned_text)

# Write the word tokens into word.txt, one word per line.
# The encoding is explicit so the output does not depend on the platform's
# default locale; writelines batches the writes instead of one call per word.
with open('word.txt', 'w', encoding='utf-8') as word_file:
    word_file.writelines(
        word + '\n' for sentence in sentence_words for word in sentence
    )

print("Tokenization complete! The words have been written to 'word.txt'.")

Tokenization complete! The words have been written to 'word.txt'.


## Basic frequency analysis to identify the most common words in the text.

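1. tokenize_text_into_sentences_and_words: Tokenizes text into sentences, then splits each sentence into words. (It is redefined in this cell, but the frequency analysis below does not call it; the analysis works on the token list returned by clean_text.)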

2. perform_frequency_analysis: Counts the frequency of each word in the list of tokens using Counter.

In [39]:
def clean_text(text):
    """Normalize raw text and return the surviving word tokens as a list.

    Steps, in order: lowercase, replace ASCII punctuation with spaces,
    remove digits, collapse whitespace, tokenize with NLTK, and drop
    English stopwords.

    Args:
        text: The raw text to clean.

    Returns:
        A list of tokens (not a joined string) in their original order.
    """
    # Lowercase first so tokens are compared to the stopword list in lowercase.
    text = text.lower()

    # Replace each punctuation character with a space rather than deleting it.
    # Deleting would fuse words that are separated only by punctuation
    # (e.g. "rabbit--hole" -> "rabbithole") and skew the word counts.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Collapse runs of whitespace left behind by the steps above
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text (split into words)
    tokens = word_tokenize(text)

    # Remove stopwords; a set gives constant-time membership tests
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [word for word in tokens if word not in stop_words]

    # Return the cleaned tokens as a list
    return cleaned_tokens

def tokenize_text_into_sentences_and_words(text):
    """Return the word tokens of ``text``, grouped by sentence.

    The text is first split with ``sent_tokenize``; every resulting
    sentence is then passed to ``word_tokenize``. The result is a list of
    token lists, one per sentence.
    """
    return [word_tokenize(piece) for piece in sent_tokenize(text)]

def perform_frequency_analysis(tokens):
    """Tally how many times each token occurs.

    Args:
        tokens: An iterable of hashable items (here, word strings).

    Returns:
        A ``collections.Counter`` mapping each distinct token to its count.
    """
    return Counter(tokens)

# Read the content of alice29.txt.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
try:
    with open('resource/alice29.txt', 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print("The file 'alice29.txt' was not found. Please check the file path.")
    # Raise SystemExit directly with a non-zero status. The `exit()` helper
    # is injected by the `site` module for interactive use, and calling it
    # without an argument reports success even though the run failed.
    raise SystemExit(1)

# Clean the text and get the list of cleaned tokens (words)
cleaned_tokens = clean_text(text)

# Write the cleaned tokens to cleaned.txt, separated by single spaces
with open('cleaned.txt', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(' '.join(cleaned_tokens))

print("Cleaning complete! The cleaned text has been written to 'cleaned.txt'.")

# Perform frequency analysis on the cleaned tokens
word_counts = perform_frequency_analysis(cleaned_tokens)

# Get the top 10 most common words and their counts
top_10_words = word_counts.most_common(10)

# Write the top 10 words and their frequencies to top_10_words.txt,
# one "word: count" entry per line
with open('top_10_words.txt', 'w', encoding='utf-8') as top_words_file:
    top_words_file.writelines(
        f"{word}: {count}\n" for word, count in top_10_words
    )

print("Frequency analysis complete! The top 10 words have been written to 'top_10_words.txt'.")

Cleaning complete! The cleaned text has been written to 'cleaned.txt'.
Frequency analysis complete! The top 10 words have been written to 'top_10_words.txt'.
