In [None]:
import nltk
nltk.download('all') # There's a problem with punkt package it returns a LookUpError error so I try download all package in the nltk and it's work
# nltk.download('punkt')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# Use nltk module

In [6]:
import re
import time
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.corpus import stopwords

# Start the timer
start_time = time.time()

# Open and read the file
with open('sources/alice29.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Clean the text by removing non-alphabetic characters and converting to lowercase
text = re.sub('[^a-zA-Z]', ' ', text)  # Only keep alphabetic characters and spaces
text = text.lower()  # Convert text to lowercase

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Tokenize each sentence into words
tokenized_words = [word_tokenize(sentence) for sentence in sentences]

# Get the list of stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from the tokenized words
filtered_words = [
    [word for word in sentence if word not in stop_words]
    for sentence in tokenized_words
]

# Save the tokenized sentences to a file as a comma-separated list
with open('TokenizedSentences.txt', 'w', encoding='utf-8') as file:
    file.write(', '.join(sentences) + '\n')  # Comma-separated sentences

# Save the tokenized words to a file, with words separated by commas
with open('TokenizedWords.txt', 'w', encoding='utf-8') as file:
    for sentence in filtered_words:
        file.write(', '.join(sentence) + '\n')  # Comma-separated words

# Count the most common words
flattened_words = [word for sentence in filtered_words for word in sentence]
word_counts = Counter(flattened_words)

# Get the 10 most common words and their counts
most_common_words = word_counts.most_common(10)

# Print the most common words and their counts
print("\n10 Most Common Words and their Counts:")
for word, count in most_common_words:
    print(f"{word}: {count}")

# End the timer
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"\nElapsed Time: {elapsed_time:.5f} seconds")



10 Most Common Words and their Counts:
said: 462
alice: 398
little: 128
one: 104
know: 88
like: 85
would: 83
went: 83
could: 77
queen: 75

Elapsed Time: 0.06181 seconds


# Use textblob module

In [None]:
from textblob import TextBlob
from collections import Counter
from nltk.corpus import stopwords
import re
import time

# Start the timer
start_time = time.time()

# Load stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Open and read the file
try:
    with open('resources/alice29.txt', 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise

# Clean the text: remove non-alphabetic characters and convert to lowercase
cleaned_text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

# Create a TextBlob object
blob = TextBlob(cleaned_text)

# Tokenize the text into words, removing stopwords
filtered_words = [
    word for word in blob.words if word not in stop_words
]

# Count the most common words
word_counts = Counter(filtered_words)

# Get the 10 most common words and their counts
most_common_words = word_counts.most_common(10)

# Save tokenized words and sentences to files
with open('TokenizedSentences.txt', 'w', encoding='utf-8') as sent_file, \
     open('TokenizedWords.txt', 'w', encoding='utf-8') as word_file:
    for sentence in blob.sentences:
        sent_file.write(str(sentence) + '\n')
    word_file.write(', '.join(filtered_words) + '\n')

# Print the most common words and their counts
print("\n10 Most Common Words and their Counts:")
for word, count in most_common_words:
    print(f"{word}: {count}")

# End the timer
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"\nElapsed Time: {elapsed_time:.5f} seconds")


ModuleNotFoundError: No module named 'textblob'

# Use spacy module

In [None]:
import spacy
from collections import Counter
import re
import time

# Start the timer
start_time = time.time()

# Load the spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Open and read the file
try:
    with open('/content/drive/MyDrive/NLP_Assignment/resources/alice29.txt', 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print("File not found. Please check the file path.")
    raise

# Clean the text using regular expressions
cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()

# Process the cleaned text using spaCy
doc = nlp(cleaned_text)

# Tokenize the text into sentences
sentences = list(doc.sents)

# Tokenize each sentence into words, removing stopwords and punctuation
filtered_words = [
    [token.text for token in sentence if not token.is_stop and not token.is_punct and token.is_alpha]
    for sentence in sentences
]

# Flatten the list of filtered words for counting
flattened_words = [word for sentence in filtered_words for word in sentence]

# Count the most common words
word_counts = Counter(flattened_words)

# Get the 10 most common words and their counts
most_common_words = word_counts.most_common(10)

# Save the tokenized sentences to a file
with open('TokenizedSentences.txt', 'w', encoding='utf-8') as file:
    for sentence in sentences:
        file.write(sentence.text.strip() + '\n')

# Save the tokenized words to a file, with words separated by commas
with open('TokenizedWords.txt', 'w', encoding='utf-8') as file:
    for sentence in filtered_words:
        file.write(', '.join(sentence) + '\n')

# Print the 10 most common words and their counts
print("\n10 Most Common Words and their Counts:")
for word, count in most_common_words:
    print(f"{word}: {count}")

# End the timer
end_time = time.time()

# Calculate and print the elapsed time
elapsed_time = end_time - start_time
print(f"\nElapsed Time: {elapsed_time:.5f} seconds")



10 Most Common Words and their Counts:
said: 462
alice: 398
t: 218
s: 201
little: 128
know: 88
like: 85
went: 83
queen: 75
thought: 74

Elapsed Time: 7.52701 seconds
