In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Downloads (Fixes LookupError)
# Fetch the tokenizer models and the stopword corpus used below, in order.
# 'punkt_tab' is listed alongside 'punkt' because newer NLTK releases look
# for it when tokenizing.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load document
def load_document(file_path):
    """Read the file at *file_path* and return its entire content as a string.

    The file is opened in text mode and decoded as UTF-8. Any error raised
    by ``open`` or by decoding propagates to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Tokenization
def tokenize_document(document):
    """Split *document* into lowercase, purely alphabetic tokens.

    The text is tokenized with NLTK's ``word_tokenize``. Every token for
    which ``str.isalpha()`` is false is dropped, which removes punctuation
    as well as tokens containing digits, apostrophes, or hyphens. The
    surviving tokens are lowercased and returned in their original order.
    """
    alphabetic_only = filter(str.isalpha, word_tokenize(document))
    return [token.lower() for token in alphabetic_only]

# Remove stopwords
def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    Matching is exact (case-sensitive) membership in the stopword list, and
    the relative order of the kept tokens is preserved.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

# Morphology (word frequency distribution)
def find_morphology(tokens, top_n=10):
    """Return the *top_n* most frequent tokens together with their counts.

    Despite the name, no morphological analysis is performed: the tokens
    are counted with NLTK's ``FreqDist`` and the result of its
    ``most_common(top_n)`` call, a list of ``(token, count)`` pairs, is
    returned.
    """
    return FreqDist(tokens).most_common(top_n)

# Main execution
# Single source of truth for how many words are reported, so the printed
# header can never disagree with the number actually requested.
TOP_N = 15

# Location of the input text (a Google Drive path as mounted in Colab).
document_path = "/content/drive/MyDrive/NLP LAB TASK/NLP Task 4 text.txt"

# Pipeline: read -> tokenize/lowercase -> drop stopwords -> count frequencies.
document = load_document(document_path)
tokens = tokenize_document(document)
tokens_without_stopwords = remove_stopwords(tokens)
morphology = find_morphology(tokens_without_stopwords, top_n=TOP_N)

# Print results: one "word: count" line per entry, most frequent first.
print(f"Morphology of the document (Top {TOP_N} words):")
for word, frequency in morphology:
    print(f"{word}: {frequency}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Morphology of the document (Top 15 words):
language: 2
artificial: 1
intelligence: 1
transforming: 1
industries: 1
worldwide: 1
natural: 1
processing: 1
allows: 1
machines: 1
understand: 1
process: 1
human: 1
effectively: 1
data: 1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
