In [1]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')




[nltk_data] Downloading package punkt to /Users/aiwaziri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aiwaziri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aiwaziri/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aiwaziri/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to NOUN



In [3]:
# Read text from a file
file_path = 'wiki_mountain_def.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text
tokens = word_tokenize(text)


In [4]:
# Stemming
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemmed Words:", stemmed_words)



Stemmed Words: ['there', 'is', 'no', 'univers', 'accept', 'definit', 'of', 'a', 'mountain', '.', 'elev', ',', 'volum', ',', 'relief', ',', 'steep', ',', 'space', 'and', 'continu', 'have', 'been', 'use', 'as', 'criteria', 'for', 'defin', 'a', 'mountain', '.', '[', '4', ']', 'in', 'the', 'oxford', 'english', 'dictionari', 'a', 'mountain', 'is', 'defin', 'as', '``', 'a', 'natur', 'elev', 'of', 'the', 'earth', 'surfac', 'rise', 'more', 'or', 'less', 'abruptli', 'from', 'the', 'surround', 'level', 'and', 'attain', 'an', 'altitud', 'which', ',', 'rel', 'to', 'the', 'adjac', 'elev', ',', 'is', 'impress', 'or', 'notabl', '.', '``', '[', '4', ']', 'whether', 'a', 'landform', 'is', 'call', 'a', 'mountain', 'may', 'depend', 'on', 'local', 'usag', '.', 'john', 'whittow', "'s", 'dictionari', 'of', 'physic', 'geographi', '[', '5', ']', 'state', '``', 'some', 'author', 'regard', 'emin', 'abov', '600', 'metr', '(', '1,969', 'ft', ')', 'as', 'mountain', ',', 'those', 'below', 'be', 'refer', 'to', 'as',

In [5]:
# Lemmatization
lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['There', 'be', 'no', 'universally', 'accepted', 'definition', 'of', 'a', 'mountain', '.', 'Elevation', ',', 'volume', ',', 'relief', ',', 'steepness', ',', 'space', 'and', 'continuity', 'have', 'be', 'use', 'a', 'criterion', 'for', 'define', 'a', 'mountain', '.', '[', '4', ']', 'In', 'the', 'Oxford', 'English', 'Dictionary', 'a', 'mountain', 'be', 'define', 'a', '``', 'a', 'natural', 'elevation', 'of', 'the', 'earth', 'surface', 'rise', 'more', 'or', 'less', 'abruptly', 'from', 'the', 'surround', 'level', 'and', 'attain', 'an', 'altitude', 'which', ',', 'relatively', 'to', 'the', 'adjacent', 'elevation', ',', 'be', 'impressive', 'or', 'notable', '.', '``', '[', '4', ']', 'Whether', 'a', 'landform', 'be', 'call', 'a', 'mountain', 'may', 'depend', 'on', 'local', 'usage', '.', 'John', 'Whittow', "'s", 'Dictionary', 'of', 'Physical', 'Geography', '[', '5', ']', 'state', '``', 'Some', 'authority', 'regard', 'eminence', 'above', '600', 'metre', '(', '1,969', 'ft', ')', 'a'