#### **Tokenization and Lemmatization**

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import webtext
nltk.download('all')


text_raw = webtext.raw('pirates.txt')
text_tokens = word_tokenize(text_raw)
text1 = nltk.Text(text_tokens)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

In [5]:
total_tokens = len(text_tokens)
print("Total Tokens:", total_tokens)

unique_tokens = len(set(text_tokens))
print("Unique Tokens:", unique_tokens)

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in text_tokens]
unique_lem_tokens = len(set(lemmatized_tokens))
print("Unique Tokens after Lemmatization:", unique_lem_tokens)

Total Tokens: 21987
Unique Tokens: 3111
Unique Tokens after Lemmatization: 2894


In [6]:
lexical_diversity = unique_tokens / total_tokens
print("Lexical Diversity:", lexical_diversity)

target_word = "pirate"
count_target = sum(1 for word in text_tokens if word.lower() == target_word)
percentage = (count_target / total_tokens) * 100
print(f"Percentage of '{target_word}': {percentage:.4f}%")


Lexical Diversity: 0.14149270023195526
Percentage of 'pirate': 0.0409%


In [7]:
freq_dist = FreqDist(text_tokens)
most_common_20 = freq_dist.most_common(20)
print("20 Most Frequent Tokens:", most_common_20)

frequent_tokens = [word for word, freq in freq_dist.items() if len(word) > 5 and freq > 10]
print("Tokens with length >5 and freq >10:", frequent_tokens)

longest_word = max(text_tokens, key=len)
print("Longest Word and its Length:", (longest_word, len(longest_word)))


20 Most Frequent Tokens: [('the', 1027), (':', 916), (',', 855), ('.', 802), ('[', 643), (']', 642), ('!', 430), ('a', 410), ('to', 363), ('of', 282), ('Jack', 277), ('?', 232), ('*', 222), ("'s", 218), ('I', 214), ('you', 213), ('and', 210), ('is', 200), ('JACK', 193), ('SPARROW', 193)]
Tokens with length >5 and freq >10: ['Elizabeth', 'sitting', 'holding', 'running', 'ELIZABETH', 'TURNER', 'Beckett', 'standing', 'CUTLER', 'BECKETT', 'Mister', 'Turner', 'Norrington', 'Sparrow', 'Captain', 'aboard', 'bottle', 'toward', 'bridge', 'inside', 'across', 'pistol', 'points', 'captain', 'SPARROW', 'something', 'around', 'monkey', 'crewman', 'Compass', 'Bootstrap', 'Flying', 'Dutchman', 'behind', 'crewmember', 'against', 'CANNIBAL', 'ISLAND', 'through', 'cannibal', 'cannibals', 'Ragetti', 'longboat', 'RAGETTI', 'PINTEL', 'crewmen', 'Pintel', 'BELLAMY', 'tentacles', 'Kraken', 'NORRINGTON', 'Hadrus']
Longest Word and its Length: ('Heh-heh-heh-heh-heh-heh-heh-heh', 31)
