In [19]:
!pip install nltk



In [20]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

In [23]:
text = """
Artificial Intelligence is transforming the world.
Students are learning AI and Machine Learning.
"""

In [26]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [27]:
sentences = sent_tokenize(text)
print(sentences)

['\nArtificial Intelligence is transforming the world.', 'Students are learning AI and Machine Learning.']


In [28]:
words = word_tokenize(text)
print("Words:", words)

Words: ['Artificial', 'Intelligence', 'is', 'transforming', 'the', 'world', '.', 'Students', 'are', 'learning', 'AI', 'and', 'Machine', 'Learning', '.']


In [29]:
stop_words = set(stopwords.words("english"))

filtered_words = [w for w in words if w.lower() not in stop_words]

print("After Stopword Removal:", filtered_words)

After Stopword Removal: ['Artificial', 'Intelligence', 'transforming', 'world', '.', 'Students', 'learning', 'AI', 'Machine', 'Learning', '.']


In [31]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [32]:
pos_tags = pos_tag(words)
print(pos_tags)

[('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('is', 'VBZ'), ('transforming', 'VBG'), ('the', 'DT'), ('world', 'NN'), ('.', '.'), ('Students', 'NNS'), ('are', 'VBP'), ('learning', 'VBG'), ('AI', 'NNP'), ('and', 'CC'), ('Machine', 'NNP'), ('Learning', 'NNP'), ('.', '.')]


In [33]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(w) for w in filtered_words]

print("Stemmed Words:", stemmed_words)

Stemmed Words: ['artifici', 'intellig', 'transform', 'world', '.', 'student', 'learn', 'ai', 'machin', 'learn', '.']


In [34]:
lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]

print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['Artificial', 'Intelligence', 'transforming', 'world', '.', 'Students', 'learning', 'AI', 'Machine', 'Learning', '.']


In [35]:
print("Total Words:", len(words))
print("Unique Words:", len(set(words)))
print("Total Sentences:", len(sentences))

Total Words: 15
Unique Words: 14
Total Sentences: 2


In [36]:
def text_analysis(text):

    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    stop_words = set(stopwords.words("english"))
    filtered = [w for w in words if w.lower() not in stop_words]

    pos_tags = pos_tag(words)

    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in filtered]

    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

    print("Words:", words)
    print("\nFiltered:", filtered)
    print("\nPOS Tags:", pos_tags)
    print("\nStemmed:", stemmed)
    print("\nLemmatized:", lemmatized)
    print("\nTotal Words:", len(words))
    print("Unique Words:", len(set(words)))
    print("Total Sentences:", len(sentences))

In [37]:
text_analysis("AI is changing the future of technology and education.")

Words: ['AI', 'is', 'changing', 'the', 'future', 'of', 'technology', 'and', 'education', '.']

Filtered: ['AI', 'changing', 'future', 'technology', 'education', '.']

POS Tags: [('AI', 'NNP'), ('is', 'VBZ'), ('changing', 'VBG'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('technology', 'NN'), ('and', 'CC'), ('education', 'NN'), ('.', '.')]

Stemmed: ['ai', 'chang', 'futur', 'technolog', 'educ', '.']

Lemmatized: ['AI', 'changing', 'future', 'technology', 'education', '.']

Total Words: 10
Unique Words: 10
Total Sentences: 1
