In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ne_chunk

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     /Users/shreyaspandey/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
text = """
this is Tajmehal Computer science is a dynamic and ever-evolving field that encompasses the study of algorithms, data structures, programming languages, and the theoretical foundations of computing. It plays a pivotal role in shaping the modern world, driving innovation across various industries. Computer scientists analyze and solve complex problems, develop cutting-edge software, and design efficient algorithms to enhance computational capabilities. From artificial intelligence and machine learning to cybersecurity and software engineering, computer science influences nearly every aspect of our daily lives. As technology continues to advance, the field of computer science remains at the forefront, paving the way for transformative breakthroughs and shaping the digital landscape of the future.
"""

In [4]:
tokens = word_tokenize(text)

In [5]:
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]

In [6]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

In [7]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

In [8]:
pos_tags = nltk.pos_tag(tokens)

In [9]:
ner_tags = ne_chunk(pos_tags)

In [10]:
print("Tokens:", tokens)

Tokens: ['this', 'is', 'Tajmehal', 'Computer', 'science', 'is', 'a', 'dynamic', 'and', 'ever-evolving', 'field', 'that', 'encompasses', 'the', 'study', 'of', 'algorithms', ',', 'data', 'structures', ',', 'programming', 'languages', ',', 'and', 'the', 'theoretical', 'foundations', 'of', 'computing', '.', 'It', 'plays', 'a', 'pivotal', 'role', 'in', 'shaping', 'the', 'modern', 'world', ',', 'driving', 'innovation', 'across', 'various', 'industries', '.', 'Computer', 'scientists', 'analyze', 'and', 'solve', 'complex', 'problems', ',', 'develop', 'cutting-edge', 'software', ',', 'and', 'design', 'efficient', 'algorithms', 'to', 'enhance', 'computational', 'capabilities', '.', 'From', 'artificial', 'intelligence', 'and', 'machine', 'learning', 'to', 'cybersecurity', 'and', 'software', 'engineering', ',', 'computer', 'science', 'influences', 'nearly', 'every', 'aspect', 'of', 'our', 'daily', 'lives', '.', 'As', 'technology', 'continues', 'to', 'advance', ',', 'the', 'field', 'of', 'computer'

In [11]:
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['thi', 'is', 'tajmeh', 'comput', 'scienc', 'is', 'a', 'dynam', 'and', 'ever-evolv', 'field', 'that', 'encompass', 'the', 'studi', 'of', 'algorithm', ',', 'data', 'structur', ',', 'program', 'languag', ',', 'and', 'the', 'theoret', 'foundat', 'of', 'comput', '.', 'it', 'play', 'a', 'pivot', 'role', 'in', 'shape', 'the', 'modern', 'world', ',', 'drive', 'innov', 'across', 'variou', 'industri', '.', 'comput', 'scientist', 'analyz', 'and', 'solv', 'complex', 'problem', ',', 'develop', 'cutting-edg', 'softwar', ',', 'and', 'design', 'effici', 'algorithm', 'to', 'enhanc', 'comput', 'capabl', '.', 'from', 'artifici', 'intellig', 'and', 'machin', 'learn', 'to', 'cybersecur', 'and', 'softwar', 'engin', ',', 'comput', 'scienc', 'influenc', 'nearli', 'everi', 'aspect', 'of', 'our', 'daili', 'live', '.', 'as', 'technolog', 'continu', 'to', 'advanc', ',', 'the', 'field', 'of', 'comput', 'scienc', 'remain', 'at', 'the', 'forefront', ',', 'pave', 'the', 'way', 'for', 'transform', 'br

In [12]:
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['this', 'is', 'Tajmehal', 'Computer', 'science', 'is', 'a', 'dynamic', 'and', 'ever-evolving', 'field', 'that', 'encompasses', 'the', 'study', 'of', 'algorithm', ',', 'data', 'structure', ',', 'programming', 'language', ',', 'and', 'the', 'theoretical', 'foundation', 'of', 'computing', '.', 'It', 'play', 'a', 'pivotal', 'role', 'in', 'shaping', 'the', 'modern', 'world', ',', 'driving', 'innovation', 'across', 'various', 'industry', '.', 'Computer', 'scientist', 'analyze', 'and', 'solve', 'complex', 'problem', ',', 'develop', 'cutting-edge', 'software', ',', 'and', 'design', 'efficient', 'algorithm', 'to', 'enhance', 'computational', 'capability', '.', 'From', 'artificial', 'intelligence', 'and', 'machine', 'learning', 'to', 'cybersecurity', 'and', 'software', 'engineering', ',', 'computer', 'science', 'influence', 'nearly', 'every', 'aspect', 'of', 'our', 'daily', 'life', '.', 'As', 'technology', 'continues', 'to', 'advance', ',', 'the', 'field', 'of', 'computer', '

In [13]:
print("Filtered Tokens :", filtered_tokens)

Filtered Tokens : ['Tajmehal', 'Computer', 'science', 'dynamic', 'ever-evolving', 'field', 'encompasses', 'study', 'algorithms', ',', 'data', 'structures', ',', 'programming', 'languages', ',', 'theoretical', 'foundations', 'computing', '.', 'plays', 'pivotal', 'role', 'shaping', 'modern', 'world', ',', 'driving', 'innovation', 'across', 'various', 'industries', '.', 'Computer', 'scientists', 'analyze', 'solve', 'complex', 'problems', ',', 'develop', 'cutting-edge', 'software', ',', 'design', 'efficient', 'algorithms', 'enhance', 'computational', 'capabilities', '.', 'artificial', 'intelligence', 'machine', 'learning', 'cybersecurity', 'software', 'engineering', ',', 'computer', 'science', 'influences', 'nearly', 'every', 'aspect', 'daily', 'lives', '.', 'technology', 'continues', 'advance', ',', 'field', 'computer', 'science', 'remains', 'forefront', ',', 'paving', 'way', 'transformative', 'breakthroughs', 'shaping', 'digital', 'landscape', 'future', '.']


In [14]:
print("Part-of-Speech Tags:", pos_tags)

Part-of-Speech Tags: [('this', 'DT'), ('is', 'VBZ'), ('Tajmehal', 'NNP'), ('Computer', 'NNP'), ('science', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('dynamic', 'JJ'), ('and', 'CC'), ('ever-evolving', 'JJ'), ('field', 'NN'), ('that', 'WDT'), ('encompasses', 'VBZ'), ('the', 'DT'), ('study', 'NN'), ('of', 'IN'), ('algorithms', 'NN'), (',', ','), ('data', 'NN'), ('structures', 'NNS'), (',', ','), ('programming', 'VBG'), ('languages', 'NNS'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('theoretical', 'JJ'), ('foundations', 'NNS'), ('of', 'IN'), ('computing', 'VBG'), ('.', '.'), ('It', 'PRP'), ('plays', 'VBZ'), ('a', 'DT'), ('pivotal', 'JJ'), ('role', 'NN'), ('in', 'IN'), ('shaping', 'VBG'), ('the', 'DT'), ('modern', 'JJ'), ('world', 'NN'), (',', ','), ('driving', 'VBG'), ('innovation', 'NN'), ('across', 'IN'), ('various', 'JJ'), ('industries', 'NNS'), ('.', '.'), ('Computer', 'NNP'), ('scientists', 'NNS'), ('analyze', 'VBP'), ('and', 'CC'), ('solve', 'VBP'), ('complex', 'JJ'), ('problems', 'NNS'),

In [15]:
ner_tags = ne_chunk(pos_tags)
print("Named Entities:")
for chunk in ner_tags:
    if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))

Named Entities:
PERSON Tajmehal Computer
ORGANIZATION Computer
