<a href="https://colab.research.google.com/github/DivyaNarayan0613/DivyaNarayan0613/blob/main/Text_Preprocessing_with_NLTK_%26_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [None]:
# Fetch the Punkt tokenizer data used by word_tokenize / sent_tokenize.
# Newer NLTK releases (>= 3.8.2) look up the pickle-free 'punkt_tab' package
# instead of 'punkt', so both are requested to keep this cell working on a
# fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')  # Download tokenizer models
nltk.download('punkt_tab')  # Same models in the format newer NLTK releases load

# Sample paragraph used by the tokenization, stop-word and stemming cells.
text = "Geoffrey Hinton is a computer scientist and cognitive psychologist, Hinton is known for his work on artificial neural networks. He works at Google Brain, the University of Toronto, and the Vector Institute of AI"

# Split the paragraph into word/punctuation tokens and into sentences.
words = word_tokenize(text)
sentences = sent_tokenize(text)
print("Words:", words)
print("Sentences:", sentences)


Words: ['Geoffrey', 'Hinton', 'is', 'a', 'computer', 'scientist', 'and', 'cognitive', 'psychologist', ',', 'Hinton', 'is', 'known', 'for', 'his', 'work', 'on', 'artificial', 'neural', 'networks', '.', 'He', 'works', 'at', 'Google', 'Brain', ',', 'the', 'University', 'of', 'Toronto', ',', 'and', 'the', 'Vector', 'Institute', 'of', 'AI']
Sentences: ['Geoffrey Hinton is a computer scientist and cognitive psychologist, Hinton is known for his work on artificial neural networks.', 'He works at Google Brain, the University of Toronto, and the Vector Institute of AI']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Make sure NLTK's stop-word corpus is available, then load the English list
# into a set for fast membership checks.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not an English stop word.
# The original casing of the surviving tokens is preserved.
filtered_words = list(filter(lambda tok: tok.lower() not in stop_words, words))
print("Filtered Words:", filtered_words)


Filtered Words: ['Geoffrey', 'Hinton', 'computer', 'scientist', 'cognitive', 'psychologist', ',', 'Hinton', 'known', 'work', 'artificial', 'neural', 'networks', '.', 'works', 'Google', 'Brain', ',', 'University', 'Toronto', ',', 'Vector', 'Institute', 'AI']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# WordNet corpus download; the lemmatizer below is NLTK's WordNetLemmatizer.
nltk.download('wordnet')

# Stemming: run the Porter stemmer over every stop-word-filtered token.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, filtered_words))

# Lemmatization: run the WordNet lemmatizer (default arguments) over the
# same tokens so the two normalizations can be compared side by side.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))

print("Stemmed Words:", stemmed)
print("Lemmatized Words:", lemmatized)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stemmed Words: ['geoffrey', 'hinton', 'comput', 'scientist', 'cognit', 'psychologist', ',', 'hinton', 'known', 'work', 'artifici', 'neural', 'network', '.', 'work', 'googl', 'brain', ',', 'univers', 'toronto', ',', 'vector', 'institut', 'ai']
Lemmatized Words: ['Geoffrey', 'Hinton', 'computer', 'scientist', 'cognitive', 'psychologist', ',', 'Hinton', 'known', 'work', 'artificial', 'neural', 'network', '.', 'work', 'Google', 'Brain', ',', 'University', 'Toronto', ',', 'Vector', 'Institute', 'AI']


## Named Entity Recognition (NER) using spaCy

In [None]:
# Download the en_core_web_sm model explicitly
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 330.3 kB/s eta 0:29:40
     -------------------------------------- 0.1/587.7 MB 656.4 kB/s eta 0:14:56
     ---------------------------------------- 0.2/587.7 MB 1.5 MB/s eta 0:06:39
     ---------------------------------------- 0.3/587.7 MB 1.8 MB/s eta 0:05:19
     ---------------------------------------- 0.5/587.7 MB 2.3 MB/s eta 0:04:19
     ---------------------------------------- 0.7/587.7 MB 2.4 MB/s eta 0:04:02
     ---------------------------------------- 0.8/587.7 MB 2.6 MB/s eta 0:03:47
     ---------------------------------------- 1.0/587.7 MB 2.9 MB/s eta 0:03:26
     ---------------------------------------- 1.2/587.7 MB 3.1 MB/s eta 0:03:11
     -------------------------

In [None]:
# Load the spaCy pipeline once; `nlp` is reused by the NER cell that follows.
import spacy
nlp = spacy.load("en_core_web_lg")  # Large English pipeline (en_core_web_lg), not the small one


In [None]:
# Sample paragraph to analyse (same wording as in the tokenization cell).
text = "Geoffrey Hinton is a computer scientist and cognitive psychologist, Hinton is known for his work on artificial neural networks. He works at Google Brain, the University of Toronto, and the Vector Institute of AI"

# Run the loaded spaCy pipeline over the paragraph.
doc = nlp(text)

# Print one "<entity text> - <label>" line per entity found in the document.
print("Named Entities:")
for ent in doc.ents:
    print(ent.text, ent.label_, sep=" - ")


Named Entities:
Geoffrey Hinton - PERSON
Hinton - PERSON
Google Brain - ORG
the University of Toronto - ORG
the Vector Institute - ORG
