# Sentence & Word Tokenization In NLTK

In [None]:
pip install nltk

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

In [None]:
# Split the sample text into sentences with NLTK's sentence tokenizer.
# NOTE(review): sent_tokenize relies on NLTK's Punkt tokenizer data being
# available; presumably it is pre-installed in this environment — confirm,
# or download it first if a LookupError is raised.
sent_tokenize("Mr. Ahmed likes pizza from Italy. Maria loves sushi from Tokyo!")

In [None]:
# Tokenize the same sample text into individual word-level tokens with NLTK.
from nltk.tokenize import word_tokenize
word_tokenize("Mr. Ahmed likes pizza from Italy. Maria loves sushi from Tokyo!")

#  Sentence Tokenization In Spacy

In [None]:
pip install spacy

In [None]:
import spacy

In [None]:
# Load spaCy's small English language model; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

# Run the loaded pipeline over the sample text to get a processed Doc.
doc = nlp("Mr. Ahmed likes pizza from Italy. Maria loves sushi from Tokyo!")

In [None]:
# Display the processed Doc as the cell's output.
doc

In [None]:
# `doc.sents` yields the individual sentences found by spaCy's sentence
# boundary detection; print them one per line.
for sent in doc.sents:
    print(sent)

#  Word Tokenization

In [None]:
# Walk the sentences in order and print every token they contain, one per line.
tokens_in_sentence_order = (token for sent in doc.sents for token in sent)
for token in tokens_in_sentence_order:
    print(token)

# Collecting email ids of students from students information sheet using spacy

In [None]:
# Open the students information sheet from the Kaggle input directory and
# read all of its lines into a list (each element keeps its trailing newline).
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the dataset.

with open(
    "/kaggle/input/test-txt/dayton_high_school_students.txt",
    encoding="utf-8",
) as f:
    text = f.readlines()
text

In [None]:
# Join all lines from the list into a single string with spaces between them.
# Guard against re-running this cell: once `text` is already a string, joining
# it again would insert a space between every single character.

if not isinstance(text, str):
    text = " ".join(text)
text

In [None]:
# Run the pipeline over the joined text and keep the text of every token
# that spaCy flags as looking like an email address.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

# Extract URLS from text 

In [None]:
# Sample paragraph containing several URLs to extract.
# Each URL is kept on a single line: a URL broken across a line wrap
# (e.g. "http://www.science." / "gov/") is split into separate tokens and
# can never be recovered as one URL.
text = '''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and
http://www.science.gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.'''

In [None]:
# Run the pipeline over the sample paragraph and keep the text of every token
# that spaCy flags as looking like a URL.
doc = nlp(text)
url = [token.text for token in doc if token.like_url]
url

#  Extract all money transactions, along with their currency, from the sentence below.

In [None]:
# Sample sentence in which each amount is followed by a currency symbol.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

In [None]:
# Print every number-like token that is immediately followed by a currency
# symbol, together with that symbol.
doc = nlp(transactions)
for token in doc:
    # A number-like token at the very end of the text has no successor;
    # skip it rather than index past the end of the Doc (IndexError).
    if not token.like_num or token.i + 1 >= len(doc):
        continue
    following = doc[token.i + 1]
    if following.is_currency:
        print(token.text, following.text)



# Lowercasing
> Converting all text to lowercase to reduce variability in words (e.g., "Apple" and "apple" treated the same).

In [None]:
# Lowercase the sample sentence using the text held by the processed Doc.
text = "This is an Example with Mixed CASES."
doc = nlp(text)

lowercased_text = doc.text.lower()
print(lowercased_text)

# Lemmatization/Stemming 
> Lemmatization is the process of reducing a word to its base or dictionary form, known as the lemma. Unlike stemming, which simply strips affixes by rule, this process considers the word’s part of speech and the context in which it is used.

In [None]:
# Print the lemma of every token in both example sentences.
# Each sentence is processed and printed in turn; assigning `doc` twice in a
# row would discard the first sentence before any of its lemmas are shown.
examples = [
    "Mando talked for 3 hours although talking isn't his thing",
    "eating eats eat ate adjustable rafting ability meeting better",
]
for example in examples:
    doc = nlp(example)
    for token in doc:
        print(token.lemma_)

In [None]:
# Rebind `nlp` to a freshly loaded small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Print "token | lemma" for every token in both example sentences.
# Each sentence is processed and printed in turn; assigning `doc` twice in a
# row would discard the first sentence before any of its tokens are shown.
examples = [
    "Mando talked for 3 hours although talking isn't his thing",
    "eating eats eat ate adjustable rafting ability meeting better",
]
for example in examples:
    doc = nlp(example)
    for token in doc:
        print(token, " | ", token.lemma_)

In [None]:
# Run both example sentences through the pipeline.
doc1 = nlp("Mando talked for 3 hours although talking isn't his thing")
doc2 = nlp("eating eats eat ate adjustable rafting ability meeting better")

# Rebuild each sentence from the lemmas of its tokens, separated by spaces.
lemmatized_sentence1 = ' '.join([token.lemma_ for token in doc1])
lemmatized_sentence2 = ' '.join([token.lemma_ for token in doc2])

# Show each heading followed by its lemmatized sentence on the next line.
print("Lemmatized Sentence 1:", lemmatized_sentence1, sep="\n")
print("\nLemmatized Sentence 2:", lemmatized_sentence2, sep="\n")


# Punctuation Removal
>  Stripping out punctuation marks to focus on meaningful text.

In [None]:
# Sample text containing several punctuation marks.
text = "Hello, world! It's a sunny day. Can you believe it? Wow!"
doc = nlp(text)

# Keep only the tokens spaCy does not flag as punctuation and join their
# text back together with single spaces.
non_punct_tokens = (token for token in doc if not token.is_punct)
text_no_punctuation = " ".join(token.text for token in non_punct_tokens)

print(text_no_punctuation)

# Part-of-Speech (POS) tagging
> Part-of-Speech (POS) tagging is the process of assigning grammatical categories (e.g., noun, verb, adjective) to each word in a text based on its role in the sentence.









In [None]:
# Tag the example sentence and print, for each token, its text followed by
# its `pos_` value and, in parentheses, its `tag_` value.
text = "Natural Language Processing is a fascinating field of study."
doc = nlp(text)

print("spaCy POS Tags:")
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

#  Stopword Removal
>  Stopword Removal is the process of eliminating common words from a text that carry little meaning and are often removed to improve the efficiency and accuracy of NLP tasks (e.g., "the", "is", "in").

In [None]:
# Run the example sentence through the pipeline.
text = "Natural Language Processing is an interesting field of study."
doc = nlp(text)

# Drop every token spaCy flags as a stopword and keep the text of the rest.
non_stop_tokens = (token for token in doc if not token.is_stop)
filtered_tokens = [token.text for token in non_stop_tokens]

print("Tokens after stopword removal:", filtered_tokens)

# Named Entity Recognition (NER) 
>  Named Entity Recognition (NER) is a process in Natural Language Processing (NLP) that identifies and classifies named entities in a text into predefined categories, such as names of persons, organizations, locations, dates, and more.

In [None]:
# Run the example sentence through the pipeline.
text = "Apple Inc. is planning to open a new office in San Francisco in 2024."
doc = nlp(text)

# List each entity found in `doc.ents` as "<entity text> <entity label>".
print("Named Entities using spaCy:")
for entity in doc.ents:
    print(entity.text, entity.label_)

# PRACTICE EXERCISE 
> Sample Text:
"Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. The company, headquartered in Cupertino, California, revolutionized personal computing with the Macintosh in 1984. As of 2024, Apple is one of the most valuable companies in the world, with a market capitalization of over $2 trillion."
* Tokenize the sample text into words and sentences using spaCy.
* Remove stopwords from the tokenized words using spaCy's built-in stopword list.
* Lemmatize the tokenized words using spaCy to convert them to their base forms.
* Identify and classify named entities in the text, such as people, organizations, and locations.
*  Tag each word in the text with its corresponding part of speech.
