### Natural Language Processing
1. Tokenization
* Using NLTK and spaCy

In [2]:
import nltk


In [3]:
# Run NLTK's default word tokenizer on a short sentence and print the
# resulting token list.
sample_sentence = "I love you !"
word_tokens = nltk.word_tokenize(sample_sentence)
print(word_tokens)

['I', 'love', 'you', '!']


In [5]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on a sentence and print each token's text on its own line.
intro_doc = nlp("Spacy is a powerful NLP library")
print(*(tok.text for tok in intro_doc), sep="\n")

Spacy
is
a
powerful
NLP
library


## NLTK (Natural Language Toolkit)

In [30]:
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import RegexpTokenizer
#from nltk.tokenize.moses import MosesTokenizer
from nltk.tokenize import TweetTokenizer
# Compare three NLTK tokenizers on a sentence that contains contractions.
# Results are printed in this order: nltk.word_tokenize,
# TreebankWordTokenizer, WordPunctTokenizer.
contraction_text = "We're testing NLTK's tokenization."
tokenizations = (
    nltk.word_tokenize(contraction_text),
    TreebankWordTokenizer().tokenize(contraction_text),
    WordPunctTokenizer().tokenize(contraction_text),
)
for token_list in tokenizations:
    print(token_list)

['We', "'re", 'testing', 'NLTK', "'s", 'tokenization', '.']
['We', "'re", 'testing', 'NLTK', "'s", 'tokenization', '.']
['We', "'", 're', 'testing', 'NLTK', "'", 's', 'tokenization', '.']


In [33]:
# types of tokenizer
def types_of_tokenizer(
    tokenizer,
    text="Hey @user! Check this out: https://example.com 😂 #fun",
):
    """Tokenize a sample string with the given tokenizer.

    Args:
        tokenizer: Any object exposing a ``tokenize(text)`` method.
        text: The string to tokenize. Defaults to a tweet-like sample that
            contains a mention, a URL, an emoji and a hashtag, so existing
            one-argument calls behave exactly as before.

    Returns:
        Whatever ``tokenizer.tokenize(text)`` returns.
    """
    return tokenizer.tokenize(text)
# Run every tokenizer on the same sample text so the outputs can be compared
# line by line.
# The regex tokenizer is built outside the f-string: a backslash inside an
# f-string replacement field is a SyntaxError before Python 3.12.
# The label spellings are left untouched so the recorded output still matches.
word_only_tokenizer = RegexpTokenizer(r'\w+')
print(f"WordPunctTokenizer: {types_of_tokenizer(WordPunctTokenizer())}")
print(f"Treebankword: {types_of_tokenizer(TreebankWordTokenizer())}")
print(f"WhitespaceTokenizer:{types_of_tokenizer(WhitespaceTokenizer())}")
print(f"RegexpTokenzier:{types_of_tokenizer(word_only_tokenizer)}")
print(f"TwitterTokenzier:{types_of_tokenizer(TweetTokenizer())}")

WordPunctTokenizer: ['Hey', '@', 'user', '!', 'Check', 'this', 'out', ':', 'https', '://', 'example', '.', 'com', '😂', '#', 'fun']
Treebankword: ['Hey', '@', 'user', '!', 'Check', 'this', 'out', ':', 'https', ':', '//example.com', '😂', '#', 'fun']
WhitespaceTokenizer:['Hey', '@user!', 'Check', 'this', 'out:', 'https://example.com', '😂', '#fun']
RegexpTokenzier:['Hey', 'user', 'Check', 'this', 'out', 'https', 'example', 'com', 'fun']
TwitterTokenzier:['Hey', '@user', '!', 'Check', 'this', 'out', ':', 'https://example.com', '😂', '#fun']


### Tokenization in spaCy



In [34]:
import spacy
# Tokenize a sentence with spaCy and print the token texts as a list.
nlp = spacy.load("en_core_web_sm")

greeting = "Hello! How's your NLP journey going?"
print([tok.text for tok in nlp(greeting)])

['Hello', '!', 'How', "'s", 'your', 'NLP', 'journey', 'going', '?']


### Stopwords removal

In [39]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Remove English stopwords from a tokenized sentence. Tokens are lower-cased
# only for the membership test, so the tokens that are kept retain their
# original casing; punctuation tokens are not stopwords and stay in the list.
sentence = "This is a simple example to demonstrate stopword removal."
words = word_tokenize(sentence)

# Build the stopword collection once, as a set, before filtering.
english_stopwords = set(stopwords.words('english'))
filtered_words = [w for w in words if w.lower() not in english_stopwords]

print("Original words:", words)
print("Words after removing stopwords:", filtered_words)


Original words: ['This', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'stopword', 'removal', '.']
Words after removing stopwords: ['simple', 'example', 'demonstrate', 'stopword', 'removal', '.']


### Stemming
Types of stemmers
* PorterStemmer
* LancasterStemmer (more aggressive)
* SnowballStemmer
* RegexpStemmer

In [43]:
# Porterstemmer
from nltk.stem import PorterStemmer

# Stem each sample word with the Porter stemmer and print the stems.
# NOTE(review): "stduies" looks like a typo for "studies"; it is kept as-is
# because the recorded output was produced from this exact list.
porter = PorterStemmer()
porter_samples = ["running", "files", "easily", "fairly", "stduies"]
print(list(map(porter.stem, porter_samples)))


['run', 'file', 'easili', 'fairli', 'stdui']


In [47]:
# LancasterStemmer
from nltk.stem import LancasterStemmer
# Stem the sample words with the Lancaster stemmer; the list also includes
# "@" as a non-alphabetic sample.
# NOTE(review): "stduies" looks like a typo for "studies"; it is kept as-is
# because the recorded output was produced from this exact list.
lancaster = LancasterStemmer()
lancaster_samples = ["running", "files", "easily", "fairly", "stduies", "@"]
print([lancaster.stem(sample) for sample in lancaster_samples])

['run', 'fil', 'easy', 'fair', 'stduies', '@']


In [52]:
# RegexpStemmer
from nltk.stem import RegexpStemmer
# Regex-based stemming: remove a trailing "ning", "s" or "ly" from each word.
# min=4 is the minimum word length the stemmer will touch (per the NLTK
# RegexpStemmer docs — TODO confirm for the installed version).
suffix_stemmer = RegexpStemmer('ning$|s$|ly$', min=4)
suffix_samples = ["running", "flies", "easily", "fairly", "studies"]
print([suffix_stemmer.stem(sample) for sample in suffix_samples])


['run', 'flie', 'easi', 'fair', 'studie']


## Lemmatization

In [54]:
# WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Lemmatize sample words with WordNet. No part of speech is passed here, and
# the recorded output shows "running" and "better" coming back unchanged.
wordnet_lemmatizer = WordNetLemmatizer()
lemma_samples = ["running", "flies", "better", "feet"]
print(list(map(wordnet_lemmatizer.lemmatize, lemma_samples)))

# Passing pos="a" (adjective) changes the result for "better".
print(wordnet_lemmatizer.lemmatize("better", pos="a"))

['running', 'fly', 'better', 'foot']
good


### POS (Part-of-Speech) Tagging

In [55]:
#Pos_tag
from nltk import pos_tag

# Tokenize the sentence, tag each token with NLTK's pos_tag, and print the
# resulting list of (token, tag) pairs.
# `word_tokenize` is imported in an earlier cell.
pangram = "The quick brown fox jumps over the lazy dog."
print(pos_tag(word_tokenize(pangram)))

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [57]:
#using spacy
# For every token print its text followed by its pos_ and tag_ attributes
# (e.g. "DET DT" for "The" in the recorded output).
nlp = spacy.load("en_core_web_sm")
pangram_doc = nlp("The quick brown fox jumps over the lazy dog.")

for tok in pangram_doc:
    print(f"{tok.text} -> {tok.pos_} {tok.tag_}")

The -> DET DT
quick -> ADJ JJ
brown -> ADJ JJ
fox -> NOUN NN
jumps -> VERB VBZ
over -> ADP IN
the -> DET DT
lazy -> ADJ JJ
dog -> NOUN NN
. -> PUNCT .


### Named Entity Recognition 


In [59]:
# Using spacy
import spacy

# Print each named entity found in the sentence together with its label.
nlp = spacy.load("en_core_web_sm")

bio_doc = nlp("Bal Ram Reddy was born in India in 2002")
for entity in bio_doc.ents:
    print(f"{entity.text} ->> {entity.label_}")

Bal Ram Reddy ->> PERSON
India ->> GPE
2002 ->> DATE


### Dependency Parsing

In [60]:
# Show each token's dependency label and the text of its head token.
# Relies on the `nlp` pipeline loaded in an earlier cell.
parsed = nlp("The cat sat on the mat.")

for tok in parsed:
    word, relation, head = tok.text, tok.dep_, tok.head.text
    print(f"Word: {word}, Dependency: {relation}, Head: {head}")

Word: The, Dependency: det, Head: cat
Word: cat, Dependency: nsubj, Head: sat
Word: sat, Dependency: ROOT, Head: sat
Word: on, Dependency: prep, Head: sat
Word: the, Dependency: det, Head: mat
Word: mat, Dependency: pobj, Head: on
Word: ., Dependency: punct, Head: sat
