# NLTK

In [None]:
import nltk

In [None]:
# Fetch the 'all' collection of NLTK data packages into the local nltk_data directory.
# NOTE(review): this pulls every package and is presumably a large download; the cells
# visible here seem to need only tokenizer, stopword, WordNet and POS-tagger data --
# confirm what the rest of the notebook uses before narrowing it.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

## Tokenization
Tokenization splits text into smaller units. It can be done sentence-wise, word-wise, or letter-wise (character by character).


### Sentence Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Sample paragraph made of three sentences (two end in '.', one in '?').
# The typos ("THen", "wend") are part of the demo input and appear in the recorded output.
text = "The cat sat on the mat. THen after it wend somewhere. Do you know where it is?"

In [None]:
# Split the paragraph into a list with one string per sentence.
sentences = sent_tokenize(text)

In [None]:
# Last expression of the cell: echoes the list of sentences as the cell output.
sentences

['The cat sat on the mat.',
 'THen after it wend somewhere.',
 'Do you know where it is?']

### Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Single short sentence for the word-level demo (rebinds `text` from the sentence example).
text = "The cat sat on the mat."

In [None]:
# Split the sentence into word and punctuation tokens; the trailing '.' becomes its own token.
words = word_tokenize(text)

In [None]:
# Last expression of the cell: echoes the token list as the cell output.
words

['The', 'cat', 'sat', 'on', 'the', 'mat', '.']

### Regex Tokenization

In [None]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer driven by a regular expression: r'\w+' matches runs of word characters,
# so punctuation and parentheses are dropped rather than emitted as tokens.
tokenizer = RegexpTokenizer(r'\w+')
# In the recorded output "(NLP)" yields just 'NLP' and the final '!' disappears.
words = tokenizer.tokenize("Natural Language Processing (NLP) is fun!")

print(words)


['Natural', 'Language', 'Processing', 'NLP', 'is', 'fun']


## Stopwords Removal

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Demo sentence that mixes stopwords with content words
text = "This is an example showing stopword removal in Natural Language Processing."

# Word-level tokens (punctuation included)
words = word_tokenize(text)

# English stopword list, held as a set for quick membership tests
stop_words = set(stopwords.words('english'))

# Keep every token whose lowercase form is not a stopword;
# the original casing of the surviving tokens is preserved
filtered_words = [
    token
    for token in words
    if token.lower() not in stop_words
]

In [None]:
# Show the token list before and after stopword filtering
print(f"Original Words: {words}")
print(f"Filtered Words (without stopwords): {filtered_words}")

Original Words: ['This', 'is', 'an', 'example', 'showing', 'stopword', 'removal', 'in', 'Natural', 'Language', 'Processing', '.']
Filtered Words (without stopwords): ['example', 'showing', 'stopword', 'removal', 'Natural', 'Language', 'Processing', '.']


### Adding Custom Stopwords

In [None]:
# Build a new, extended stopword set; `stop_words` itself is left unchanged
custom_stopwords = stop_words | {"example", "processing"}

# Same case-insensitive filter as before, now against the extended set
filtered_words_custom = [
    token
    for token in words
    if token.lower() not in custom_stopwords
]
print(filtered_words_custom)

['showing', 'stopword', 'removal', 'Natural', 'Language', '.']


### Removing Words from the Stopword List

In [None]:
# Keep "not" as a regular token (useful for sentiment analysis) by taking it out of
# the stopword set. discard() is used instead of remove() so that re-running this
# cell is safe: remove() raises KeyError once "not" has already been dropped.
stop_words.discard('not')

## Stemming

### Porter Stemmer

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# One stemmer instance is reused for every word
ps = PorterStemmer()

# Sample vocabulary ("better" appears twice in the list)
words = ["running", "flies", "fairly", "easily", "cats", "better", "jumping", "caring", "computing", "better"]

# Apply the Porter stemmer to each word, keeping the original order
stemmed_words = list(map(ps.stem, words))

print(stemmed_words)

['run', 'fli', 'fairli', 'easili', 'cat', 'better', 'jump', 'care', 'comput', 'better']


### Lancaster Stemmer (More Aggressive)

In [None]:
from nltk.stem import LancasterStemmer

ls = LancasterStemmer()

# Stem each sample word and print one stem per line. Recorded results:
#   running -> run, happiness -> happy, computing -> comput, better -> bet
print("\n".join(ls.stem(term) for term in ("running", "happiness", "computing", "better")))

run
happy
comput
bet


## Lemmatization

### WordNet Lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Lemmatizer backed by WordNet
lemmatizer = WordNetLemmatizer()

words = ["running", "flies", "better", "cats", "mice", "jumping"]

# No POS is passed here, so the lemmatizer's default is used; in the recorded
# output the verb forms "running" and "jumping" come back unchanged
lemmatized_words = list(map(lemmatizer.lemmatize, words))

print(lemmatized_words)

['running', 'fly', 'better', 'cat', 'mouse', 'jumping']


### Lemmatization with POS Tags

In [None]:
lemmatizer = WordNetLemmatizer()

# (word, WordNet POS) pairs, printed one lemma per line.
# Recorded results: verb -> run, adjective -> good, noun -> fly, verb -> fly
print(
    *(
        lemmatizer.lemmatize(term, pos=tag)
        for term, tag in (
            ("running", "v"),
            ("better", "a"),
            ("flies", "n"),
            ("flies", "v"),
        )
    ),
    sep="\n",
)

run
good
fly
fly


## POS Tagging

In [None]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [None]:
# Tokenize first: pos_tag is given the list of tokens, not the raw string.
sentence = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(sentence)

# Apply POS tagging
# The result is a list of (token, tag) pairs. In the recorded output "brown" is
# tagged NN rather than JJ, so the tags should not be assumed to be always right.
pos_tags = pos_tag(words)

print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


### Automating POS Tagging for Better Lemmatization

In [None]:
from nltk.corpus import wordnet
from nltk import pos_tag

# Map a single word to the WordNet POS constant expected by the lemmatizer
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for ``word``.

    The word is tagged on its own (``pos_tag`` receives a one-token list), so no
    sentence context is taken into account. Only the first letter of the
    resulting tag is inspected:

    * ``J`` -> ``wordnet.ADJ``
    * ``N`` -> ``wordnet.NOUN``
    * ``V`` -> ``wordnet.VERB``
    * ``R`` -> ``wordnet.ADV``

    Any other tag falls back to ``wordnet.NOUN``.
    """
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun
    return wordnet.NOUN

# Demo input containing a comparative, a plural noun and two verb forms
sentence = "The better quick brown foxes were running swiftly."
words = word_tokenize(sentence)

# Lemmatize each token with the POS that get_wordnet_pos guesses for it,
# then stitch the lemmas back together with single spaces
lemmatized_sentence = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in words
]
print(" ".join(lemmatized_sentence))


The well quick brown fox be run swiftly .
