**NLP** **Techniques**

In [None]:
import nltk

# **1. Tokenization**

In [None]:
# Fetch the Punkt tokenizer data that word_tokenize and sent_tokenize rely on.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
text = "hello openeyes software solution private limited"

# Run both tokenizers over the same string: word-level first, then
# sentence-level. The input has no sentence punctuation, so the sentence
# tokenizer returns it as a single item.
for tokenize in (word_tokenize, sent_tokenize):
    print(tokenize(text))

['hello', 'openeyes', 'software', 'solution', 'private', 'limited']
['hello openeyes software solution private limited']


**Converting to Lowercase**

In [None]:
text = "HELLO  ItiSha SAkhReliya"

# Only the letter case changes; the double space in the input is preserved.
lowercased_text = str.lower(text)
print(lowercased_text)

hello  itisha sakhreliya


**Stop Word Removal**

In [None]:
from nltk.corpus import stopwords
# Fetch the stop-word lists that stopwords.words() reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
sentence = "The quick brown fox jumps over the lazy dog."
# A set gives fast membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Walk the whitespace-separated tokens and keep those whose lowercase form is
# not a stop word. Splitting on whitespace leaves punctuation attached, which
# is why the final token is still "dog." in the output.
kept_tokens = []
for token in sentence.split():
    if token.lower() in stop_words:
        continue
    kept_tokens.append(token)
filtered_sentence = ' '.join(kept_tokens)

print(filtered_sentence)


quick brown fox jumps lazy dog.


# **2. Stemming**

In [None]:
from nltk.stem import PorterStemmer

In [None]:
# Stem several forms of the same root. As the output shows, a stem need not
# be a dictionary word (e.g. "programmer" -> "programm").
ps = PorterStemmer()
words = ["program", "programs", "programmer", "programming", "programmers"]

for term in words:
    print(f"{term}  :  {ps.stem(term)}")


program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm


In [None]:
sentence = "Programmers program with programming languages"

# Split the sentence into tokens, then stem each token in turn.
ps = PorterStemmer()
words = word_tokenize(sentence)

for term in words:
    print(f"{term}  :  {ps.stem(term)}")


Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag


# **3. Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus used by the WordNetLemmatizer cell below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:

lemmatizer = WordNetLemmatizer()

# (term, extra keyword arguments) pairs. Only "better" is given an explicit
# part of speech ("a"); the other two use lemmatize()'s default.
examples = (
    ("rocks", {}),
    ("corpora", {}),
    ("better", {"pos": "a"}),
)

for term, options in examples:
    print(f"{term} :", lemmatizer.lemmatize(term, **options))

rocks : rock
corpora : corpus
better : good


# **4. Part-of-Speech (POS) Tagging**

In [None]:
from nltk import pos_tag
# Fetch the English tagger model used by pos_tag in the next cell.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
# pos_tag works on a list of tokens, so each string is tokenized first.
text = "hello openeyes software solution private limited"
# `word` is assigned here rather than inherited from an earlier cell, so this
# cell also runs on a fresh kernel (Restart & Run All).
word = word_tokenize(text)
print("pos_tags:", pos_tag(word))

# Second example: a full sentence with punctuation, kept in text1/tokens.
text1 = "Preprocessing is crucial to clean and prepare the raw text data for analysis."
tokens = word_tokenize(text1)
print("pos_tags:", pos_tag(tokens))

pos_tags: [('hello', 'JJ'), ('openeyes', 'NNS'), ('software', 'NN'), ('solution', 'NN'), ('private', 'JJ'), ('limited', 'VBD')]
pos_tags: [('Preprocessing', 'VBG'), ('is', 'VBZ'), ('crucial', 'JJ'), ('to', 'TO'), ('clean', 'VB'), ('and', 'CC'), ('prepare', 'VB'), ('the', 'DT'), ('raw', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('for', 'IN'), ('analysis', 'NN'), ('.', '.')]


# **5. Named Entity Recognition (NER)**

In [None]:
from nltk import ne_chunk
# Fetch the resources that ne_chunk needs in the next cell: the named-entity
# chunker model and the `words` corpus.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')


[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Named-entity recognition is a three-stage pipeline:
# raw string -> tokens -> (token, POS tag) pairs -> chunk tree.
text = "Barack Obama was born in Hawaii."
tokens = word_tokenize(text)        # split the sentence into tokens
tagged = pos_tag(tokens)            # ne_chunk is fed POS-tagged tokens
named_entities = ne_chunk(tagged)   # groups tagged tokens into labelled chunks
# Printing the result shows a tree rooted at S; entity chunks carry labels
# such as PERSON and GPE.
print(named_entities)


(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.)


**Removing Special Characters and Numbers**

In [None]:
import re

text = "I have 2 apples @! and i have 5 oranges ,can you given 1 apple and 2 oranges?/,,"

# Delete every character that is neither an ASCII letter nor whitespace.
# Whitespace around the removed digits and symbols is left as-is, so the
# result contains double spaces where something was deleted.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
clean_text = non_letter_pattern.sub('', text)
print(clean_text)


I have  apples  and i have  oranges can you given  apple and  oranges


# **6. Vectorization (Converting Text to Numbers)**

**6.1 Bag of Words(BoW)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# corpus is list of documents
# Toy corpus: one string per document.
corpus = [
    'I love programming',
    'Python is amazing',
    'I love Python',
]

# Learn the vocabulary and build the count matrix in a single call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Convert to a dense array for printing: one row per document, one column
# per vocabulary term.
print(X.toarray())

[[0 0 1 1 0]
 [1 1 0 0 1]
 [0 0 1 0 1]]


In [None]:
# corpus is list of documents
# Second toy corpus: one string per document.
corpus = [
    'Cats are cute',
    'I love cats',
    'Cats are playful',
]

# A fresh vectorizer, so the vocabulary comes from this corpus only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Convert to a dense array for printing: one row per document, one column
# per vocabulary term.
print(X.toarray())

[[1 1 1 0 0]
 [0 1 0 1 0]
 [1 1 0 0 1]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

 **6.2 Term Frequency-Inverse Document Frequency (TF-IDF)**

In [None]:
# Same three documents as the first Bag-of-Words example, so the two
# matrices can be compared column by column.
corpus = [
    'I love programming',
    'Python is amazing',
    'I love Python',
]

# Learn the vocabulary and weights, then transform the corpus in one call.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Convert to a dense array for printing; entries are now fractional weights
# rather than integer counts.
print(X.toarray())

[[0.         0.         0.60534851 0.79596054 0.        ]
 [0.62276601 0.62276601 0.         0.         0.4736296 ]
 [0.         0.         0.70710678 0.         0.70710678]]


**6.3 Continuous bag of words (CBOW)**

In [None]:
!pip install gensim
from gensim.models import Word2Vec



In [None]:
# Tiny pre-tokenized corpus: one list of lowercase tokens per sentence.
sentences = [['i', 'love', 'programming'], ['python', 'is', 'amazing']]

# sg=0 selects the CBOW training algorithm named in this section's heading
# (sg=1 would be skip-gram). It is spelled out so the code matches the
# heading instead of relying on the default.
# min_count=1 keeps words that occur only once, which is every word here.
model = Word2Vec(sentences, min_count=1, sg=0)

# Look up the learned embedding for a single vocabulary word.
vector = model.wv['python']

print(vector)

[ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03
 -7.2423196e-03  9.43379

# **N-grams: Unigrams, Bigrams, Trigrams**

In [None]:
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Tokenize the sentence stored in text1 by the POS-tagging cell above.
tokens = word_tokenize(text1)

# Build the 1-, 2- and 3-gram lists from the same token sequence.
unigrams, bigrams, trigrams = (
    list(ngrams(tokens, size)) for size in (1, 2, 3)
)

# Print them from the smallest window to the largest.
for label, grams in (
    ("Unigrams:", unigrams),
    ("Bigrams:", bigrams),
    ("Trigrams:", trigrams),
):
    print(label, grams)


Unigrams: [('Preprocessing',), ('is',), ('crucial',), ('to',), ('clean',), ('and',), ('prepare',), ('the',), ('raw',), ('text',), ('data',), ('for',), ('analysis',), ('.',)]
Bigrams: [('Preprocessing', 'is'), ('is', 'crucial'), ('crucial', 'to'), ('to', 'clean'), ('clean', 'and'), ('and', 'prepare'), ('prepare', 'the'), ('the', 'raw'), ('raw', 'text'), ('text', 'data'), ('data', 'for'), ('for', 'analysis'), ('analysis', '.')]
Trigrams: [('Preprocessing', 'is', 'crucial'), ('is', 'crucial', 'to'), ('crucial', 'to', 'clean'), ('to', 'clean', 'and'), ('clean', 'and', 'prepare'), ('and', 'prepare', 'the'), ('prepare', 'the', 'raw'), ('the', 'raw', 'text'), ('raw', 'text', 'data'), ('text', 'data', 'for'), ('data', 'for', 'analysis'), ('for', 'analysis', '.')]
