In [6]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models.word2vec import Word2Vec as GensimWord2Vec

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker_tab')  # Updated chunker name
nltk.download('words')
nltk.download('tagsets')

# Sample text
text = "Natural Language Processing is a fascinating field. It enables machines to understand human language."


















[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [7]:
# Tokenization
tokens = word_tokenize(text)
sentences = sent_tokenize(text)
print("Tokens:", tokens)
print("Sentences:", sentences)

Tokens: ['Natural', 'Language', 'Processing', 'is', 'a', 'fascinating', 'field', '.', 'It', 'enables', 'machines', 'to', 'understand', 'human', 'language', '.']
Sentences: ['Natural Language Processing is a fascinating field.', 'It enables machines to understand human language.']


In [8]:
# Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens (no stopwords):", filtered_tokens)

Filtered Tokens (no stopwords): ['Natural', 'Language', 'Processing', 'fascinating', 'field', '.', 'enables', 'machines', 'understand', 'human', 'language', '.']


In [9]:

# Stopwords Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("Filtered Tokens (no stopwords):", filtered_tokens)


Filtered Tokens (no stopwords): ['Natural', 'Language', 'Processing', 'fascinating', 'field', '.', 'enables', 'machines', 'understand', 'human', 'language', '.']


In [10]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['natur', 'languag', 'process', 'fascin', 'field', '.', 'enabl', 'machin', 'understand', 'human', 'languag', '.']


In [11]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['Natural', 'Language', 'Processing', 'fascinating', 'field', '.', 'enables', 'machine', 'understand', 'human', 'language', '.']


# New SectionCommon POS Tags
Here are some common POS tags used by NLTK:

NN: Noun, singular
NNS: Noun, plural
NNP: Proper noun, singular
VB: Verb, base form
VBD: Verb, past tense
VBG: Verb, gerund/present participle
JJ: Adjective
RB: Adverb
IN: Preposition or subordinating conjunction
CC: Coordinating conjunction
PRP: Personal pronoun

In [21]:
# POS Tagging
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('fascinating', 'JJ'), ('field', 'NN'), ('.', '.'), ('It', 'PRP'), ('enables', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]


In [13]:
# Named Entity Recognition (NER)
ner = nltk.ne_chunk(pos_tags)
print("Named Entities:", ner)

Named Entities: (S
  Natural/JJ
  Language/NNP
  Processing/NNP
  is/VBZ
  a/DT
  fascinating/JJ
  field/NN
  ./.
  It/PRP
  enables/VBZ
  machines/NNS
  to/TO
  understand/VB
  human/JJ
  language/NN
  ./.)


In [14]:
# One-Hot Encoding
unique_words = set(filtered_tokens)
one_hot = {word: np.eye(len(unique_words))[i] for i, word in enumerate(unique_words)}
print("One-Hot Encodings:", one_hot)

One-Hot Encodings: {'Language': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'Natural': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), '.': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), 'machines': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), 'Processing': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), 'fascinating': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), 'field': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), 'understand': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]), 'language': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]), 'human': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]), 'enables': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])}


In [15]:
# Bag of Words (BoW)
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([text])
print("Bag of Words (BoW):\n", bow_matrix.toarray())
print("Feature Names:", vectorizer.get_feature_names_out())

Bag of Words (BoW):
 [[1 1 1 1 1 1 2 1 1 1 1 1]]
Feature Names: ['enables' 'fascinating' 'field' 'human' 'is' 'it' 'language' 'machines'
 'natural' 'processing' 'to' 'understand']


In [16]:
# n-grams
vectorizer_ngrams = CountVectorizer(ngram_range=(2, 2))  # Bigram example
ngrams_matrix = vectorizer_ngrams.fit_transform([text])
print("n-grams (Bigrams):\n", ngrams_matrix.toarray())
print("Feature Names (Bigrams):", vectorizer_ngrams.get_feature_names_out())

n-grams (Bigrams):
 [[1 1 1 1 1 1 1 1 1 1 1 1]]
Feature Names (Bigrams): ['enables machines' 'fascinating field' 'field it' 'human language'
 'is fascinating' 'it enables' 'language processing' 'machines to'
 'natural language' 'processing is' 'to understand' 'understand human']


In [17]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([text])
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())
print("TF-IDF Feature Names:", tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix:
 [[0.25819889 0.25819889 0.25819889 0.25819889 0.25819889 0.25819889
  0.51639778 0.25819889 0.25819889 0.25819889 0.25819889 0.25819889]]
TF-IDF Feature Names: ['enables' 'fascinating' 'field' 'human' 'is' 'it' 'language' 'machines'
 'natural' 'processing' 'to' 'understand']


In [18]:
# Word Embeddings using Word2Vec
# Tokenizing sentences for Word2Vec
sentence_tokens = [word_tokenize(sent) for sent in sentences]

In [19]:


# Word2Vec Model - CBOW
word2vec_model_cbow = GensimWord2Vec(sentence_tokens, vector_size=100, window=5, min_count=1, sg=0)  # CBOW
print("Word2Vec CBOW Embedding for 'language':", word2vec_model_cbow.wv['language'])


Word2Vec CBOW Embedding for 'language': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03

In [20]:


# Word2Vec Model - Skip-gram
word2vec_model_skipgram = GensimWord2Vec(sentence_tokens, vector_size=100, window=5, min_count=1, sg=1)  # Skip-gram
print("Word2Vec Skip-gram Embedding for 'language':", word2vec_model_skipgram.wv['language'])

Word2Vec Skip-gram Embedding for 'language': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.637997