#Stemming

In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Porter Stemmer - use to remove suffixes from English words and obtain their stems.

stop words - the uninformative words that don't add substance.

In [2]:
paragraph = """
I have three visions for India. In 3000 years of our history, people from all over
               the world have come and invaded us, captured our lands, conquered our minds.
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               the French, the Dutch, all of them came and looted us, took over what was ours.
               Yet we have not done this to any other nation. We have not conquered anyone.
               We have not grabbed their land, their culture,
               their history and tried to enforce our way of life on them.
               """

In [7]:
sentence = nltk.sent_tokenize(paragraph)

In [8]:
sentence

['\nI have three visions for India.',
 'In 3000 years of our history, people from all over \n               the world have come and invaded us, captured our lands, conquered our minds.',
 'From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,\n               the French, the Dutch, all of them came and looted us, took over what was ours.',
 'Yet we have not done this to any other nation.',
 'We have not conquered anyone.',
 'We have not grabbed their land, their culture, \n               their history and tried to enforce our way of life on them.']

In [10]:
stemmer = PorterStemmer()

In [14]:
nltk.download('stopwords')

for i in range(len(sentence)):
  words=nltk.word_tokenize(sentence[i])
  words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
  sentence[i]=''.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#Lemmatization

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

Lemmatization is a text pre-processing technique used in natural language processing (NLP) models to break a word down to its root meaning to identify similarities.

In [16]:
paragraph = """I have to thank
               everyone from the very onset of my career … To my parents;
               none of this would be possible without you. And to my
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating.

"""

In [17]:
sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()

In [19]:
nltk.download('wordnet')

for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words)

[nltk_data] Downloading package wordnet to /root/nltk_data...


#BagOfWords

In [20]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] bcp47............... BCP-47 Language Tags
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)


True

In [21]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [22]:
ps = PorterStemmer()
wordnet=WordNetLemmatizer()

In [23]:
sentence=nltk.sent_tokenize(paragraph)
corpus=[]
for i in range(len(sentence)):
  review = re.sub('[^a-zA-Z]', '',sentence[i])
  review=review.lower()
  review=review.split()
  review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
  review=''.join(review)
  corpus.append(review)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

#Word2Vec

In [26]:
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re

In [27]:
paragraph="""I have three visions for India. In 3000 years of our history, people from all over
               the world have come and invaded us, captured our lands, conquered our minds.
               From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,
               the French, the Dutch, all of them came and looted us, took over what was ours.
               Yet we have not done this to any other nation. We have not conquered anyone.
               We have not grabbed their land, their culture,
               their history and tried to enforce our way of life on them.
               Why? Because we respect the freedom of others.That is why my
               first vision is that of freedom. I believe that India got its first vision of
               this in 1857, when we started the War of Independence. It is this freedom that
               we must protect and nurture and build on. If we are not free, no one will respect us."""

In [28]:
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text)
text = text.lower()
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)

In [32]:
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec

# Assuming 'text' is defined somewhere in your code
sentences = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# Removing stopwords
stop_words = set(stopwords.words('english'))
for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stop_words]

# Training the Word2Vec model
model = Word2Vec(sentences, min_count=1)

# Accessing the vocabulary (updated method)
word_list = model.wv.index_to_key

# Finding Word Vectors
vector = model.wv['war']

# Most similar words
similar = model.wv.most_similar('alexander')

print("Vocabulary:", word_list)
print("Vector for 'war':", vector)
print("Words most similar to 'vikram':", similar)


Vocabulary: [',', '.', 'us', 'freedom', 'respect', 'india', 'history', 'conquered', 'vision', 'first', 'greeks', 'alexander', 'onwards', 'moguls', 'turks', 'portuguese', 'british', 'french', 'minds', 'one', 'lands', 'captured', 'came', 'invaded', 'come', 'world', 'people', 'years', 'visions', 'dutch', 'took', 'looted', '?', 'build', 'nurture', 'protect', 'must', 'independence', 'war', 'started', 'got', 'believe', 'others.that', 'life', 'free', 'way', 'enforce', 'tried', 'culture', 'land', 'grabbed', 'anyone', 'nation', 'done', 'yet', 'three']
Vector for 'war': [-0.00778416 -0.00674505 -0.00315848  0.00660989 -0.00083035  0.0088692
 -0.00229334 -0.0052808   0.00392424  0.00220975 -0.0002728  -0.00245031
 -0.00600819  0.00404109 -0.0076133  -0.00892653 -0.00915215  0.00838399
  0.0040059   0.0070914   0.00990442 -0.00723492  0.00405298  0.00282833
  0.00614737 -0.00339078  0.00932741 -0.00577961  0.00702469  0.00694575
 -0.00037145  0.00504391  0.00665779  0.00152779  0.00737634  0.00240