In [1]:
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#lets start this with a small text data:

data= 'hello everybody!. Welcome to NLP tutorial. We are going to see the simple text preprocessing. I am going \
to use stemmer & lemmatization from nltk-library.'

In [3]:
sentences= nltk.sent_tokenize(data)

In [4]:
#clean the data:

corpus=[]

for i in range(len(sentences)):
    review= re.sub('[^a-zA-Z]',' ', sentences[i])
    review= review.lower()
    corpus.append(review)

In [5]:
corpus

['hello everybody  ',
 'welcome to nlp tutorial ',
 'we are going to see the simple text preprocessing ',
 'i am going to use stemmer   lemmatization from nltk library ']

In [6]:
#lets start bag of words:

cv= CountVectorizer()

vectors= cv.fit_transform(corpus)

In [8]:
# it gets the all features in the corpus by performing stopwords but not that accurate.
cv.get_feature_names()

['am',
 'are',
 'everybody',
 'from',
 'going',
 'hello',
 'lemmatization',
 'library',
 'nlp',
 'nltk',
 'preprocessing',
 'see',
 'simple',
 'stemmer',
 'text',
 'the',
 'to',
 'tutorial',
 'use',
 'we',
 'welcome']

In [9]:
#its randomly assigns the unique number to every word in the corpus
cv.vocabulary_

{'hello': 5,
 'everybody': 2,
 'welcome': 20,
 'to': 16,
 'nlp': 8,
 'tutorial': 17,
 'we': 19,
 'are': 1,
 'going': 4,
 'see': 11,
 'the': 15,
 'simple': 12,
 'text': 14,
 'preprocessing': 10,
 'am': 0,
 'use': 18,
 'stemmer': 13,
 'lemmatization': 6,
 'from': 3,
 'nltk': 9,
 'library': 7}

In [10]:
#lets see the vectors of all the words in corpus:

vectors.toarray()

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0],
       [1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]],
      dtype=int64)

Note:  The corpus having 4 sentences so we got 4 vectors with 22 elements

In [11]:
# we can check the shape fo the obtained vector:

vectors.toarray().shape

(4, 21)

Note: the elements in the vector or length of the vectors depends on the highest highest length sentence. And also we can pass the length of the sentence as parameter.

In [12]:
#lets do the BOW on larger data:

paragraph =  open('C:/Users/acreddy/Desktop/spacy_ds/nlp.txt').read()

In [27]:
paragraph

'I have three visions for India. In 3000 years of our history, people from all over \nthe world have come and invaded us, captured our lands, conquered our minds. \nFrom Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,\nthe French, the Dutch, all of them came and looted us, took over what was ours. \nYet we have not done this to any other nation. We have not conquered anyone. \nWe have not grabbed their land, their culture, \ntheir history and tried to enforce our way of life on them. \nWhy? Because we respect the freedom of others.That is why my \nfirst vision is that of freedom. I believe that India got its first vision of \nthis in 1857, when we started the War of Independence. It is this freedom that\nwe must protect and nurture and build on. If we are not free, no one will respect us.\nMy second vision for Indiaâ€™s development. For fifty years we have been a developing nation.\nIt is time we see ourselves as a developed nation. We are among the t

In [28]:
#lets breakdown in to sentences:

sents= nltk.sent_tokenize(paragraph)

In [29]:
sents

['I have three visions for India.',
 'In 3000 years of our history, people from all over \nthe world have come and invaded us, captured our lands, conquered our minds.',
 'From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British,\nthe French, the Dutch, all of them came and looted us, took over what was ours.',
 'Yet we have not done this to any other nation.',
 'We have not conquered anyone.',
 'We have not grabbed their land, their culture, \ntheir history and tried to enforce our way of life on them.',
 'Why?',
 'Because we respect the freedom of others.That is why my \nfirst vision is that of freedom.',
 'I believe that India got its first vision of \nthis in 1857, when we started the War of Independence.',
 'It is this freedom that\nwe must protect and nurture and build on.',
 'If we are not free, no one will respect us.',
 'My second vision for Indiaâ€™s development.',
 'For fifty years we have been a developing nation.',
 'It is time we see ourselve

In [30]:
#lets clean the data  a bit:

corpus1=[]

for i in range(len(sents)):
    review= re.sub('[^a-zA-Z0-9]', ' ', sents[i])
    review= review.lower()
    corpus1.append(review)

In [31]:
corpus1

['i have three visions for india ',
 'in 3000 years of our history  people from all over  the world have come and invaded us  captured our lands  conquered our minds ',
 'from alexander onwards  the greeks  the turks  the moguls  the portuguese  the british  the french  the dutch  all of them came and looted us  took over what was ours ',
 'yet we have not done this to any other nation ',
 'we have not conquered anyone ',
 'we have not grabbed their land  their culture   their history and tried to enforce our way of life on them ',
 'why ',
 'because we respect the freedom of others that is why my  first vision is that of freedom ',
 'i believe that india got its first vision of  this in 1857  when we started the war of independence ',
 'it is this freedom that we must protect and nurture and build on ',
 'if we are not free  no one will respect us ',
 'my second vision for india   s development ',
 'for fifty years we have been a developing nation ',
 'it is time we see ourselves as a

In [32]:
#lets apply BOW on the above corpus1:

cv= CountVectorizer()

count_vects= cv.fit_transform(corpus1)

In [33]:
cv.get_feature_names_out()

array(['10', '1857', '3000', 'achievements', 'alexander', 'all', 'also',
       'among', 'an', 'and', 'any', 'anyone', 'are', 'areas', 'as',
       'assured', 'be', 'because', 'been', 'being', 'believe', 'both',
       'brahm', 'british', 'build', 'but', 'came', 'captured', 'career',
       'closely', 'come', 'confidence', 'conquered', 'consider',
       'culture', 'dept', 'developed', 'developing', 'development',
       'dhawan', 'done', 'dr', 'dutch', 'economic', 'enforce', 'falling',
       'father', 'fifty', 'first', 'for', 'fortune', 'four', 'free',
       'freedom', 'french', 'from', 'gdp', 'globally', 'go', 'good',
       'got', 'grabbed', 'great', 'greeks', 'growth', 'hand', 'have',
       'him', 'history', 'if', 'in', 'incorrect', 'independence', 'india',
       'invaded', 'is', 'isn', 'it', 'its', 'lack', 'land', 'lands',
       'levels', 'life', 'looted', 'lucky', 'material', 'milestones',
       'military', 'minds', 'moguls', 'most', 'must', 'my', 'nation',
       'nations'

In [34]:
cv.vocabulary_

{'have': 66,
 'three': 144,
 'visions': 157,
 'for': 49,
 'india': 73,
 'in': 70,
 '3000': 2,
 'years': 170,
 'of': 100,
 'our': 108,
 'history': 68,
 'people': 112,
 'from': 55,
 'all': 5,
 'over': 111,
 'the': 139,
 'world': 169,
 'come': 30,
 'and': 9,
 'invaded': 74,
 'us': 154,
 'captured': 27,
 'lands': 81,
 'conquered': 32,
 'minds': 89,
 'alexander': 4,
 'onwards': 104,
 'greeks': 63,
 'turks': 151,
 'moguls': 90,
 'portuguese': 114,
 'british': 23,
 'french': 54,
 'dutch': 42,
 'them': 141,
 'came': 26,
 'looted': 84,
 'took': 148,
 'what': 162,
 'was': 159,
 'ours': 109,
 'yet': 171,
 'we': 161,
 'not': 97,
 'done': 40,
 'this': 143,
 'to': 146,
 'any': 10,
 'other': 106,
 'nation': 94,
 'anyone': 11,
 'grabbed': 61,
 'their': 140,
 'land': 80,
 'culture': 34,
 'tried': 150,
 'enforce': 44,
 'way': 160,
 'life': 83,
 'on': 101,
 'why': 165,
 'because': 17,
 'respect': 123,
 'freedom': 53,
 'others': 107,
 'that': 138,
 'is': 75,
 'my': 93,
 'first': 48,
 'vision': 156,
 'beli

In [44]:
#lets print the vectors:

count_vects.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [46]:
count_vects.toarray().shape

(31, 172)

Note: 1. We can apply Ngram: which gives the window size to combine words with their next words.                         
      2. And also we can apply binary=True:  CountVectorizer gives the frequency of the features if i give 
            binary=ture then it will print only 1 and 0 depsite if a single word repeates many times

In [38]:
#lets apply ngram and binary=True

cv= CountVectorizer(binary=True, ngram_range= (2,3)) # here applying bigram to Trigram

cnt_vectors= cv.fit_transform(corpus1)

In [39]:
cv.get_feature_names_out()

array(['10 percent', '10 percent growth', '1857 when', '1857 when we',
       '3000 years', '3000 years of', 'achievements are',
       'achievements are being', 'alexander onwards',
       'alexander onwards the', 'all of', 'all of them', 'all over',
       'all over the', 'all three', 'all three of', 'also as',
       'also as an', 'among the', 'among the top', 'an economic',
       'an economic power', 'and build', 'and build on', 'and consider',
       'and consider this', 'and dr', 'and dr brahm', 'and invaded',
       'and invaded us', 'and looted', 'and looted us', 'and nurture',
       'and nurture and', 'and self', 'and self assured', 'and tried',
       'and tried to', 'any other', 'any other nation', 'are among',
       'are among the', 'are being', 'are being globally', 'are falling',
       'are not', 'are not free', 'as an', 'as an economic',
       'as developed', 'as developed nation', 'as military',
       'as military power', 'be strong', 'be strong not',
       'beca

In [40]:
cv.vocabulary_

{'have three': 166,
 'three visions': 442,
 'visions for': 480,
 'for india': 117,
 'have three visions': 167,
 'three visions for': 443,
 'visions for india': 481,
 'in 3000': 180,
 '3000 years': 4,
 'years of': 532,
 'of our': 280,
 'our history': 308,
 'history people': 174,
 'people from': 323,
 'from all': 133,
 'all over': 12,
 'over the': 319,
 'the world': 414,
 'world have': 526,
 'have come': 158,
 'come and': 81,
 'and invaded': 28,
 'invaded us': 196,
 'us captured': 468,
 'captured our': 77,
 'our lands': 310,
 'lands conquered': 217,
 'conquered our': 86,
 'our minds': 312,
 'in 3000 years': 181,
 '3000 years of': 5,
 'years of our': 533,
 'of our history': 281,
 'our history people': 309,
 'history people from': 175,
 'people from all': 324,
 'from all over': 134,
 'all over the': 13,
 'over the world': 320,
 'the world have': 415,
 'world have come': 527,
 'have come and': 159,
 'come and invaded': 82,
 'and invaded us': 29,
 'invaded us captured': 197,
 'us captured ou

In [41]:
#lets print the vectors:

cnt_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
#lets check the size:

cnt_vectors.toarray().shape

(31, 539)

Note: 31 sentences and each sentence got 539 vocab_size