In [1]:
!pip show spacy

Name: spacy
Version: 3.7.4
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: D:\Anaconda\Lib\site-packages
Requires: catalogue, cymem, jinja2, langcodes, murmurhash, numpy, packaging, preshed, pydantic, requests, setuptools, smart-open, spacy-legacy, spacy-loggers, srsly, thinc, tqdm, typer, wasabi, weasel
Required-by: 


In [2]:
!pip show nltk

Name: nltk
Version: 3.8.1
Summary: Natural Language Toolkit
Home-page: https://www.nltk.org/
Author: NLTK Team
Author-email: nltk.team@gmail.com
License: Apache License, Version 2.0
Location: D:\Anaconda\Lib\site-packages
Requires: click, joblib, regex, tqdm
Required-by: 


In [3]:
import nltk
from nltk.stem import PorterStemmer ## For doing Stemming
from nltk.corpus import stopwords

In [4]:
paragraph = "Virat Kohli was born on November 5, 1988, in Delhi, India. He grew up in Delhi and was one of the first to train at the West Delhi Cricket Academy, created in 1998. In 2002 he played for the Delhi Under-15 team and was the highest run scorer in the 2003–04 Vijay Merchant Trophy, playing for the Delhi Under-17 team."

In [5]:
paragraph

'Virat Kohli was born on November 5, 1988, in Delhi, India. He grew up in Delhi and was one of the first to train at the West Delhi Cricket Academy, created in 1998. In 2002 he played for the Delhi Under-15 team and was the highest run scorer in the 2003–04 Vijay Merchant Trophy, playing for the Delhi Under-17 team.'

In [6]:
## tokenization -- paragraph-->sentences-->words
nltk.download('punkt')
sentences = nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
print(sentences)

['Virat Kohli was born on November 5, 1988, in Delhi, India.', 'He grew up in Delhi and was one of the first to train at the West Delhi Cricket Academy, created in 1998.', 'In 2002 he played for the Delhi Under-15 team and was the highest run scorer in the 2003–04 Vijay Merchant Trophy, playing for the Delhi Under-17 team.']


In [8]:
type(sentences)

list

In [9]:
stemmer = PorterStemmer()
stemmer.stem('drinking')

'drink'

In [10]:
stemmer.stem('history')

'histori'

In [11]:
## For Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') ## This needs to be downloaded.

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
lemmatizer.lemmatize('history')

'history'

In [14]:
## Clean the special characters in the sentences
import re
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z^]',' ',sentences[i])
    review = review.lower() # to lowercase
    corpus.append(review)

In [15]:
corpus

['virat kohli was born on november          in delhi  india ',
 'he grew up in delhi and was one of the first to train at the west delhi cricket academy  created in      ',
 'in      he played for the delhi under    team and was the highest run scorer in the         vijay merchant trophy  playing for the delhi under    team ']

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
## Stemming
for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(stemmer.stem(word))      

virat
kohli
born
novemb
delhi
india
grew
delhi
one
first
train
west
delhi
cricket
academi
creat
play
delhi
team
highest
run
scorer
vijay
merchant
trophi
play
delhi
team


In [19]:
## Lemmatization
for i in corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(lemmatizer.lemmatize(word))    

virat
kohli
born
november
delhi
india
grew
delhi
one
first
train
west
delhi
cricket
academy
created
played
delhi
team
highest
run
scorer
vijay
merchant
trophy
playing
delhi
team


In [20]:
# Apply stopwords, lemmatize.

In [40]:
import re
corpus = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]',' ',sentences[i])
    review = review.lowercase()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

AttributeError: 'str' object has no attribute 'lowercase'

### Bag of Words

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)

In [25]:
X = cv.fit_transform(corpus)

In [26]:
cv.vocabulary_

{'virat': 32,
 'kohli': 14,
 'was': 33,
 'born': 3,
 'on': 18,
 'november': 16,
 'in': 12,
 'delhi': 6,
 'india': 13,
 'he': 10,
 'grew': 9,
 'up': 30,
 'and': 1,
 'one': 19,
 'of': 17,
 'the': 25,
 'first': 7,
 'to': 26,
 'train': 27,
 'at': 2,
 'west': 34,
 'cricket': 5,
 'academy': 0,
 'created': 4,
 'played': 20,
 'for': 8,
 'under': 29,
 'team': 24,
 'highest': 11,
 'run': 22,
 'scorer': 23,
 'vijay': 31,
 'merchant': 15,
 'trophy': 28,
 'playing': 21}

In [27]:
# Demonstrating n-grams

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True,ngram_range=(2,3))

In [30]:
X = cv.fit_transform(corpus)

In [31]:
cv.vocabulary_

{'virat kohli': 82,
 'kohli was': 38,
 'was born': 84,
 'born on': 7,
 'on november': 46,
 'november in': 42,
 'in delhi': 31,
 'delhi india': 16,
 'virat kohli was': 83,
 'kohli was born': 39,
 'was born on': 85,
 'born on november': 8,
 'on november in': 47,
 'november in delhi': 43,
 'in delhi india': 33,
 'he grew': 25,
 'grew up': 23,
 'up in': 78,
 'delhi and': 12,
 'and was': 2,
 'was one': 86,
 'one of': 48,
 'of the': 44,
 'the first': 62,
 'first to': 19,
 'to train': 70,
 'train at': 72,
 'at the': 5,
 'the west': 68,
 'west delhi': 90,
 'delhi cricket': 14,
 'cricket academy': 10,
 'academy created': 0,
 'created in': 9,
 'he grew up': 26,
 'grew up in': 24,
 'up in delhi': 79,
 'in delhi and': 32,
 'delhi and was': 13,
 'and was one': 3,
 'was one of': 87,
 'one of the': 49,
 'of the first': 45,
 'the first to': 63,
 'first to train': 20,
 'to train at': 71,
 'train at the': 73,
 'at the west': 6,
 'the west delhi': 69,
 'west delhi cricket': 91,
 'delhi cricket academy': 

In [32]:
corpus[0]

'virat kohli was born on november          in delhi  india '

In [33]:
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0]], dtype=int64)

## TF-IDF

In [34]:
## TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(ngram_range=(2, 3))
X = cv.fit_transform(corpus)

In [35]:
corpus[0]

'virat kohli was born on november          in delhi  india '

In [36]:
X[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.26190578, 0.26190578, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.26190578, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.19918609, 0.        , 0.26190578, 0.        ,
        0.        , 0.        , 0.        , 0.26190578, 0.26190578,
        0.        , 0.        , 0.26190578, 0.26190578, 0.        ,
        0.        , 0.26190578, 0.26190578, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  