<a href="https://colab.research.google.com/github/AdityaKumbhar21/Natural_Language_Processing/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Implementing text preprocessing

In [60]:
!pip install nltk



In [61]:
# Sample text that the tokenization, cleaning and vectorization cells below work on.
# The literal starts with a newline, so the first tokenized sentence begins with '\n'.
paragraph = """
Elon Reeve Musk FRS (/ˈiːlɒn/ EE-lon; born June 28, 1971) is a businessman. He is known for his leadership of Tesla, SpaceX, X (formerly Twitter), and the Department of Government Efficiency (DOGE). Musk has been considered the wealthiest person in the world since 2021; as of May 2025, Forbes estimates his net worth to be US$424.7 billion.

Born to a wealthy family in Pretoria, South Africa, Musk emigrated in 1989 to Canada. He received bachelor's degrees from the University of Pennsylvania in 1997 before moving to California, United States, to pursue business ventures. In 1995, Musk co-founded the software company Zip2. Following its sale in 1999, he co-founded X.com, an online payment company that later merged to form PayPal, which was acquired by eBay in 2002. That year, Musk also became an American citizen.
"""

In [62]:
import re

import nltk
from nltk.corpus import stopwords  # stopword lists
from nltk.stem import PorterStemmer  # stemming
from nltk.stem import WordNetLemmatizer  # lemmatization
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [63]:
# Download the 'punkt_tab' NLTK package before tokenizing
# (presumably the data the sentence tokenizer loads; confirm against the NLTK docs).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [64]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
# Uses the `sent_tokenize` name imported at the top of the notebook.
sentences = sent_tokenize(paragraph)
sentences

['\nElon Reeve Musk FRS (/ˈiːlɒn/ EE-lon; born June 28, 1971) is a businessman.',
 'He is known for his leadership of Tesla, SpaceX, X (formerly Twitter), and the Department of Government Efficiency (DOGE).',
 'Musk has been considered the wealthiest person in the world since 2021; as of May 2025, Forbes estimates his net worth to be US$424.7 billion.',
 'Born to a wealthy family in Pretoria, South Africa, Musk emigrated in 1989 to Canada.',
 "He received bachelor's degrees from the University of Pennsylvania in 1997 before moving to California, United States, to pursue business ventures.",
 'In 1995, Musk co-founded the software company Zip2.',
 'Following its sale in 1999, he co-founded X.com, an online payment company that later merged to form PayPal, which was acquired by eBay in 2002.',
 'That year, Musk also became an American citizen.']

In [65]:
# stemming: reduce a word to its stem with NLTK's Porter stemmer
stemmer = PorterStemmer()
stemmer.stem('thinking')  # recorded output: 'think'

'think'

In [66]:
# lemmatization: download the 'wordnet' package, then build the WordNet lemmatizer.
# `lemmatizer` is reused later when the final corpus is built.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('facial')  # recorded output: 'facial' (returned unchanged)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'facial'

In [67]:
# Cleaning the text: replace every character that is not an ASCII letter
# (digits, punctuation, whitespace such as '\n', and non-ASCII symbols) with a
# single space, then lower-case the sentence. Runs of removed characters are
# NOT collapsed, so the cleaned strings can contain consecutive spaces.
# `re` is imported in the notebook's import cell.
corpus = [re.sub('[^a-zA-Z]', ' ', sentence).lower() for sentence in sentences]

In [68]:
# Display the cleaned, lower-cased sentences.
corpus

[' elon reeve musk frs    i l n  ee lon  born june           is a businessman ',
 'he is known for his leadership of tesla  spacex  x  formerly twitter   and the department of government efficiency  doge  ',
 'musk has been considered the wealthiest person in the world since       as of may       forbes estimates his net worth to be us       billion ',
 'born to a wealthy family in pretoria  south africa  musk emigrated in      to canada ',
 'he received bachelor s degrees from the university of pennsylvania in      before moving to california  united states  to pursue business ventures ',
 'in       musk co founded the software company zip  ',
 'following its sale in       he co founded x com  an online payment company that later merged to form paypal  which was acquired by ebay in      ',
 'that year  musk also became an american citizen ']

In [69]:
# Download the 'stopwords' package, then display the English stopword list.
nltk.download('stopwords')
stopwords.words('english') # these are all the stopwords for English

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [70]:
# Apply cleaning, word tokenization, stopword removal and lemmatization to
# every sentence, producing one space-joined string per sentence in `corpus_f`.

# Build the stopword set once, outside the loop: calling
# `stopwords.words('english')` and wrapping it in `set(...)` for every single
# word repeats the same work needlessly.
english_stopwords = set(stopwords.words('english'))

corpus_f = []
for sentence in sentences:
    # Keep ASCII letters only, lower-case, and split on whitespace into words.
    words = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stopwords first, then lemmatize what remains.
    # NOTE: the recorded output shows short tokens such as 'frs' and 'us'
    # coming out of the lemmatizer as 'fr' and 'u'.
    kept = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
    corpus_f.append(' '.join(kept))


In [71]:
# Display the final preprocessed sentences (stopwords removed, words lemmatized).
corpus_f

['elon reeve musk fr l n ee lon born june businessman',
 'known leadership tesla spacex x formerly twitter department government efficiency doge',
 'musk considered wealthiest person world since may forbes estimate net worth u billion',
 'born wealthy family pretoria south africa musk emigrated canada',
 'received bachelor degree university pennsylvania moving california united state pursue business venture',
 'musk co founded software company zip',
 'following sale co founded x com online payment company later merged form paypal acquired ebay',
 'year musk also became american citizen']

In [72]:
# Bag of words (unigrams).
# binary=True stores 1 for every term that occurs in a sentence instead of its
# raw count, so each row is a presence/absence vector.
# `CountVectorizer` is imported in the notebook's import cell.
cv = CountVectorizer(binary=True)

In [73]:
# Learn the unigram vocabulary from corpus_f and encode each sentence as one row of X.
# NOTE: `X` is reassigned by the n-gram and TF-IDF cells further down; re-run this
# cell before inspecting the unigram matrix.
X = cv.fit_transform(corpus_f)

In [74]:
# Learned vocabulary: maps each term to its column index in the matrix.
cv.vocabulary_

{'elon': 23,
 'reeve': 52,
 'musk': 42,
 'fr': 32,
 'ee': 21,
 'lon': 38,
 'born': 7,
 'june': 34,
 'businessman': 9,
 'known': 35,
 'leadership': 37,
 'tesla': 59,
 'spacex': 57,
 'formerly': 30,
 'twitter': 60,
 'department': 18,
 'government': 33,
 'efficiency': 22,
 'doge': 19,
 'considered': 16,
 'wealthiest': 64,
 'person': 48,
 'world': 66,
 'since': 54,
 'may': 39,
 'forbes': 28,
 'estimate': 25,
 'net': 43,
 'worth': 67,
 'billion': 6,
 'wealthy': 65,
 'family': 26,
 'pretoria': 49,
 'south': 56,
 'africa': 1,
 'emigrated': 24,
 'canada': 11,
 'received': 51,
 'bachelor': 4,
 'degree': 17,
 'university': 62,
 'pennsylvania': 47,
 'moving': 41,
 'california': 10,
 'united': 61,
 'state': 58,
 'pursue': 50,
 'business': 8,
 'venture': 63,
 'co': 13,
 'founded': 31,
 'software': 55,
 'company': 15,
 'zip': 69,
 'following': 27,
 'sale': 53,
 'com': 14,
 'online': 44,
 'payment': 45,
 'later': 36,
 'merged': 40,
 'form': 29,
 'paypal': 46,
 'acquired': 0,
 'ebay': 20,
 'year': 68,

In [75]:
# Dense view of the first sentence's row (reflects whichever vectorizer last assigned `X`).
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]])

In [76]:
# N-grams: binary bag of words built from bigrams and trigrams.
# ngram_range=(2, 3) means n-grams of length 2 up to 3; unigrams are excluded.
# `CountVectorizer` is already imported earlier in the notebook.
cv_n = CountVectorizer(binary=True, ngram_range=(2, 3))

In [77]:
# Learn the bigram/trigram vocabulary from corpus_f; this overwrites the unigram matrix in `X`.
X = cv_n.fit_transform(corpus_f)

In [78]:
# Learned vocabulary: maps each bigram/trigram to its column index.
cv_n.vocabulary_

{'elon reeve': 34,
 'reeve musk': 100,
 'musk fr': 80,
 'fr ee': 53,
 'ee lon': 31,
 'lon born': 64,
 'born june': 10,
 'june businessman': 57,
 'elon reeve musk': 35,
 'reeve musk fr': 101,
 'musk fr ee': 81,
 'fr ee lon': 54,
 'ee lon born': 32,
 'lon born june': 65,
 'born june businessman': 11,
 'known leadership': 58,
 'leadership tesla': 62,
 'tesla spacex': 114,
 'spacex formerly': 110,
 'formerly twitter': 47,
 'twitter department': 116,
 'department government': 29,
 'government efficiency': 55,
 'efficiency doge': 33,
 'known leadership tesla': 59,
 'leadership tesla spacex': 63,
 'tesla spacex formerly': 115,
 'spacex formerly twitter': 111,
 'formerly twitter department': 48,
 'twitter department government': 117,
 'department government efficiency': 30,
 'government efficiency doge': 56,
 'musk considered': 76,
 'considered wealthiest': 25,
 'wealthiest person': 122,
 'person world': 92,
 'world since': 126,
 'since may': 104,
 'may forbes': 66,
 'forbes estimate': 43,
 'e

In [79]:
# Dense view of the first sentence's row (reflects whichever vectorizer last assigned `X`).
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [86]:
# TF-IDF over bigrams and trigrams.
# Optionally pass max_features=N to keep only the N terms with the highest
# term frequency across the corpus (e.g. max_features=2).
# `TfidfVectorizer` is imported in the notebook's import cell.
tfidf = TfidfVectorizer(ngram_range=(2, 3))

In [87]:
# Fit TF-IDF on corpus_f; this overwrites the n-gram count matrix previously held in `X`.
X = tfidf.fit_transform(corpus_f)

In [88]:
# Learned vocabulary: maps each bigram/trigram to its column index.
tfidf.vocabulary_

{'elon reeve': 34,
 'reeve musk': 100,
 'musk fr': 80,
 'fr ee': 53,
 'ee lon': 31,
 'lon born': 64,
 'born june': 10,
 'june businessman': 57,
 'elon reeve musk': 35,
 'reeve musk fr': 101,
 'musk fr ee': 81,
 'fr ee lon': 54,
 'ee lon born': 32,
 'lon born june': 65,
 'born june businessman': 11,
 'known leadership': 58,
 'leadership tesla': 62,
 'tesla spacex': 114,
 'spacex formerly': 110,
 'formerly twitter': 47,
 'twitter department': 116,
 'department government': 29,
 'government efficiency': 55,
 'efficiency doge': 33,
 'known leadership tesla': 59,
 'leadership tesla spacex': 63,
 'tesla spacex formerly': 115,
 'spacex formerly twitter': 111,
 'formerly twitter department': 48,
 'twitter department government': 117,
 'department government efficiency': 30,
 'government efficiency doge': 56,
 'musk considered': 76,
 'considered wealthiest': 25,
 'wealthiest person': 122,
 'person world': 92,
 'world since': 126,
 'since may': 104,
 'may forbes': 66,
 'forbes estimate': 43,
 'e

In [89]:
# Dense view of the TF-IDF matrix: one row per sentence, float weights per n-gram.
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.21821789, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20120124, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.33333333,
        0.33333333]])