In [1]:
from io import StringIO

# Toy dataset: twenty short English sentences, one per line.
text = '''The cat sat on the mat.
The dog barked at the stranger.
The bird is singing in the tree.
The sun is shining brightly.
The cat and dog are playing together.
I love reading books on artificial intelligence.
The weather is cold and rainy today.
My laptop battery died while working.
The football team won the championship.
The chef is preparing a delicious meal.
She enjoys hiking in the mountains.
The train arrived at the station on time.
Scientists are researching new medical treatments.
The smartphone has a powerful camera.
The students are studying for their exams.
The movie was full of action and suspense.
He listens to music while coding.
The artist painted a beautiful landscape.
The internet speed is very slow today.
The bakery sells fresh bread every morning.
'''

# Expose the string through a file-like object so it can be read like a file.
text_io = StringIO(text)

In [2]:
import pandas as pd

# read_csv consumes the StringIO stream. Rewind it first so this cell can be
# re-run without restarting from the cell that created `text_io`; otherwise a
# second run would start reading at end-of-file.
text_io.seek(0)

# Each line is one sentence. None of the sentences contain a tab, so using
# '\t' as the separator keeps every line in a single 'message' column.
messages = pd.read_csv(text_io, sep='\t', names=['message'])
messages

Unnamed: 0,message
0,The cat sat on the mat.
1,The dog barked at the stranger.
2,The bird is singing in the tree.
3,The sun is shining brightly.
4,The cat and dog are playing together.
5,I love reading books on artificial intelligence.
6,The weather is cold and rainy today.
7,My laptop battery died while working.
8,The football team won the championship.
9,The chef is preparing a delicious meal.


In [3]:
import re
import nltk

# Fetch the NLTK data this notebook relies on:
#   - 'stopwords': the English stop-word list used to filter tokens.
#   - 'wordnet':   the lexical database WordNetLemmatizer looks words up in;
#                  without it, lemmatize() raises LookupError on a fresh
#                  environment.
# nltk.download skips packages that are already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance reused for every token during preprocessing.
# NOTE: lemmatize() looks words up in the NLTK 'wordnet' corpus, which must be
# installed (nltk.download('wordnet')) or the first call raises LookupError.
wordlemmatize = WordNetLemmatizer()

In [6]:
# Load the stop-word list once and keep it as a set: the list is the same for
# every token, and set membership tests are O(1) instead of a linear scan.
english_stopwords = set(stopwords.words('english'))

# Normalise each sentence into a space-joined string of lemmatized,
# lower-cased, non-stop-word tokens.
corpus = []
for message in messages['message']:
    # Replace every non-letter character with a space, then lower-case and
    # split on whitespace to get the tokens.
    review = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and reduce the remaining tokens to their lemma.
    review = [wordlemmatize.lemmatize(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))


In [7]:
# Display the cleaned corpus: one lower-cased, stop-word-free, lemmatized string per sentence.
corpus

['cat sat mat',
 'dog barked stranger',
 'bird singing tree',
 'sun shining brightly',
 'cat dog playing together',
 'love reading book artificial intelligence',
 'weather cold rainy today',
 'laptop battery died working',
 'football team championship',
 'chef preparing delicious meal',
 'enjoys hiking mountain',
 'train arrived station time',
 'scientist researching new medical treatment',
 'smartphone powerful camera',
 'student studying exam',
 'movie full action suspense',
 'listens music coding',
 'artist painted beautiful landscape',
 'internet speed slow today',
 'bakery sell fresh bread every morning']

# TF-IDF 
Term Frequency - Inverse Document Frequency 

It is a statistical measure used in NLP to evaluate how important a word is to a document within a corpus. It improves upon bag of words by reducing the impact of frequently occurring words (like 'the', 'is', 'and') and giving more importance to rare, meaningful words.

In [None]:
# Create the TF-IDF model (real-valued term weights rather than raw bag-of-words counts)
from sklearn.feature_extraction.text import TfidfVectorizer
# max_features=100 caps the vocabulary at the 100 most frequent terms.
# Passing binary=True would only clip term counts to 0/1 before the IDF
# weighting is applied; the resulting matrix would still not be 0/1.
tfidf = TfidfVectorizer(max_features=100) 
x = tfidf.fit_transform(corpus).toarray()
x # dense matrix (one row per sentence); non-zero entries are fractional TF-IDF weights


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.528, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.601, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0.601, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.528, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.601, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0, 0, 0, 0, 0.577, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.467, 0, 0, 0, 0, 0, 0, 0.467, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0.531, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
import numpy as np


def _three_sig_digits(value):
    """Render a float with at most three significant digits."""
    return "%.3g" % value


# Make wide arrays readable: show up to 30 items at each edge before eliding,
# avoid wrapping rows, and print floats compactly.
np.set_printoptions(
    formatter={'float': _three_sig_digits},
    linewidth=10000,
    edgeitems=30,
)

# With N-Grams

In [10]:
# Bigram-only TF-IDF: ngram_range=(2, 2) builds the vocabulary from pairs of
# adjacent tokens, still capped at 100 features.
tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
bigram_weights = tfidf.fit_transform(corpus)
x = bigram_weights.toarray()

In [11]:
# Show the learned vocabulary: a mapping from each bigram to its feature (column) index.
tfidf.vocabulary_

{'cat sat': np.int64(12),
 'sat mat': np.int64(40),
 'dog barked': np.int64(17),
 'barked stranger': np.int64(5),
 'bird singing': np.int64(8),
 'singing tree': np.int64(44),
 'sun shining': np.int64(51),
 'shining brightly': np.int64(43),
 'cat dog': np.int64(11),
 'dog playing': np.int64(18),
 'playing together': np.int64(34),
 'love reading': np.int64(28),
 'reading book': np.int64(38),
 'book artificial': np.int64(9),
 'artificial intelligence': np.int64(2),
 'weather cold': np.int64(54),
 'cold rainy': np.int64(14),
 'rainy today': np.int64(37),
 'laptop battery': np.int64(26),
 'battery died': np.int64(6),
 'died working': np.int64(16),
 'football team': np.int64(21),
 'team championship': np.int64(52),
 'chef preparing': np.int64(13),
 'preparing delicious': np.int64(36),
 'delicious meal': np.int64(15),
 'enjoys hiking': np.int64(19),
 'hiking mountain': np.int64(24),
 'train arrived': np.int64(53),
 'arrived station': np.int64(1),
 'station time': np.int64(48),
 'scientist res