In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [2]:
# Load the review dataset from reviews.csv (a relative path, so it is resolved
# against the current working directory) and display the whole frame.
data = pd.read_csv("reviews.csv")
data

Unnamed: 0,Review,Rating
0,I do not like this product,1
1,I like this product very much,5
2,This product is just awesome,4
3,I have recommended this product to my friend,4
4,horrible,2
5,waste of money,2
6,worth buying,5
7,perfect fit for my requirements,4
8,packaging is not good,3
9,it is really expensive,3


In [3]:
# 1-grams (unigrams) / bag of words

In [4]:
# Fit a CountVectorizer with default settings on the Review column and build
# the document-term count matrix in the same call.
# NOTE(review): the fitted vocabulary is all lower-case and has no 'i' entry even
# though several reviews contain "I" - presumably the default lowercasing and
# token pattern (tokens of 2+ characters); confirm against the scikit-learn docs.
cv_1_gram = CountVectorizer()
transformed_data_1_gram = cv_1_gram.fit_transform(data['Review'])

In [5]:
# Shape is (n_documents, n_features): 10 reviews x 30 distinct unigram tokens,
# i.e. a plain bag-of-words representation with 30 dimensions.
transformed_data_1_gram.shape

(10, 30)

In [6]:
# vocabulary_ maps each token to its column index in the count matrix.
# The indices follow alphabetical token order ('awesome' -> 0, ..., 'worth' -> 29).
cv_1_gram.vocabulary_

{'do': 2,
 'not': 17,
 'like': 13,
 'this': 25,
 'product': 21,
 'very': 27,
 'much': 15,
 'is': 10,
 'just': 12,
 'awesome': 0,
 'have': 8,
 'recommended': 23,
 'to': 26,
 'my': 16,
 'friend': 6,
 'horrible': 9,
 'waste': 28,
 'of': 18,
 'money': 14,
 'worth': 29,
 'buying': 1,
 'perfect': 20,
 'fit': 4,
 'for': 5,
 'requirements': 24,
 'packaging': 19,
 'good': 7,
 'it': 11,
 'really': 22,
 'expensive': 3}

In [7]:
# Unigrams + bi-grams (ngram_range=(1, 2))

In [8]:
# ngram_range=(1, 2) keeps the single tokens and additionally counts every pair
# of adjacent tokens (bi-grams) within a review.
cv_bi_gram = CountVectorizer(ngram_range=(1,2))
transformed_data_bi_gram = cv_bi_gram.fit_transform(data['Review'])

In [9]:
# 57 features = the 30 unigrams from the plain bag-of-words model + 27 bi-grams.
transformed_data_bi_gram.shape

(10, 57)

In [10]:
cv_bi_gram.vocabulary_

{'do': 2,
 'not': 29,
 'like': 22,
 'this': 47,
 'product': 38,
 'do not': 3,
 'not like': 31,
 'like this': 23,
 'this product': 48,
 'very': 51,
 'much': 25,
 'product very': 41,
 'very much': 52,
 'is': 14,
 'just': 20,
 'awesome': 0,
 'product is': 39,
 'is just': 15,
 'just awesome': 21,
 'have': 11,
 'recommended': 44,
 'to': 49,
 'my': 26,
 'friend': 9,
 'have recommended': 12,
 'recommended this': 45,
 'product to': 40,
 'to my': 50,
 'my friend': 27,
 'horrible': 13,
 'waste': 53,
 'of': 32,
 'money': 24,
 'waste of': 54,
 'of money': 33,
 'worth': 55,
 'buying': 1,
 'worth buying': 56,
 'perfect': 36,
 'fit': 5,
 'for': 7,
 'requirements': 46,
 'perfect fit': 37,
 'fit for': 6,
 'for my': 8,
 'my requirements': 28,
 'packaging': 34,
 'good': 10,
 'packaging is': 35,
 'is not': 16,
 'not good': 30,
 'it': 18,
 'really': 42,
 'expensive': 4,
 'it is': 19,
 'is really': 17,
 'really expensive': 43}

In [11]:
# Unigrams + bi-grams + tri-grams (ngram_range=(1, 3))

In [12]:
# ngram_range=(1, 3) counts single tokens, adjacent pairs and adjacent triples,
# so the vocabulary holds 1-grams, bi-grams and tri-grams together.
cv_tri_gram = CountVectorizer(ngram_range=(1,3))
transformed_data_tri_gram = cv_tri_gram.fit_transform(data['Review'])

In [13]:
# 78 features = the 57 uni/bi-gram features + 21 tri-grams.
transformed_data_tri_gram.shape

(10, 78)

In [14]:
cv_tri_gram.vocabulary_

{'do': 2,
 'not': 38,
 'like': 30,
 'this': 63,
 'product': 50,
 'do not': 3,
 'not like': 40,
 'like this': 31,
 'this product': 64,
 'do not like': 4,
 'not like this': 41,
 'like this product': 32,
 'very': 71,
 'much': 34,
 'product very': 55,
 'very much': 72,
 'this product very': 67,
 'product very much': 56,
 'is': 18,
 'just': 28,
 'awesome': 0,
 'product is': 51,
 'is just': 19,
 'just awesome': 29,
 'this product is': 65,
 'product is just': 52,
 'is just awesome': 20,
 'have': 14,
 'recommended': 59,
 'to': 68,
 'my': 35,
 'friend': 12,
 'have recommended': 15,
 'recommended this': 60,
 'product to': 53,
 'to my': 69,
 'my friend': 36,
 'have recommended this': 16,
 'recommended this product': 61,
 'this product to': 66,
 'product to my': 54,
 'to my friend': 70,
 'horrible': 17,
 'waste': 73,
 'of': 42,
 'money': 33,
 'waste of': 74,
 'of money': 43,
 'waste of money': 75,
 'worth': 76,
 'buying': 1,
 'worth buying': 77,
 'perfect': 47,
 'fit': 6,
 'for': 9,
 'requirements

## Tf-Idf

In [15]:
# Same n-gram range as cv_bi_gram, but each cell holds a TF-IDF weight instead
# of a raw count; the learned vocabulary (57 uni/bi-grams) is identical.
# NOTE(review): each row appears to be L2-normalised (squared weights of the
# first review sum to ~1) - presumably TfidfVectorizer's default norm; confirm.
tv_bi_gram = TfidfVectorizer(ngram_range=(1,2))
tv_transformed_data_bi_gram = tv_bi_gram.fit_transform(data['Review'])

In [16]:
tv_transformed_data_bi_gram.shape

(10, 57)

In [17]:
tv_bi_gram.vocabulary_

{'do': 2,
 'not': 29,
 'like': 22,
 'this': 47,
 'product': 38,
 'do not': 3,
 'not like': 31,
 'like this': 23,
 'this product': 48,
 'very': 51,
 'much': 25,
 'product very': 41,
 'very much': 52,
 'is': 14,
 'just': 20,
 'awesome': 0,
 'product is': 39,
 'is just': 15,
 'just awesome': 21,
 'have': 11,
 'recommended': 44,
 'to': 49,
 'my': 26,
 'friend': 9,
 'have recommended': 12,
 'recommended this': 45,
 'product to': 40,
 'to my': 50,
 'my friend': 27,
 'horrible': 13,
 'waste': 53,
 'of': 32,
 'money': 24,
 'waste of': 54,
 'of money': 33,
 'worth': 55,
 'buying': 1,
 'worth buying': 56,
 'perfect': 36,
 'fit': 5,
 'for': 7,
 'requirements': 46,
 'perfect fit': 37,
 'fit for': 6,
 'for my': 8,
 'my requirements': 28,
 'packaging': 34,
 'good': 10,
 'packaging is': 35,
 'is not': 16,
 'not good': 30,
 'it': 18,
 'really': 42,
 'expensive': 4,
 'it is': 19,
 'is really': 17,
 'really expensive': 43}

In [18]:
tv_transformed_data_bi_gram.shape

(10, 57)

In [19]:
transformed_data_bi_gram.shape

(10, 57)

In [20]:
tv_transformed_data_bi_gram.toarray().shape

(10, 57)

In [21]:
# Convert the TF-IDF matrix to a dense NumPy array and round to 2 decimals so
# the weights are readable; this prints all 10 x 57 values.
np.round(tv_transformed_data_bi_gram.toarray(), 2)

array([[0.  , 0.  , 0.39, 0.39, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.33, 0.33, 0.  , 0.  , 0.  , 0.  , 0.  , 0.33, 0.  , 0.39, 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.26, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.26, 0.26, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.33, 0.33, 0.  , 0.38, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.25, 0.  , 0.  , 0.38, 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.25, 0.25, 0.  , 0.  , 0.38, 0.38, 0.  , 0.  ,
        0.  , 0.  ],
       [0.38, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.28, 0.38, 0.  , 0.  , 0.  , 0.  , 0.38, 0.38,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

In [22]:
data.head(1)

Unnamed: 0,Review,Rating
0,I do not like this product,1


In [23]:
# Raw uni/bi-gram counts for the first review ("I do not like this product"):
# every n-gram that occurs in it has count 1, everything else is 0.
transformed_data_bi_gram.toarray()[0]

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:
# TF-IDF weights for the same first review, rounded to 3 decimals. The non-zero
# positions match the count vector, but n-grams that also occur in other reviews
# (e.g. 'product', 'this product') get a lower weight than ones unique to it.
np.round(tv_transformed_data_bi_gram.toarray()[0], 3)

array([0.   , 0.   , 0.393, 0.393, 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.334, 0.334, 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.334, 0.   , 0.393, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.26 , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.26 , 0.26 , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   ])

In [25]:
# Feature names ordered by column index (position i names column i of the matrix).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns an ndarray, so
# .tolist() keeps the plain-list display.
tv_bi_gram.get_feature_names_out().tolist()

['awesome',
 'buying',
 'do',
 'do not',
 'expensive',
 'fit',
 'fit for',
 'for',
 'for my',
 'friend',
 'good',
 'have',
 'have recommended',
 'horrible',
 'is',
 'is just',
 'is not',
 'is really',
 'it',
 'it is',
 'just',
 'just awesome',
 'like',
 'like this',
 'money',
 'much',
 'my',
 'my friend',
 'my requirements',
 'not',
 'not good',
 'not like',
 'of',
 'of money',
 'packaging',
 'packaging is',
 'perfect',
 'perfect fit',
 'product',
 'product is',
 'product to',
 'product very',
 'really',
 'really expensive',
 'recommended',
 'recommended this',
 'requirements',
 'this',
 'this product',
 'to',
 'to my',
 'very',
 'very much',
 'waste',
 'waste of',
 'worth',
 'worth buying']

In [26]:
def top_tfidf_feats(row, features, top_n = 25):
    """Return the ``top_n`` highest-weighted features of one document vector.

    Parameters
    ----------
    row : 1-D array-like of numbers
        Dense weight vector for a single document, e.g. one row of
        ``matrix.toarray()``. TF-IDF weights and raw counts both work.
    features : sequence of str
        Feature names indexable by column position and aligned with ``row``
        (a list or a NumPy array).
    top_n : int, default 25
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``tfidf``, ordered by descending weight.
        The frame is empty (but still has both columns) when ``row`` is empty
        or ``top_n`` is 0.
    """
    # argsort is ascending, so reverse it and keep the first top_n positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (rather than assigning df.columns
    # afterwards) avoids a length-mismatch ValueError when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

# Ten highest TF-IDF-weighted uni/bi-grams of the first review.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement and returns an array that supports the same indexing.
top_tfidf_feats(tv_transformed_data_bi_gram.toarray()[0], tv_bi_gram.get_feature_names_out(), 10)

Unnamed: 0,feature,tfidf
0,do,0.392848
1,do not,0.392848
2,not like,0.392848
3,like this,0.333957
4,like,0.333957
5,not,0.333957
6,this product,0.259762
7,product,0.259762
8,this,0.259762
9,it,0.0


In [27]:
# Same helper applied to the raw count vector of the first review for comparison.
# The result column is still labelled 'tfidf', but the values here are counts.
# All present n-grams tie at 1, so their relative order carries no meaning.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
top_tfidf_feats(transformed_data_bi_gram.toarray()[0], cv_bi_gram.get_feature_names_out(), 10)

Unnamed: 0,feature,tfidf
0,this product,1
1,like this,1
2,do,1
3,do not,1
4,like,1
5,not,1
6,product,1
7,this,1
8,not like,1
9,it,0
