Imported libraries and data

In [17]:
import pandas as pd

importData = pd.read_csv('Reviews.csv')
data = importData[['ProductId', 'Text']]
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ProductId  568454 non-null  object
 1   Text       568454 non-null  object
dtypes: object(2)
memory usage: 8.7+ MB
None
    ProductId                                               Text
0  B001E4KFG0  I have bought several of the Vitality canned d...
1  B00813GRG4  Product arrived labeled as Jumbo Salted Peanut...
2  B000LQOCH0  This is a confection that has been around a fe...
3  B000UA0QIQ  If you are looking for the secret ingredient i...
4  B006K2ZZ7K  Great taffy at a great price.  There was a wid...
5  B006K2ZZ7K  I got a wild hair for taffy and ordered this f...
6  B006K2ZZ7K  This saltwater taffy had great flavors and was...
7  B006K2ZZ7K  This taffy is so good.  It is very soft and ch...
8  B000E7L2R4  Right now I'm mostly just sprouting this so my...
9  B00171APVA  This is a very healt

Performed text preprocessing for each text review

In [18]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

cleanedData = {}

#Word tokenization
for index, row in data.iterrows() :
    tokens = word_tokenize(row['Text'])
    id = row['ProductId']
    if id in cleanedData.keys() :
        cleanedData[id].append(tokens)
    else :
        cleanedData[id] = [tokens]

for product, tokens in cleanedData.items():

    # Lowercasing
    lowercased_tokens = [word.lower() for sublist in tokens for word in sublist]

    # Removing stopwords
    stops = set(stopwords.words('english'))
    without_stopwords = [word for word in lowercased_tokens if word not in stops]

    # Removing punctuation
    without_punctuation = [re.sub(r'\W+', '', word) for word in without_stopwords]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in without_punctuation]

    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='v') for word in stemmed_tokens]

    # Assign the final list to cleanedData[product]
    cleanedData[product] = lemmatized_tokens

print("Words tokenized by product ID: ")
for key, value in list(cleanedData.items())[:5] :
    print(f"{key}: {value}")

Words tokenized by product ID: 
B001E4KFG0: ['buy', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'find', 'good', 'qualiti', '', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', '', 'labrador', 'finicki', 'appreci', 'product', 'better', '']
B00813GRG4: ['product', 'arriv', 'label', 'jumbo', 'salt', 'peanut', '', 'peanut', 'actual', 'small', 'size', 'unsalt', '', 'sure', 'error', 'vendor', 'intend', 'repres', 'product', '', 'jumbo', '', '']
B000LQOCH0: ['confect', 'around', 'centuri', '', 'light', '', 'pillowi', 'citru', 'gelatin', 'nut', '', 'case', 'filbert', '', 'cut', 'tini', 'squar', 'liber', 'coat', 'powder', 'sugar', '', 'tini', 'mouth', 'heaven', '', 'chewi', '', 'flavor', '', 'highli', 'recommend', 'yummi', 'treat', '', 'familiar', 'stori', 'cs', '', 'lewi', '', '', 'lion', '', 'witch', '', 'wardrob', '', '', 'treat', 'seduc', 'edmund', 'sell', 'brother', 'sister', 'witch', '']
B000UA0QIQ: ['look', 'secret', 'ingredi', 'robitussin', 'believ', 'find'

Implemented NLP model: Bag-of-Words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(data['Text'])
feature_names = vectorizer.get_feature_names_out()
for x in range(0, len(data['Text']), 1000) :
    X = vectorizer.transform(data['Text'][x:x+1000])
    print(X)
    print(feature_names)


  (0, 7659)	1
  (0, 8702)	3
  (0, 9696)	1
  (0, 20503)	1
  (0, 21549)	2
  (0, 23609)	1
  (0, 26441)	1
  (0, 40480)	1
  (0, 47808)	1
  (0, 48990)	1
  (0, 49440)	1
  (0, 52974)	1
  (0, 55826)	2
  (0, 61806)	1
  (0, 61953)	1
  (0, 65312)	1
  (0, 66909)	1
  (0, 67849)	1
  (0, 70919)	1
  (0, 73778)	1
  (0, 73956)	1
  (0, 74890)	1
  (0, 78138)	2
  (0, 86746)	1
  (0, 86838)	2
  :	:
  (999, 94591)	1
  (999, 95142)	2
  (999, 95153)	1
  (999, 96472)	1
  (999, 96501)	1
  (999, 97955)	1
  (999, 98936)	1
  (999, 98940)	1
  (999, 100814)	1
  (999, 101491)	1
  (999, 102194)	1
  (999, 104313)	1
  (999, 104412)	1
  (999, 106749)	1
  (999, 107849)	4
  (999, 109062)	1
  (999, 109486)	1
  (999, 114703)	1
  (999, 115010)	1
  (999, 116083)	1
  (999, 116192)	1
  (999, 116979)	1
  (999, 117659)	1
  (999, 118416)	1
  (999, 118776)	1
['00' '000' '0000' ... 'être' 'île' 'ît']
  (0, 19327)	2
  (0, 34640)	1
  (0, 52974)	2
  (0, 53425)	1
  (0, 59633)	1
  (0, 61806)	1
  (0, 62065)	1
  (0, 66777)	1
  (0, 69164)	1
  (