Imported libraries and data

In [17]:
import pandas as pd

importData = pd.read_csv('Reviews.csv')
data = importData[['ProductId', 'Text']]
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ProductId  568454 non-null  object
 1   Text       568454 non-null  object
dtypes: object(2)
memory usage: 8.7+ MB
None
    ProductId                                               Text
0  B001E4KFG0  I have bought several of the Vitality canned d...
1  B00813GRG4  Product arrived labeled as Jumbo Salted Peanut...
2  B000LQOCH0  This is a confection that has been around a fe...
3  B000UA0QIQ  If you are looking for the secret ingredient i...
4  B006K2ZZ7K  Great taffy at a great price.  There was a wid...
5  B006K2ZZ7K  I got a wild hair for taffy and ordered this f...
6  B006K2ZZ7K  This saltwater taffy had great flavors and was...
7  B006K2ZZ7K  This taffy is so good.  It is very soft and ch...
8  B000E7L2R4  Right now I'm mostly just sprouting this so my...
9  B00171APVA  This is a very healt

Performed text preprocessing for each text review

In [18]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

cleanedData = {}

#Word tokenization
for index, row in data.iterrows() :
    tokens = word_tokenize(row['Text'])
    id = row['ProductId']
    if id in cleanedData.keys() :
        cleanedData[id].append(tokens)
    else :
        cleanedData[id] = [tokens]

for product, tokens in cleanedData.items():

    # Lowercasing
    lowercased_tokens = [word.lower() for sublist in tokens for word in sublist]

    # Removing stopwords
    stops = set(stopwords.words('english'))
    without_stopwords = [word for word in lowercased_tokens if word not in stops]

    # Removing punctuation
    without_punctuation = [re.sub(r'\W+', '', word) for word in without_stopwords]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in without_punctuation]

    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='v') for word in stemmed_tokens]

    # Assign the final list to cleanedData[product]
    cleanedData[product] = lemmatized_tokens

print("Words tokenized by product ID: ")
for key, value in list(cleanedData.items())[:5] :
    print(f"{key}: {value}")

Words tokenized by product ID: 
B001E4KFG0: ['buy', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'find', 'good', 'qualiti', '', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', '', 'labrador', 'finicki', 'appreci', 'product', 'better', '']
B00813GRG4: ['product', 'arriv', 'label', 'jumbo', 'salt', 'peanut', '', 'peanut', 'actual', 'small', 'size', 'unsalt', '', 'sure', 'error', 'vendor', 'intend', 'repres', 'product', '', 'jumbo', '', '']
B000LQOCH0: ['confect', 'around', 'centuri', '', 'light', '', 'pillowi', 'citru', 'gelatin', 'nut', '', 'case', 'filbert', '', 'cut', 'tini', 'squar', 'liber', 'coat', 'powder', 'sugar', '', 'tini', 'mouth', 'heaven', '', 'chewi', '', 'flavor', '', 'highli', 'recommend', 'yummi', 'treat', '', 'familiar', 'stori', 'cs', '', 'lewi', '', '', 'lion', '', 'witch', '', 'wardrob', '', '', 'treat', 'seduc', 'edmund', 'sell', 'brother', 'sister', 'witch', '']
B000UA0QIQ: ['look', 'secret', 'ingredi', 'robitussin', 'believ', 'find'

Implemented NLP model: Bag-of-Words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
all_processed_text = [' '.join(tokens) for tokens in cleanedData.values()]
vectorizer.fit(all_processed_text)
feature_names = vectorizer.get_feature_names_out()
vectorizer.fit_transform(all_processed_text)
print(feature_names)

['00' '000' '0000' ... 'être' 'île' 'ît']


In [25]:
from nltk import ngrams

# Bigrams
bigrams = ngrams(cleanedData, 2)
bigrams = [' '.join(gram) for gram in bigrams]
print("Bigrams: ", bigrams)

# Trigrams
trigrams = ngrams(cleanedData, 3)
trigrams = [' '.join(gram) for gram in trigrams]
print("Trigrams: ", trigrams)

Bigrams:  ['B001E4KFG0 B00813GRG4', 'B00813GRG4 B000LQOCH0', 'B000LQOCH0 B000UA0QIQ', 'B000UA0QIQ B006K2ZZ7K', 'B006K2ZZ7K B000E7L2R4', 'B000E7L2R4 B00171APVA', 'B00171APVA B0001PB9FE', 'B0001PB9FE B0009XLVG0', 'B0009XLVG0 B001GVISJM', 'B001GVISJM B00144C10S', 'B00144C10S B0001PB9FY', 'B0001PB9FY B003F6UO7K', 'B003F6UO7K B001EO5QW8', 'B001EO5QW8 B000G6RPMY', 'B000G6RPMY B002GWHC0G', 'B002GWHC0G B004N5KULM', 'B004N5KULM B001EO5TPM', 'B001EO5TPM B005DUM9UQ', 'B005DUM9UQ B000E7VI7S', 'B000E7VI7S B001GVISJC', 'B001GVISJC B006SQBRMA', 'B006SQBRMA B0059WXJKM', 'B0059WXJKM B001EPPI84', 'B001EPPI84 B004X2KR36', 'B004X2KR36 B005R8JE8O', 'B005R8JE8O B0066DMI6Y', 'B0066DMI6Y B003ZFRKGO', 'B003ZFRKGO B0019CW0HE', 'B0019CW0HE B004K2IHUO', 'B004K2IHUO B001REEG6C', 'B001REEG6C B000J0HIT2', 'B000J0HIT2 B0037LW78C', 'B0037LW78C B0026Y3YBK', 'B0026Y3YBK B003SE19UK', 'B003SE19UK B003OB0IB8', 'B003OB0IB8 B002SRYRE8', 'B002SRYRE8 B001GVISJW', 'B001GVISJW B0017I8UME', 'B0017I8UME B0064KU9HO', 'B0064KU9HO B0