Imported libraries and data

In [3]:
import pandas as pd

importData = pd.read_csv('Reviews.csv')
data = importData[['ProductId', 'Text']]
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ProductId  568454 non-null  object
 1   Text       568454 non-null  object
dtypes: object(2)
memory usage: 8.7+ MB
None
    ProductId                                               Text
0  B001E4KFG0  I have bought several of the Vitality canned d...
1  B00813GRG4  Product arrived labeled as Jumbo Salted Peanut...
2  B000LQOCH0  This is a confection that has been around a fe...
3  B000UA0QIQ  If you are looking for the secret ingredient i...
4  B006K2ZZ7K  Great taffy at a great price.  There was a wid...
5  B006K2ZZ7K  I got a wild hair for taffy and ordered this f...
6  B006K2ZZ7K  This saltwater taffy had great flavors and was...
7  B006K2ZZ7K  This taffy is so good.  It is very soft and ch...
8  B000E7L2R4  Right now I'm mostly just sprouting this so my...
9  B00171APVA  This is a very healt

Performed text preprocessing for each text review

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

cleanedData = {}

#Word tokenization
for index, row in data.iterrows() :
    tokens = word_tokenize(row['Text'])
    id = row['ProductId']
    if id in cleanedData.keys() :
        cleanedData[id].append(tokens)
    else :
        cleanedData[id] = [tokens]

for product, tokens in cleanedData.items():

    # Lowercasing
    lowercased_tokens = [word.lower() for sublist in tokens for word in sublist]

    # Removing stopwords
    stops = set(stopwords.words('english'))
    without_stopwords = [word for word in lowercased_tokens if word not in stops]

    # Removing punctuation
    without_punctuation = [re.sub(r'\W+', '', word) for word in without_stopwords]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in without_punctuation]

    # Lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, pos='v') for word in stemmed_tokens]

    # Assign the final list to cleanedData[product]
    cleanedData[product] = lemmatized_tokens

print("Words tokenized by product ID: ")
for key, value in list(cleanedData.items())[:5] :
    print(f"{key}: {value}")

Words tokenized by product ID: 
B001E4KFG0: ['buy', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'find', 'good', 'qualiti', '', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', '', 'labrador', 'finicki', 'appreci', 'product', 'better', '']
B00813GRG4: ['product', 'arriv', 'label', 'jumbo', 'salt', 'peanut', '', 'peanut', 'actual', 'small', 'size', 'unsalt', '', 'sure', 'error', 'vendor', 'intend', 'repres', 'product', '', 'jumbo', '', '']
B000LQOCH0: ['confect', 'around', 'centuri', '', 'light', '', 'pillowi', 'citru', 'gelatin', 'nut', '', 'case', 'filbert', '', 'cut', 'tini', 'squar', 'liber', 'coat', 'powder', 'sugar', '', 'tini', 'mouth', 'heaven', '', 'chewi', '', 'flavor', '', 'highli', 'recommend', 'yummi', 'treat', '', 'familiar', 'stori', 'cs', '', 'lewi', '', '', 'lion', '', 'witch', '', 'wardrob', '', '', 'treat', 'seduc', 'edmund', 'sell', 'brother', 'sister', 'witch', '']
B000UA0QIQ: ['look', 'secret', 'ingredi', 'robitussin', 'believ', 'find'

Implemented NLP model: Bag-of-Words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
all_processed_text = [' '.join(tokens) for tokens in cleanedData.values()]
vectorizer.fit(all_processed_text)
feature_names = vectorizer.get_feature_names_out()
vectorizer.fit_transform(all_processed_text)
print(feature_names)

['00' '000' '0000' ... 'être' 'île' 'ît']


Implemented NLP model: N-grams

In [14]:
from nltk import ngrams

# Bigrams
bigrams = {product_id: list(ngrams(words, 2)) for product_id, words in cleanedData.items()}
print("Bigrams - First 5 products: ")
for id, bigram in list(bigrams.items())[:5] :
    print(bigram[:10])

# Trigrams
trigrams = {product_id: list(ngrams(words, 3)) for product_id, words in cleanedData.items()}
print("Trigrams - First 5 products: ")
for id, trigram in list(trigrams.items())[:5] :
    print(trigram[:10])

Bigrams - First 5 products: 
[('buy', 'sever'), ('sever', 'vital'), ('vital', 'can'), ('can', 'dog'), ('dog', 'food'), ('food', 'product'), ('product', 'find'), ('find', 'good'), ('good', 'qualiti'), ('qualiti', '')]
[('product', 'arriv'), ('arriv', 'label'), ('label', 'jumbo'), ('jumbo', 'salt'), ('salt', 'peanut'), ('peanut', ''), ('', 'peanut'), ('peanut', 'actual'), ('actual', 'small'), ('small', 'size')]
[('confect', 'around'), ('around', 'centuri'), ('centuri', ''), ('', 'light'), ('light', ''), ('', 'pillowi'), ('pillowi', 'citru'), ('citru', 'gelatin'), ('gelatin', 'nut'), ('nut', '')]
[('look', 'secret'), ('secret', 'ingredi'), ('ingredi', 'robitussin'), ('robitussin', 'believ'), ('believ', 'find'), ('find', ''), ('', 'get'), ('get', 'addit'), ('addit', 'root'), ('root', 'beer')]
[('great', 'taffi'), ('taffi', 'great'), ('great', 'price'), ('price', ''), ('', 'wide'), ('wide', 'assort'), ('assort', 'yummi'), ('yummi', 'taffi'), ('taffi', ''), ('', 'deliveri')]
Trigrams - First

Implemented NLP model: Tf-idf

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

all_text = [' '.join(sentences) for sentences in cleanedData.values()]
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
matrix = tfidf_vectorizer.fit_transform(all_text)
feature_names2 = tfidf_vectorizer.get_feature_names_out()
tfidf_values = matrix.toarray()
print("Feature Names: " + feature_names2)
print("TF-IDF values - First 10: ")
for i, (product, values) in enumerate(zip(cleanedData.keys(), tfidf_values)):
    print(f"\nProduct {product} - First 10 TF-IDF Values:")
    for j, value in enumerate(values[:10]):
        print(f"    {feature_names[j]}: {value:.4f}")
    if i >= 1: 
        break

['Feature Names: 05' 'Feature Names: 0g' 'Feature Names: 10' ...
 'Feature Names: zone' 'Feature Names: zucchini' 'Feature Names: zuke']
TF-IDF values - First 10: 

Product B001E4KFG0 - First 10 TF-IDF Values:
    00: 0.0000
    000: 0.0000
    0000: 0.0000
    000001: 0.0000
    00001: 0.0000
    000013: 0.0000
    00004: 0.0000
    00006mg: 0.0000
    0000soo: 0.0000
    0001: 0.0000

Product B00813GRG4 - First 10 TF-IDF Values:
    00: 0.0000
    000: 0.0000
    0000: 0.0000
    000001: 0.0000
    00001: 0.0000
    000013: 0.0000
    00004: 0.0000
    00006mg: 0.0000
    0000soo: 0.0000
    0001: 0.0000
