In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download required NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# Load the dataset
df = pd.read_csv('SmallReviews.csv')
# df_small = df.head(1000)
# df_small.to_csv('SmallReviews.csv', index=False)
# Extract the 'Text' column and handle missing values
reviews = df['Text'].dropna().tolist()
reviews

['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
 'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
 'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
 'If you are looking f

In [3]:

# 1. Tokenization
def tokenize_text(text):
    return word_tokenize(str(text).lower())

tokenized_reviews = [tokenize_text(review) for review in reviews]
print(tokenized_reviews)

[['i', 'have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', '.', 'the', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', '.', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', 'most', '.'], ['product', 'arrived', 'labeled', 'as', 'jumbo', 'salted', 'peanuts', '...', 'the', 'peanuts', 'were', 'actually', 'small', 'sized', 'unsalted', '.', 'not', 'sure', 'if', 'this', 'was', 'an', 'error', 'or', 'if', 'the', 'vendor', 'intended', 'to', 'represent', 'the', 'product', 'as', '``', 'jumbo', "''", '.'], ['this', 'is', 'a', 'confection', 'that', 'has', 'been', 'around', 'a', 'few', 'centuries', '.', 'it', 'is', 'a', 'light', ',', 'pillowy', 'citrus', 'gelatin', 'with', 'nuts', '-', 'in', 'this', 'case', 'filberts', '.', 'and', 'it', 'is', 'cut', 'into', 'tiny', 'squ

In [4]:

# 2. Remove Punctuation
def remove_punctuation(tokens):
    return [token for token in tokens if token not in string.punctuation]

cleaned_reviews = [remove_punctuation(tokens) for tokens in tokenized_reviews]
print(cleaned_reviews)

[['i', 'have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', 'most'], ['product', 'arrived', 'labeled', 'as', 'jumbo', 'salted', 'peanuts', '...', 'the', 'peanuts', 'were', 'actually', 'small', 'sized', 'unsalted', 'not', 'sure', 'if', 'this', 'was', 'an', 'error', 'or', 'if', 'the', 'vendor', 'intended', 'to', 'represent', 'the', 'product', 'as', '``', 'jumbo', "''"], ['this', 'is', 'a', 'confection', 'that', 'has', 'been', 'around', 'a', 'few', 'centuries', 'it', 'is', 'a', 'light', 'pillowy', 'citrus', 'gelatin', 'with', 'nuts', 'in', 'this', 'case', 'filberts', 'and', 'it', 'is', 'cut', 'into', 'tiny', 'squares', 'and', 'then', 'liberally', 'coated', 

In [5]:

# 3. Remove Stop Words
stop_words = set(stopwords.words('english'))
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

filtered_reviews = [remove_stopwords(tokens) for tokens in cleaned_reviews]
print(filtered_reviews)

[['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better'], ['product', 'arrived', 'labeled', 'jumbo', 'salted', 'peanuts', '...', 'peanuts', 'actually', 'small', 'sized', 'unsalted', 'sure', 'error', 'vendor', 'intended', 'represent', 'product', '``', 'jumbo', "''"], ['confection', 'around', 'centuries', 'light', 'pillowy', 'citrus', 'gelatin', 'nuts', 'case', 'filberts', 'cut', 'tiny', 'squares', 'liberally', 'coated', 'powdered', 'sugar', 'tiny', 'mouthful', 'heaven', 'chewy', 'flavorful', 'highly', 'recommend', 'yummy', 'treat', 'familiar', 'story', 'c.s', 'lewis', '``', 'lion', 'witch', 'wardrobe', "''", 'treat', 'seduces', 'edmund', 'selling', 'brother', 'sisters', 'witch'], ['looking', 'secret', 'ingredient', 'robitussin', 'believe', 'found', 'got', 'addition', 'root', 'beer', 'extract', 'ordered', 'good'

In [6]:

# 4. Stemming
stemmer = PorterStemmer()
stemmed_reviews = [[stemmer.stem(token) for token in tokens] for tokens in filtered_reviews]


# 5. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_reviews = [[lemmatizer.lemmatize(token) for token in tokens] for tokens in filtered_reviews]

# 6. POS Tagging
pos_tagged_reviews = [pos_tag(tokens) for tokens in filtered_reviews]

# 7. Named Entity Recognition (NER)
ner_reviews = [ne_chunk(pos_tag(tokens)) for tokens in filtered_reviews]

In [7]:

# 8. Chunking (Simple Noun Phrase Chunking)
def chunking(tagged_tokens):
    grammar = r"NP: {<DT>?<JJ>*<NN>}"  # Noun phrase chunking rule
    chunk_parser = nltk.RegexpParser(grammar)
    return chunk_parser.parse(tagged_tokens)

chunked_reviews = [chunking(pos_tag(tokens)) for tokens in filtered_reviews]
chunked_reviews

[Tree('S', [('bought', 'VBD'), Tree('NP', [('several', 'JJ'), ('vitality', 'NN')]), ('canned', 'VBD'), Tree('NP', [('dog', 'JJ'), ('food', 'NN')]), ('products', 'NNS'), ('found', 'VBD'), Tree('NP', [('good', 'JJ'), ('quality', 'NN')]), Tree('NP', [('product', 'NN')]), ('looks', 'VBZ'), ('like', 'IN'), Tree('NP', [('stew', 'NN')]), ('processed', 'VBN'), Tree('NP', [('meat', 'NN')]), ('smells', 'NNS'), ('better', 'RBR'), Tree('NP', [('labrador', 'NN')]), ('finicky', 'JJ'), ('appreciates', 'VBZ'), Tree('NP', [('product', 'NN')]), ('better', 'RBR')]),
 Tree('S', [Tree('NP', [('product', 'NN')]), ('arrived', 'VBD'), Tree('NP', [('labeled', 'JJ'), ('jumbo', 'NN')]), ('salted', 'VBD'), ('peanuts', 'NNS'), ('...', ':'), ('peanuts', 'VBZ'), ('actually', 'RB'), ('small', 'JJ'), ('sized', 'VBN'), Tree('NP', [('unsalted', 'JJ'), ('sure', 'JJ'), ('error', 'NN')]), Tree('NP', [('vendor', 'NN')]), ('intended', 'VBN'), Tree('NP', [('represent', 'JJ'), ('product', 'NN')]), ('``', '``'), ('jumbo', 'JJ')

In [8]:

# 9. Remove Rare and Most Common Words
all_words = [word for sublist in filtered_reviews for word in sublist]
word_freq = Counter(all_words)
# Threshold for rare (< 2 occurrences) and common (> 80% of reviews)
rare_threshold = 2
common_threshold = int(0.8 * len(reviews))
filtered_words = [word for word in all_words if rare_threshold <= word_freq[word] <= common_threshold]
final_reviews = [[word for word in tokens if word in filtered_words] for tokens in filtered_reviews]

# 10. Bag of Words
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([' '.join(tokens) for tokens in final_reviews])
bow_feature_names = vectorizer.get_feature_names_out()
print("Bag of Words Features (Top 10):", bow_feature_names[:10])

Bag of Words Features (Top 10): ['00' '10' '100' '11' '12' '125' '13' '14' '15' '150']


In [9]:

# 11. TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in final_reviews])
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print("TF-IDF Features (Top 10):", tfidf_feature_names[:10])

# Example: Test a simple model (e.g., inspect results)
print("Sample Processed Review (Lemmatized):", lemmatized_reviews[0])
print("Sample POS Tags:", pos_tagged_reviews[0][:5])
print("Sample NER:", ner_reviews[0])
print("Sample Chunked:", chunked_reviews[0])

TF-IDF Features (Top 10): ['00' '10' '100' '11' '12' '125' '13' '14' '15' '150']
Sample Processed Review (Lemmatized): ['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'product', 'found', 'good', 'quality', 'product', 'look', 'like', 'stew', 'processed', 'meat', 'smell', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']
Sample POS Tags: [('bought', 'VBD'), ('several', 'JJ'), ('vitality', 'NN'), ('canned', 'VBD'), ('dog', 'JJ')]
Sample NER: (S
  bought/VBD
  several/JJ
  vitality/NN
  canned/VBD
  dog/JJ
  food/NN
  products/NNS
  found/VBD
  good/JJ
  quality/NN
  product/NN
  looks/VBZ
  like/IN
  stew/NN
  processed/VBN
  meat/NN
  smells/NNS
  better/RBR
  labrador/NN
  finicky/JJ
  appreciates/VBZ
  product/NN
  better/RBR)
Sample Chunked: (S
  bought/VBD
  (NP several/JJ vitality/NN)
  canned/VBD
  (NP dog/JJ food/NN)
  products/NNS
  found/VBD
  (NP good/JJ quality/NN)
  (NP product/NN)
  looks/VBZ
  like/IN
  (NP stew/NN)
  processed/VBN
  (NP meat/N