In [2]:
import pandas as pd
import nltk
import string

# Download required NLTK resources
NLTK_RESOURCES = (
    'punkt_tab',                       # tokenizer data (unzips under tokenizers/)
    'stopwords',                       # stopword lists read through nltk.corpus.stopwords
    'averaged_perceptron_tagger_eng',  # tagger data (unzips under taggers/)
    'maxent_ne_chunker_tab',           # chunker data (unzips under chunkers/)
    'words',                           # presumably required by the NE chunker -- TODO confirm
)

# nltk.download() signals failure through its boolean return value rather than
# by raising, so gather the failures and report them in this cell instead of
# letting a missing resource surface later in the tokenize/tag/chunk cells.
# Every resource is attempted even if an earlier one fails.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    print("WARNING: could not download NLTK resources:", failed_resources)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
# Load dataset CSV (adjust REVIEWS_CSV_PATH accordingly)
REVIEWS_CSV_PATH = 'Reviews.csv'
TEXT_COLUMN = 'Text'    # column containing the review text
MAX_REVIEWS = 10000     # upper bound on the number of reviews processed below

df = pd.read_csv(REVIEWS_CSV_PATH)

# Drop missing/null texts first and only then take the first MAX_REVIEWS rows,
# so the cap counts usable reviews rather than raw CSV rows.
reviews = df[TEXT_COLUMN].dropna().iloc[:MAX_REVIEWS]


In [4]:
# Translation table that deletes every character listed in string.punctuation.
# It is the same for every review, so it is built once instead of per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Lowercase ``text`` and delete its ASCII punctuation characters.

    Punctuation is removed outright, not replaced by a space, so the pieces
    on either side of a mark are joined: ``"don't"`` becomes ``"dont"`` and
    ``"well-known"`` becomes ``"wellknown"``.

    NOTE(review): the lowercased output is what later reaches ``nltk.pos_tag``
    and ``nltk.ne_chunk`` in this notebook. Named-entity chunkers commonly lean
    on capitalization, so running NER on case-folded text may miss entities --
    confirm whether NER should run on the original-cased text instead.

    Args:
        text (str): A single review text. Must be a string; null entries are
            expected to have been dropped beforehand.

    Returns:
        str: The lowercased text with punctuation characters removed.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

# Normalize every review element-wise (lowercase, punctuation removed)
reviews = reviews.map(preprocess_text)


In [5]:
from nltk.corpus import stopwords
# Hold the English stopwords in a set: the token filter below tests membership
# once per token, and a set makes each of those checks constant-time.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def tokenize_and_clean(text):
    """Split ``text`` into word tokens and keep only the informative ones.

    A token is kept when it is purely alphabetic and is not a member of the
    module-level ``stop_words`` set; numbers and mixed tokens are dropped.

    Args:
        text (str): Preprocessed review text.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        # Drop anything containing digits or other non-letter characters
        if not token.isalpha():
            continue
        # Drop stopwords
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# One list of cleaned tokens per review
tokenized_reviews = reviews.map(tokenize_and_clean)


In [6]:
# Tag all reviews in a single batch. nltk.pos_tag_sents prepares the tagger once
# for the whole batch, whereas calling nltk.pos_tag once per review prepares it
# again on every call. Each review is still tagged independently, and the
# result keeps the index and name of tokenized_reviews.
pos_tagged_reviews = pd.Series(
    nltk.pos_tag_sents(tokenized_reviews.tolist()),
    index=tokenized_reviews.index,
    name=tokenized_reviews.name,
    dtype=object,  # each element is a list of (token, tag) tuples
)

In [9]:
# Preview the cleaned token lists; printing a long Series shows only its first and last rows.
print(tokenized_reviews)

0       [bought, several, vitality, canned, dog, food,...
1       [product, arrived, labeled, jumbo, salted, pea...
2       [confection, around, centuries, light, pillowy...
3       [looking, secret, ingredient, robitussin, beli...
4       [great, taffy, great, price, wide, assortment,...
                              ...                        
9995    [switched, advance, similac, organic, product,...
9996    [like, bad, reviews, say, organic, formula, co...
9997    [wanted, solely, breastfeed, unable, keep, sup...
9998    [love, fact, get, delieved, house, delievy, ch...
9999    [week, old, gas, constipation, problems, first...
Name: Text, Length: 10000, dtype: object


In [8]:
# Preview the (token, POS tag) pairs per review; printing a long Series shows only its first and last rows.
print(pos_tagged_reviews)

0       [(bought, VBD), (several, JJ), (vitality, NN),...
1       [(product, NN), (arrived, VBD), (labeled, JJ),...
2       [(confection, NN), (around, IN), (centuries, N...
3       [(looking, VBG), (secret, JJ), (ingredient, NN...
4       [(great, JJ), (taffy, JJ), (great, JJ), (price...
                              ...                        
9995    [(switched, VBN), (advance, NN), (similac, NN)...
9996    [(like, IN), (bad, JJ), (reviews, NNS), (say, ...
9997    [(wanted, VBN), (solely, RB), (breastfeed, VBN...
9998    [(love, VB), (fact, NN), (get, VB), (delieved,...
9999    [(week, NN), (old, JJ), (gas, NN), (constipati...
Name: Text, Length: 10000, dtype: object


In [7]:
def get_named_entities(pos_tags):
    """Extract the named entities from one POS-tagged review.

    Args:
        pos_tags: List of ``(token, tag)`` tuples for a single review, as
            produced by the POS-tagging step.

    Returns:
        list[tuple[str, str]]: One ``(entity_name, entity_type)`` pair per
        entity chunk, in order of appearance. ``entity_name`` is the chunk's
        tokens joined by single spaces and ``entity_type`` is the chunk's
        label. Empty when no entity is found or when ``pos_tags`` is empty.
    """
    # A review can lose every token to the alphabetic/stopword filters; there
    # is nothing to chunk in that case, so skip the chunker entirely.
    if not pos_tags:
        return []

    return [
        (' '.join(leaf[0] for leaf in chunk), chunk.label())
        for chunk in nltk.ne_chunk(pos_tags)
        # Entity chunks expose .label(); tokens outside any entity are plain
        # (token, tag) tuples and do not.
        if hasattr(chunk, 'label')
    ]

# One list of (entity_name, entity_type) pairs per review
ner_reviews = pos_tagged_reviews.map(get_named_entities)
