In [1]:
import pandas as pd
import spacy
import nltk
import string
from nltk.corpus import stopwords

In [2]:
# Make sure the NLTK stop-word corpus is present locally, then keep the
# English list as a set so the membership tests done while filtering tokens
# are constant-time.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\archa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Install the small English pipeline from inside the running kernel instead of
# shelling out to whatever `python` happens to be first on PATH, so the model
# lands in the environment that `spacy.load` will actually read from.
# The guard makes the cell cheap to re-run: nothing is downloaded when the
# package is already installed.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 1.1 MB/s eta 0:00:11
     --- ------------------------------------ 1.0/12.8 MB 1.7 MB/s eta 0:00:08
     ---- ----------------------------------- 1.3/12.8 MB 1.9 MB/s eta 0:00:07
     ----- ---------------------------------- 1.8/12.8 MB 1.7 MB/s eta 0:00:07
     ------- -------------------------------- 2.4/12.8 MB 1.9 MB/s eta 0:00:06
     --------- ------------------------------ 2.9/12.8 MB 2.0 MB/s eta 0:00:06
     --------- ------------------------------ 3.1/12.8 MB 2.0 MB/s eta 0:00:05
     ----------- ---------------------------- 3.7/12.8 MB 2.0 MB/s eta 0:00:05
     ------------- -------------------------- 4

In [3]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Reviews.csv")

In [4]:
# Only the free-text review column is used by the steps below.
reviews = df.loc[:, 'Text']

In [5]:
# Drop rows with a missing review; the text helpers below call string
# methods on every value and would fail on NaN.
reviews = reviews[reviews.notna()]

In [6]:
# Work on the first 10,000 remaining reviews (positional, not label-based).
reviews = reviews.head(10000)

In [7]:
# Translation table mapping every ASCII punctuation character to a space.
# Built once here rather than on every call to preprocess_text.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_text(text):
    """Lower-case ``text`` and strip ASCII punctuation without gluing words.

    Punctuation is replaced by a space instead of being deleted, so words
    separated only by punctuation stay separate ("tasty...but" becomes
    "tasty but", not "tastybut"). Runs of whitespace, including the extra
    spaces this introduces, are then collapsed to a single space and
    leading/trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        str: The lower-cased text with punctuation removed and words
        separated by single spaces.
    """
    spaced = text.lower().translate(_PUNCT_TO_SPACE)
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so joining with one space normalises the spacing.
    return " ".join(spaced.split())

# Normalise every review element-wise with preprocess_text.
reviews = reviews.map(preprocess_text)

In [11]:
# Load the small English spaCy pipeline once; the helper functions below
# all share this `nlp` object.
nlp = spacy.load(name="en_core_web_sm")

In [12]:
def tokenize_and_clean(text):
    """Split ``text`` into alphabetic tokens that are not stop words.

    Only ``token.text`` and ``token.is_alpha`` are read here, so the text is
    passed through ``nlp.make_doc`` (tokenizer only) instead of the full
    ``nlp(...)`` pipeline. The tokens are the same, but the remaining
    pipeline components are not run for every review.

    Args:
        text: The review text to tokenize.

    Returns:
        list[str]: Token strings for which ``is_alpha`` is true and which
        are not in ``stop_words``, in their original order.
    """
    doc = nlp.make_doc(text)
    return [
        token.text
        for token in doc
        if token.is_alpha and token.text not in stop_words
    ]


tokenized_reviews = reviews.apply(tokenize_and_clean)

In [13]:
def pos_tagging(text):
    """Tag every token of ``text`` with its part of speech.

    Args:
        text: The text to run through the spaCy pipeline.

    Returns:
        list[tuple[str, str]]: One ``(token text, pos_ label)`` pair per
        token, in token order.
    """
    tagged = []
    for token in nlp(text):
        tagged.append((token.text, token.pos_))
    return tagged


# Tagging is only demonstrated on the first five reviews.
pos_tags = reviews.iloc[:5].apply(pos_tagging)

In [14]:
def ner_extraction(text):
    """Return the named entities spaCy finds in ``text``.

    Args:
        text: The text to analyse. Pass original text rather than the output
            of ``preprocess_text``: lower-casing and punctuation removal
            strip cues the entity recogniser relies on.

    Returns:
        list[tuple[str, str]]: One ``(entity text, entity label)`` pair per
        entity in ``doc.ents``.
    """
    return [(ent.text, ent.label_) for ent in nlp(text).ents]


# Run NER on the original review text, not on `reviews`: by this point
# `reviews` has been lower-cased and had its punctuation removed, which
# degrades entity recognition. These are the same first five non-null rows
# that `reviews[:5]` selects, just taken before preprocessing.
ner_entities = df['Text'].dropna().iloc[:5].apply(ner_extraction)

In [15]:
# Show a sample from each stage for the first review: the first 20 cleaned
# tokens, then the full POS-tag and named-entity lists.
print("Sample Tokens:\n", tokenized_reviews.iat[0][:20])
print("\nPOS Tags (First Review):\n", pos_tags.iat[0])
print("\nNamed Entities (First Review):\n", ner_entities.iat[0])

Sample Tokens:
 ['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky']

POS Tags (First Review):
 [('i', 'PRON'), ('have', 'AUX'), ('bought', 'VERB'), ('several', 'ADJ'), ('of', 'ADP'), ('the', 'DET'), ('vitality', 'NOUN'), ('canned', 'VERB'), ('dog', 'NOUN'), ('food', 'NOUN'), ('products', 'NOUN'), ('and', 'CCONJ'), ('have', 'AUX'), ('found', 'VERB'), ('them', 'PRON'), ('all', 'PRON'), ('to', 'PART'), ('be', 'AUX'), ('of', 'ADP'), ('good', 'ADJ'), ('quality', 'NOUN'), ('the', 'DET'), ('product', 'NOUN'), ('looks', 'VERB'), ('more', 'ADV'), ('like', 'ADP'), ('a', 'DET'), ('stew', 'NOUN'), ('than', 'ADP'), ('a', 'DET'), ('processed', 'VERB'), ('meat', 'NOUN'), ('and', 'CCONJ'), ('it', 'PRON'), ('smells', 'VERB'), ('better', 'ADJ'), ('my', 'PRON'), ('labrador', 'NOUN'), ('is', 'AUX'), ('finicky', 'ADJ'), ('and', 'CCONJ'), ('she', 'PRON'), ('appre