<a href="https://colab.research.google.com/github/Darrystic/AI-Tools-Assignment/blob/main/Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# spaCy NER and rule-based sentiment on Amazon reviews (notebooks/03_spacy_ner_reviews.ipynb)


In [5]:
# === Kaggle API Setup ===
!pip install -q kaggle

import os
import bz2
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher

# === Step 1: Locate and extract the correct dataset file ===
# Paths are relative to the notebook's working directory; the later steps read
# `extracted_file`.
compressed_file = "data/train.ft.txt.bz2"
extracted_file = "data/train.ft.txt"

# Upper bound on the number of lines decompressed, to save time and memory.
MAX_EXTRACT_LINES = 20_000

# If the dataset was unzipped but is still compressed as .bz2, decompress a prefix of it
if os.path.exists(compressed_file) and not os.path.exists(extracted_file):
    print("Decompressing Amazon Reviews dataset...")
    # Write to a temporary sibling file and rename it only after the copy has
    # finished, so an interrupted run cannot leave a truncated file that the
    # existence check above would later accept as complete.
    partial_file = extracted_file + ".part"
    lines_written = 0
    with bz2.open(compressed_file, "rt", encoding="utf-8") as bzfile, \
            open(partial_file, "w", encoding="utf-8") as outfile:
        # Counting from 1 makes `lines_written` the exact number of lines copied,
        # so the loop stops after MAX_EXTRACT_LINES lines (not two lines later).
        for lines_written, line in enumerate(bzfile, start=1):
            outfile.write(line)
            if lines_written >= MAX_EXTRACT_LINES:
                break
    os.replace(partial_file, extracted_file)
    # Report the real count: the archive may hold fewer lines than the cap.
    print(f"Partial extraction complete ({lines_written:,} lines).")

# Check the extracted file exists
if not os.path.exists(extracted_file):
    raise FileNotFoundError("train.ft.txt still not found. Check if the Kaggle dataset unzipped properly.")

# === Step 2: Load a manageable sample ===
# Each line is parsed as "<label> <review text>", where the label carries a
# "__label__" prefix that is stripped. Only the first 500 lines are considered.
sample_lines = []
with open(extracted_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 500:
            break
        # partition() never raises; a blank line or a line with no space yields
        # an empty separator and is skipped instead of aborting the whole load.
        label, separator, text = line.partition(" ")
        if not separator:
            continue
        sample_lines.append({"label": label.replace("__label__", ""), "review": text.strip()})

# Naming the columns keeps "label" and "review" present even when no line parsed.
df = pd.DataFrame(sample_lines, columns=["label", "review"])
print(df.head())

# === Step 3: NLP Analysis with spaCy ===
# Download the small English pipeline only when it is not installed yet, instead
# of re-running the download on every execution of the cell. spacy.cli.download
# installs into the interpreter that is running this kernel.
if not spacy.util.is_package("en_core_web_sm"):
    import importlib
    from spacy.cli import download as download_model

    download_model("en_core_web_sm")
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()

nlp = spacy.load("en_core_web_sm")

# attr="LOWER" makes the brand matching case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

brands = ["Amazon", "Sony", "Apple", "Samsung", "HP", "Lenovo"]
matcher.add("BRAND", [nlp.make_doc(b) for b in brands])

# Tiny hand-written sentiment lexicon used by the rule-based scorer below.
pos_words = {"great", "excellent", "love", "good", "perfect", "amazing"}
neg_words = {"bad", "terrible", "poor", "hate", "disappointing"}

# Analyse the first three reviews; head(3) simply yields fewer rows when the
# sample is shorter, where indexing rows 0..2 by label would raise KeyError.
for text in df["review"].head(3):
    doc = nlp(text)
    found_brands = [doc[start:end].text for _, start, end in matcher(doc)]

    # Collect both the lemma and the lower-cased surface form of every token:
    # the lexicon contains inflected entries ("disappointing", "amazing") that
    # lemmatisation may reduce to a different string.
    tokens = {form for t in doc for form in (t.lemma_.lower(), t.lower_)}

    # Score = distinct positive words present minus distinct negative words present.
    score = len(tokens & pos_words) - len(tokens & neg_words)
    if score > 0:
        sentiment = "positive"
    elif score < 0:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    print(f"\nReview: {text[:120]}...")
    print(f"Detected brands: {found_brands}")
    print(f"spaCy entities: {[ (ent.text, ent.label_) for ent in doc.ents ]}")
    print(f"Rule sentiment: {sentiment}")


Decompressing Amazon Reviews dataset...
Partial extraction complete (20,000 lines).
  label                                             review
0     2  Stuning even for the non-gamer: This sound tra...
1     2  The best soundtrack ever to anything.: I'm rea...
2     2  Amazing!: This soundtrack is my favorite music...
3     2  Excellent Soundtrack: I truly like this soundt...
4     2  Remember, Pull Your Jaw Off The Floor After He...
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the 