This notebook splits the reviews into clean sentences for sentiment analysis.

In [25]:
import html
import json, re, os

import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt sentence models. Recent NLTK releases (3.8.2+)
# load them from the 'punkt_tab' resource instead of the pickled 'punkt' one,
# so fetch both to keep the notebook runnable on old and new installs.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nkash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
def clean(text):
    """Normalise a raw review string before sentence splitting.

    Steps, in order:
      1. HTML tags are replaced by a space, so words on either side of a tag
         (e.g. ``good<br/>bad``) are not glued together.
      2. HTML character references (``&amp;``, ``&#39;`` ...) are decoded, so
         their names/numbers do not leak into the text as stray tokens.
      3. Every character other than word characters, whitespace and ``.,!?``
         is removed.
      4. Runs of spaces/tabs are collapsed to a single space (newlines are kept).
      5. The result is lower-cased and stripped of surrounding whitespace.

    Args:
        text: the raw review text (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # Tags first: decoding entities beforehand could turn "&lt;...&gt;" into
    # something that looks like a tag and gets deleted together with its content.
    text = re.sub(r'<[^>]+>', ' ', text)
    text = html.unescape(text)
    # remove anything except word chars, whitespace, .,!?
    text = re.sub(r'[^\w\s\.\,\!\?]', '', text)
    # collapse horizontal whitespace left behind by removed tags/symbols
    text = re.sub(r'[^\S\r\n]+', ' ', text)
    return text.lower().strip()


def split_sentences(text):
    """Return the sentences that NLTK's ``sent_tokenize`` finds in ``text`` (English model)."""
    sentences = sent_tokenize(text, language='english')
    return sentences

In [27]:
# Input: the merged reviews; output: the same records after cleaning.
IN_FILE = 'data/processed/merged.json'
OUT_FILE = 'data/processed/cleaned.json'

# Make sure the output folder is present before anything is written to it.
os.makedirs(os.path.split(OUT_FILE)[0], exist_ok=True)

In [28]:
# Load merged reviews
with open(IN_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Clean & split each review (entries are updated in place)
for entry in data:
    # A missing or null 'text' is treated as an empty review.
    raw = entry.get('text', '') or ''
    cleaned_block = clean(raw)
    entry['sentences'] = split_sentences(cleaned_block)
    # Undo the HTML-escaped ampersand in the genre label.
    entry["genre"] = re.sub(r'&amp;', '&', entry["genre"])
    # Convert the price string to a float (presumably formatted like "$1,234.50"
    # -- confirm against the merged data). Thousands separators are dropped, then
    # any leading non-numeric prefix (currency symbol, spaces) is stripped.
    # Blindly slicing off the first character would silently corrupt a price
    # that carries no symbol ("12.99" -> 2.99).
    price_text = re.sub(",", "", entry["price"])
    entry["price"] = float(re.sub(r'^[^\d.]+', '', price_text))
    # Keep only the author's name from the "Visit Amazon's <name> Page" link text.
    entry["author"] = re.sub(r"Visit Amazon's (.*) Page", r"\1", entry["author"])

# Write cleaned data to JSON file
with open(OUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

print(f"Cleaned {len(data)} reviews → {OUT_FILE}")

Cleaned 398639 reviews → data/processed/cleaned.json
