In [2]:
import pandas as pd
import os
import re

def load_domain_data(data_path):
    """Load labelled review texts from a directory of per-domain folders.

    Every immediate subdirectory of ``data_path`` is treated as one domain.
    Inside each one, ``positive.review`` and ``negative.review`` are read when
    present, and every ``<review_text>...</review_text>`` span becomes one row.

    Args:
        data_path: Directory whose subdirectories are the domains.

    Returns:
        pandas.DataFrame with the columns ``domain`` (subdirectory name),
        ``sentiment`` ('Positive' or 'Negative', taken from the file name) and
        ``review_text`` (the span contents with surrounding whitespace
        stripped). The three columns exist even when no review was found, so
        callers can index them without a KeyError.
    """
    columns = ['domain', 'sentiment', 'review_text']
    data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order identical on every filesystem, which a seeded shuffle relies on.
    for domain in sorted(os.listdir(data_path)):
        domain_path = os.path.join(data_path, domain)
        if not os.path.isdir(domain_path):
            continue

        for sentiment_file in ["positive.review", "negative.review"]:
            sentiment_path = os.path.join(domain_path, sentiment_file)
            sentiment = 'Positive' if 'positive' in sentiment_file else 'Negative'

            # isfile (not exists) so a directory with this name is skipped
            # instead of crashing open().
            if not os.path.isfile(sentiment_path):
                continue

            # Undecodable bytes are dropped rather than aborting the load.
            with open(sentiment_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # DOTALL lets a review body span several lines.
            reviews = re.findall(r'<review_text>(.*?)</review_text>', content, re.DOTALL)
            data.extend(
                {
                    'domain': domain,
                    'sentiment': sentiment,
                    'review_text': review.strip(),
                }
                for review in reviews
            )

    return pd.DataFrame(data, columns=columns)

# Root folder that holds one sub-folder per product domain.
DATASET_PATH = 'sorted_data'

# Load every review, then shuffle all rows once with a fixed seed and
# renumber the index from 0.
df = (
    load_domain_data(DATASET_PATH)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Successfully loaded {len(df)} reviews across {df['domain'].nunique()} domains.")
# Each heading is followed by its table on the next line.
print("\nDataset preview:", df.head(), sep="\n")
print("\nReviews per domain:", df['domain'].value_counts(), sep="\n")

Successfully loaded 38548 reviews across 25 domains.

Dataset preview:
                  domain sentiment  \
0           gourmet_food  Positive   
1                  video  Positive   
2              magazines  Positive   
3                    dvd  Negative   
4  cell_phones_&_service  Positive   

                                         review_text  
0  My mother lives in a rural part of the country...  
1  This film is about the battle in the Ia Drang ...  
2  Today I received the first issue - April 2006 ...  
3  THE FRIST time i saw this movie i hated it. ho...  
4  I agree it's a waste to get sizes you don't ne...  

Reviews per domain:
domain
video                     2000
health_&_personal_care    2000
dvd                       2000
toys_&_games              2000
apparel                   2000
sports_&_outdoors         2000
books                     2000
electronics               2000
kitchen_&_housewares      2000
music                     2000
camera_&_photo            1999
m

In [3]:
import nltk

In [4]:
# The preprocessing cell needs three NLTK resources: the English stop-word
# list, WordNet (for WordNetLemmatizer) and the punkt_tab tokenizer models.
# Fetch all of them so the notebook also runs on a fresh environment;
# nltk.download leaves packages that are already up to date alone.
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)
# Kept as the last expression so the cell still displays its return value.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to C:\Users\Angeline
[nltk_data]     Lewis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# English stop words held in a set, so the membership test done for every
# token is a hash lookup.
stop_words = set(stopwords.words('english'))
# One lemmatizer instance, created once and shared by all calls to the
# cleaning function.
lemmatizer = WordNetLemmatizer()

def clean_text_data(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, remove ``<...>`` markup, replace
    every non-letter character with a space, tokenize with NLTK, discard
    stop words and tokens of two characters or fewer, then lemmatize what
    is left.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)

    kept = []
    for word in nltk.word_tokenize(letters_only):
        # The filter runs on the raw token, before lemmatization.
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

print("Cleaning and preprocessing review text...")
# Run the cleaner over every review and store the result in a new column,
# leaving the raw text untouched.
cleaned_reviews = df['review_text'].apply(clean_text_data)
df['cleaned_review'] = cleaned_reviews
print("Preprocessing complete.")

# Show raw and cleaned text side by side for the first rows.
preview_columns = ['review_text', 'cleaned_review']
print("\nPreview of original vs. cleaned text:")
print(df[preview_columns].head())

Cleaning and preprocessing review text...
Preprocessing complete.

Preview of original vs. cleaned text:
                                         review_text  \
0  My mother lives in a rural part of the country...   
1  This film is about the battle in the Ia Drang ...   
2  Today I received the first issue - April 2006 ...   
3  THE FRIST time i saw this movie i hated it. ho...   
4  I agree it's a waste to get sizes you don't ne...   

                                      cleaned_review  
0  mother life rural part country ohio native tha...  
1  film battle drang valley vietnam begin french ...  
2  today received first issue april issue found a...  
3  frist time saw movie hated however saw still g...  
4  agree waste get size need idea public exchange...  


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Features are the cleaned review strings; the target for this model is the
# product domain.
X = df['cleaned_review']
y_domain = df['domain']

# Hold out 20% of the rows, stratified so each domain keeps its share in
# both splits.
X_train, X_test, y_train_domain, y_test_domain = train_test_split(
    X,
    y_domain,
    test_size=0.2,
    random_state=42,
    stratify=y_domain,
)

# TF-IDF over unigrams and bigrams (capped at 5000 features) feeding a
# logistic-regression classifier.
domain_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000)),
]
domain_pipeline = Pipeline(steps=domain_steps)

print("\nTraining the Domain Classification model...")
domain_pipeline.fit(X_train, y_train_domain)
print("Training complete.")

# Score on the held-out rows only.
domain_predictions = domain_pipeline.predict(X_test)
domain_report = classification_report(y_test_domain, domain_predictions)
print("\n--- Domain Classification Report ---")
print("This shows how well the model predicts the product category.")
print(domain_report)


Training the Domain Classification model...
Training complete.

--- Domain Classification Report ---
This shows how well the model predicts the product category.
                        precision    recall  f1-score   support

               apparel       0.61      0.76      0.67       400
            automotive       0.85      0.46      0.60       147
                  baby       0.75      0.75      0.75       380
                beauty       0.83      0.84      0.83       299
                 books       0.87      0.94      0.90       400
        camera_&_photo       0.84      0.80      0.82       400
 cell_phones_&_service       0.83      0.63      0.72       205
computer_&_video_games       0.87      0.83      0.85       292
                   dvd       0.57      0.54      0.55       400
           electronics       0.57      0.68      0.62       400
          gourmet_food       0.67      0.63      0.65       242
               grocery       0.69      0.70      0.70       271
heal

In [4]:

# Target for this model is the Positive/Negative label. The feature series X
# is not defined in this cell and must already exist in the kernel.
y_sentiment = df['sentiment']

# 80/20 split stratified on sentiment. NOTE: this rebinds X_train / X_test,
# replacing whatever split those names held before this cell ran.
X_train, X_test, y_train_sentiment, y_test_sentiment = train_test_split(
    X,
    y_sentiment,
    test_size=0.2,
    random_state=42,
    stratify=y_sentiment,
)

# TF-IDF over unigrams and bigrams (capped at 5000 features) feeding a
# logistic-regression classifier.
sentiment_steps = [
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000)),
]
sentiment_pipeline = Pipeline(steps=sentiment_steps)

print("\nTraining the Sentiment Analysis model...")
sentiment_pipeline.fit(X_train, y_train_sentiment)
print("Training complete.")

# Score on the held-out rows only.
sentiment_predictions = sentiment_pipeline.predict(X_test)
sentiment_report = classification_report(y_test_sentiment, sentiment_predictions)
print("\n--- Sentiment Analysis Report ---")
print("This shows how well the model predicts if a review is positive or negative.")
print(sentiment_report)


Training the Sentiment Analysis model...
Training complete.

--- Sentiment Analysis Report ---
This shows how well the model predicts if a review is positive or negative.
              precision    recall  f1-score   support

    Negative       0.85      0.81      0.83      3315
    Positive       0.86      0.89      0.88      4395

    accuracy                           0.86      7710
   macro avg       0.86      0.85      0.85      7710
weighted avg       0.86      0.86      0.86      7710



In [7]:
!pip install spacy
!python -m spacy download en_core_web_sm




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----------------------------------- --- 11.5/12.8 MB 60.7 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 51.6 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
!pip install jupyter ipykernel




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import spacy
from spacy import displacy


# Load spaCy's small English pipeline; it must already be installed.
nlp = spacy.load('en_core_web_sm')

print("\n--- Named Entity Recognition Examples ---")
for domain in df['domain'].unique():
    print(f"\n--- DOMAIN: {domain.upper()} ---")

    # The first review of this domain, in the frame's current row order,
    # serves as the example.
    sample_review = df.loc[df['domain'] == domain, 'review_text'].iloc[0]
    doc = nlp(sample_review)

    # Only the first 200 characters are echoed; the full text was analysed.
    print(f"Review Snippet: \"{sample_review[:200]}...\"")

    if not doc.ents:
        print("  -> No entities found by SpaCy in this review.")
        continue

    for ent in doc.ents:
        print(f"  -> Entity: '{ent.text}', Label: '{ent.label_}' ({spacy.explain(ent.label_)})")


--- Named Entity Recognition Examples ---

--- DOMAIN: GOURMET_FOOD ---
Review Snippet: "My mother lives in a rural part of the country in Ohio (she is a native of Thailand) anyway being in a small town, it is not easy to find Thai anything, this makes it so easy to send a little taste of..."
  -> Entity: 'Ohio', Label: 'GPE' (Countries, cities, states)
  -> Entity: 'Thailand', Label: 'GPE' (Countries, cities, states)
  -> Entity: 'Thai', Label: 'NORP' (Nationalities or religious or political groups)

--- DOMAIN: VIDEO ---
Review Snippet: "This film is about the battle in the Ia Drang valley in 1965 Vietnam. It begins in 1954 when a French patrol is attacked and defeated in a surprise ambush. A prophecy for the future? In one scene a wo..."
  -> Entity: 'Ia Drang', Label: 'GPE' (Countries, cities, states)
  -> Entity: '1965', Label: 'DATE' (Absolute or relative dates or periods)
  -> Entity: 'Vietnam', Label: 'GPE' (Countries, cities, states)
  -> Entity: '1954', Label: 'DATE' (Absolu