In [10]:
import pandas as pd
import os 

os.listdir('..')

['LICENSE',
 'requirements.txt',
 'code',
 'README.md',
 '.gitignore',
 '.git',
 'data',
 'notebooks']

| Field            | Type   | Explanation                                                                 |
|------------------|--------|------------------------------------------------------------------------------|
| main_category    | str    | Main category (i.e., domain) of the product.                                 |
| title            | str    | Name of the product.                                                         |
| average_rating   | float  | Rating of the product shown on the product page.                             |
| rating_number    | int    | Number of ratings in the product.                                            |
| features         | list   | Bullet-point format features of the product.                                 |
| description      | list   | Description of the product.                                                  |
| price            | float  | Price in US dollars (at time of crawling).                                   |
| images           | list   | Images of the product. Each image has different sizes (thumb, large, hi_res). The “variant” field shows the position of image. |
| videos           | list   | Videos of the product including title and url.                               |
| store            | str    | Store name of the product.                                                   |
| categories       | list   | Hierarchical categories of the product.                                      |
| details          | dict   | Product details, including materials, brand, sizes, etc.                     |
| parent_asin      | str    | Parent ID of the product.                                                    |
| bought_together  | list   | Recommended bundles from the websites.                                       |

In [11]:
data = pd.read_json('../data/Books_10k.jsonl', lines=True)

In [12]:
data.shape

(10000, 10)

In [13]:
def map_sentiment(rating, numeric=False):
    mapping = {
        "negative": 0,
        "neutral": 1,
        "positive": 2
    }
    
    if pd.isna(rating):
        return None
    
    if rating <= 2:
        sentiment = "negative"
    elif rating == 3:
        sentiment = "neutral"
    else:
        sentiment = "positive"
    
    return mapping[sentiment] if numeric else sentiment


# Example usage
data["sentiment_str"] = data["rating"].apply(lambda x: map_sentiment(x, numeric=False))
data["sentiment"] = data["rating"].apply(lambda x: map_sentiment(x, numeric=True))

df = data.dropna(subset=["rating", "sentiment"])

In [14]:
df

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment_str,sentiment
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0
1,1,Missing the sketch pad,Missing the sketch pad. Even worse I realized ...,[],1631591290,1631591290,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-08-05 04:28:04.910,0,True,negative,0
2,1,Crease down entire side of every page!!!,Every page has a crease running the entire len...,[{'small_image_url': 'https://images-na.ssl-im...,1780671067,1780671067,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-01-26 01:07:03.325,2,True,negative,0
3,1,Written From a Lens of Fear.,Only read and believe things you want to see c...,[],0929385225,0929385225,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,2021-04-05 01:16:52.328,0,False,negative,0
4,1,Good if your little one is unsure/scared of th...,My little one just likes doctors so I thought ...,[],0593426452,0593426452,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2022-03-18 04:24:46.871,1,True,negative,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,Worth every cent I paid,Apparently there are readers who enjoyed this ...,[],B00AAGZ1S0,B00AAGZ1S0,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-06-24 00:18:07.000,2,True,negative,0
9996,2,Confusing,The story takes place over the past fifty year...,[],0544077792,0544077792,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2013-03-28 02:06:55.000,1,False,negative,0
9997,2,The Case of the Missing Plot,I picked this up because the premise sounded i...,[],B0048EL7YW,B0048EL7YW,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-05 19:07:48.000,0,False,negative,0
9998,2,Poorly written biography of a fascinating woman,Calling Julia Child 'remarkable' is an underst...,[],0307272222,0307272222,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-12-01 19:41:44.000,37,False,negative,0


### MANCA CONTROLLARE DUPLICATI !

In [6]:
data[data.duplicated(subset=['user_id', 'asin', 'parent_asin'])]

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment_str,sentiment
699,1,Deserves ZERO stars,Poorly written. Not much useful information. ...,[],B0BMT2PL6G,B0BMT2PL6G,AFZGIAQOQORI5HDS22LMNLE422OA,2023-02-21 19:38:52.495,0,True,negative,0
700,1,Save your money,What good is a map if you can' t read it? Man...,[],1440354642,1440354642,AFZGIAQOQORI5HDS22LMNLE422OA,2022-10-25 11:47:04.661,0,True,negative,0


In [7]:
df['text_array'] = df['text'].str.split('.')

In [8]:
df.explode(column='text_array')['text'].iloc[0]

'It is definitely not a watercolor book.  The paper bucked completely.  The pages honestly appear to be photo copies of other pictures. I say that bc if you look at the seal pics you can see the tell tale line at the bottom of the page.  As someone who has made many photocopies of pages in my time so I could try out different colors & mediums that black line is a dead giveaway to me. It’s on other pages too.  The entire book just seems off. Nothing is sharp & clear. There is what looks like toner dust on all the pages making them look muddy.  There are no sharp lines & there is no clear definition.  At least there isn’t in my copy.  And the Coloring Book for Adult on the bottom of the front cover annoys me. Why is it singular & not plural?  They usually say coloring book for kids or coloring book for kids & adults or coloring book for adults- plural.  Lol  Plus it would work for kids if you can get over the grey scale nature of it.  Personally I’m not going to waste expensive pens & pa

In [9]:
df = df.explode(column="text".apply(lambda x: nltk.sent_tokenize(x)))

AttributeError: 'str' object has no attribute 'apply'

In [66]:
df.explode(column='text_array')

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment,text_array
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,It is definitely not a watercolor book
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,The paper bucked completely
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,The pages honestly appear to be photo copies...
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,I say that bc if you look at the seal pics yo...
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,As someone who has made many photocopies of ...
...,...,...,...,...,...,...,...,...,...,...,...,...
9999,2,Dr von Igelfeld afield,Professor Dr Moritz-Maria von Igelfeld leaves ...,[],1400095093,1400095093,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-11-17 23:52:38.000,0,False,negative,<br /><br />Fans of McCall Smith's other serie...
9999,2,Dr von Igelfeld afield,Professor Dr Moritz-Maria von Igelfeld leaves ...,[],1400095093,1400095093,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-11-17 23:52:38.000,0,False,negative,Dr von Igelfeld is not as nearly sympathetic...
9999,2,Dr von Igelfeld afield,Professor Dr Moritz-Maria von Igelfeld leaves ...,[],1400095093,1400095093,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-11-17 23:52:38.000,0,False,negative,Dr von Igelfeld's exploits lean more towards...
9999,2,Dr von Igelfeld afield,Professor Dr Moritz-Maria von Igelfeld leaves ...,[],1400095093,1400095093,AHXWUCTMVBQXDVMDFPA3NO43QF2Q,2012-11-17 23:52:38.000,0,False,negative,Unlike the other series I am not particularl...


### SPLIT SENTENCES

In [15]:
import nltk
nltk.download("punkt")

# Function to split text into sentences safely
def split_into_sentences(text):
    try:
        return nltk.sent_tokenize(text)
    except:
        return [text]  # fallback if text is invalid

# Apply sentence splitting
df["sentences"] = df["text"].apply(split_into_sentences)

# Each sentence gets the same sentiment as the review
df = df.explode("sentences").rename(columns={"sentences": "sentence"})

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### TEXT CLEANING

In [16]:
import re
import string

def clean_text(text):
    text = text.lower()                                     # lowercase
    text = re.sub(r"http\S+|www\S+", "", text)              # remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)                   # remove mentions/hashtags
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r"\d+", "", text)                         # remove numbers
    text = re.sub(r"\s+", " ", text).strip()                # normalize whitespace
    return text

In [17]:
df["clean_text"] = df["sentence"].apply(clean_text)

In [18]:
df['clean_text']

0                  it is definitely not a watercolor book
0                             the paper bucked completely
0       the pages honestly appear to be photo copies o...
0       i say that bc if you look at the seal pics you...
0       as someone who has made many photocopies of pa...
                              ...                        
9999    this short volume like the rest of this series...
9999    it is not necessary to read the earlier novels...
9999    dr von igelfeld is not as nearly sympathetic a...
9999    dr von igelfelds exploits lean more towards th...
9999    unlike the other series i am not particularly ...
Name: clean_text, Length: 68881, dtype: object

In [19]:
import spacy
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

In [20]:
spacy_tokenizer(df['clean_text'].iloc[0])

['definitely', 'watercolor', 'book']

In [111]:
spacy_tokenizer(df['clean_text'].iloc[1])

['paper', 'buck', 'completely']

In [112]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [t for t in tokens if t not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [117]:
df['clean_text'].values

array(['it is definitely not a watercolor book',
       'the paper bucked completely',
       'the pages honestly appear to be photo copies of other pictures',
       ...,
       'dr von igelfeld is not as nearly sympathetic a character as others of mccall smiths nor is there an ongoing story being told',
       'dr von igelfelds exploits lean more towards the absurd than the other series',
       'unlike the other series i am not particularly anxious to read the next book in this series'],
      dtype=object)

In [125]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(t) for t in tokens]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
df['clean_text_lemm'] = lemmatize_tokens(df['clean_text'])

In [130]:
lemmatize_tokens(df['clean_text'].values)

['it is definitely not a watercolor book',
 'the paper bucked completely',
 'the pages honestly appear to be photo copies of other pictures',
 'i say that bc if you look at the seal pics you can see the tell tale line at the bottom of the page',
 'as someone who has made many photocopies of pages in my time so i could try out different colors mediums that black line is a dead giveaway to me',
 'it’s on other pages too',
 'the entire book just seems off',
 'nothing is sharp clear',
 'there is what looks like toner dust on all the pages making them look muddy',
 'there are no sharp lines there is no clear definition',
 'at least there isn’t in my copy',
 'and the coloring book for adult on the bottom of the front cover annoys me',
 'why is it singular not plural',
 'they usually say coloring book for kids or coloring book for kids adults or coloring book for adults plural',
 'lol plus it would work for kids if you can get over the grey scale nature of it',
 'personally i’m not going to w

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = tfidf.fit_transform(df["clean_text"])

In [1]:
import torch
from transformers import pipeline
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


### MODEL TRAINING

In [41]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.utils.class_weight import compute_class_weight
RANDOM_SEED = 42

classes = df['sentiment'].unique()
weights = compute_class_weight('balanced', classes=classes, y=df['sentiment'])
dict(zip(classes, weights))

from sklearn.metrics import classification_report
# Target labels
y = df["sentiment"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_SEED),
    "Logistic Regression w": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED),
    "Linear SVM": LinearSVC(random_state=RANDOM_SEED),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
}

for name, clf in models.items():
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print(f"\n{name}\n", classification_report(y_test, preds))


Naive Bayes
               precision    recall  f1-score   support

           0       0.56      0.63      0.60      4809
           1       0.51      0.08      0.13      3127
           2       0.58      0.78      0.67      5841

    accuracy                           0.57     13777
   macro avg       0.55      0.50      0.47     13777
weighted avg       0.56      0.57      0.52     13777


Logistic Regression
               precision    recall  f1-score   support

           0       0.59      0.61      0.60      4809
           1       0.42      0.19      0.27      3127
           2       0.60      0.75      0.67      5841

    accuracy                           0.58     13777
   macro avg       0.54      0.52      0.51     13777
weighted avg       0.55      0.58      0.55     13777


Logistic Regression w
               precision    recall  f1-score   support

           0       0.59      0.59      0.59      4809
           1       0.34      0.42      0.38      3127
           2   

In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.utils.class_weight import compute_class_weight
RANDOM_SEED = 42

classes = df['sentiment'].unique()
weights = compute_class_weight('balanced', classes=classes, y=df['sentiment'])
dict(zip(classes, weights))

from sklearn.metrics import classification_report
# Target labels
y = df["sentiment"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

clf = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.59      0.59      0.59      4809
           1       0.34      0.42      0.38      3127
           2       0.66      0.58      0.62      5841

    accuracy                           0.55     13777
   macro avg       0.53      0.53      0.53     13777
weighted avg       0.56      0.55      0.55     13777



In [113]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

RANDOM_SEED = 42

classes = df['sentiment'].unique()
weights = compute_class_weight('balanced', classes=classes, y=df['sentiment'])
dict(zip(classes, weights))

X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["sentiment"], test_size=0.2, random_state=42
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=1000,  class_weight='balanced', random_state=RANDOM_SEED))
])

model.fit(X_train, y_train)

preds = model.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.59      0.58      0.58      4809
           1       0.34      0.41      0.37      3127
           2       0.66      0.59      0.62      5841

    accuracy                           0.54     13777
   macro avg       0.53      0.53      0.52     13777
weighted avg       0.56      0.54      0.55     13777



In [114]:
import joblib
joblib.dump(model, "sentiment_model.pkl")


['sentiment_model.pkl']

In [None]:
import requests

url = "http://127.0.0.1:8002/predict"  # note: 8000
payload = {"text": "This is quite good, but can be. "}

r = requests.post(url, json=payload)  # JSON, not form data
print(r.status_code)   # should be 200
print(r.json())        # {'sentiment': 'positive'} (example)

200
{'sentiment': 'positive'}


'Every page has a crease running the entire length of the book about an inch and a half from the edges.'

In [248]:
mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
data['sentiment_pred']= pd.Series(pipeline.predict(data['text'])).replace(mapping)

In [249]:
print(classification_report(data['sentiment_str'], data['sentiment_pred']))

              precision    recall  f1-score   support

    negative       0.87      0.77      0.82      4000
     neutral       0.52      0.80      0.63      2000
    positive       0.90      0.77      0.83      4000

    accuracy                           0.77     10000
   macro avg       0.76      0.78      0.76     10000
weighted avg       0.81      0.77      0.78     10000



In [251]:
mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
df['sentiment_pred']= pd.Series(pipeline.predict(df['sentence'])).replace(mapping)

In [252]:
print(classification_report(df['sentiment_str'],df['sentiment_pred']))

              precision    recall  f1-score   support

    negative       0.34      0.71      0.46     23717
     neutral       0.23      0.15      0.18     15581
    positive       0.43      0.13      0.20     29583

    accuracy                           0.33     68881
   macro avg       0.33      0.33      0.28     68881
weighted avg       0.35      0.33      0.29     68881



In [207]:
import requests, time

texts = df['clean_text'].iloc[1000]
start = time.time()
for t in texts:
    requests.post("http://127.0.0.1:8002/predict", json={"text": t})
end = time.time()
print(f"Average latency: {(end-start)/len(texts)*1000:.2f} ms")

Average latency: 3.51 ms


In [192]:

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download if not already done
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters, punctuation, digits
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove non-alphabetic characters and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords and apply stemming
    tokens = text.split()

    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    return " ".join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [198]:
import re
import html
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Downloads (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
STOP_WORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

CONTRACTIONS = {
    "can't":"cannot", "won't":"will not", "n't":" not", "'re":" are", "'s":" is",
    "'d":" would", "'ll":" will", "'t":" not", "'ve":" have", "'m":" am"
}

def _expand_contractions(text: str) -> str:
    for k, v in CONTRACTIONS.items():
        text = re.sub(k, v, text)
    return text

def _get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    if tag.startswith('V'): return wordnet.VERB
    if tag.startswith('N'): return wordnet.NOUN
    if tag.startswith('R'): return wordnet.ADV
    return wordnet.NOUN

def clean_text2(text):
    if not isinstance(text, str):
        return ""

    # Unescape HTML
    text = html.unescape(text)

    # Lowercase
    text = text.lower()

    # URLs
    text = re.sub(r'http\S+|www\S+|https?\S+', ' ', text)

    # Expand contractions (keeps negation words)
    text = _expand_contractions(text)

    # Keep emojis if you want sentiment info; here we drop non text/emoji:
    # text = emoji.replace_emoji(text, replace=' EMOJI ')  # if using emoji lib

    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Optional: join negation with following word to preserve scope
    joined = []
    skip_next = False
    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue
        if tok == "not" and i + 1 < len(tokens):
            joined.append(f"not_{tokens[i+1]}")
            skip_next = True
        else:
            joined.append(tok)
    tokens = joined

    # Remove punctuation tokens & digits
    tokens = [t for t in tokens if any(c.isalpha() for c in t)]
    tokens = [t.translate(str.maketrans('', '', string.punctuation)) for t in tokens]
    tokens = [t for t in tokens if t]  # drop empties

    # POS tagging for better lemmatization
    pos_tags = nltk.pos_tag(tokens)
    tokens = [LEMMATIZER.lemmatize(w, _get_wordnet_pos(p)) for w, p in pos_tags]

    # Stopwords AFTER lemmatization; keep negation tokens
    tokens = [t for t in tokens if t not in STOP_WORDS or t.startswith("not_")]

    # Collapse whitespace
    return " ".join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/andreaierardi/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [199]:
df['clean_text2'] = df['sentence'].apply(clean_text)

df['clean_text3'] = df['sentence'].apply(clean_text2)
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,sentiment_str,sentiment,sentence,clean_text,clean_text2,cleaned_reviewText,clean_text3
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0,It is definitely not a watercolor book.,it is definitely not a watercolor book,definit watercolor book,definitely watercolor book,definitely nota watercolor book
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0,The paper bucked completely.,the paper bucked completely,paper buck complet,paper bucked completely,paper buck completely
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0,The pages honestly appear to be photo copies o...,the pages honestly appear to be photo copies o...,page honestli appear photo copi pictur,pages honestly appear photo copies pictures,page honestly appear photo copy picture
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0,I say that bc if you look at the seal pics you...,i say that bc if you look at the seal pics you...,say bc look seal pic see tell tale line bottom...,say bc look seal pics see tell tale line botto...,say bc look seal pic see tell tale line bottom...
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True,negative,0,As someone who has made many photocopies of pa...,as someone who has made many photocopies of pa...,someon made mani photocopi page time could tri...,someone made many photocopies pages time could...,someone make many photocopy page time could tr...


In [200]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['clean_text3'])

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

svm_model = LinearSVC(class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5702983232924439
              precision    recall  f1-score   support

           0       0.58      0.62      0.60      4744
           1       0.39      0.36      0.37      3116
           2       0.65      0.64      0.65      5917

    accuracy                           0.57     13777
   macro avg       0.54      0.54      0.54     13777
weighted avg       0.57      0.57      0.57     13777



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['clean_text2'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

svm_model = LinearSVC(class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5686288742106409
              precision    recall  f1-score   support

           0       0.58      0.62      0.60      4744
           1       0.39      0.36      0.37      3116
           2       0.65      0.64      0.64      5917

    accuracy                           0.57     13777
   macro avg       0.54      0.54      0.54     13777
weighted avg       0.57      0.57      0.57     13777



In [136]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

svm_model = LinearSVC(class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5684111199825796
              precision    recall  f1-score   support

           0       0.58      0.62      0.60      4744
           1       0.39      0.36      0.37      3116
           2       0.65      0.64      0.64      5917

    accuracy                           0.57     13777
   macro avg       0.54      0.54      0.54     13777
weighted avg       0.56      0.57      0.57     13777



In [None]:
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5639108659359803
              precision    recall  f1-score   support

           0       0.59      0.60      0.59      4744
           1       0.37      0.43      0.40      3116
           2       0.67      0.61      0.64      5917

    accuracy                           0.56     13777
   macro avg       0.54      0.54      0.54     13777
weighted avg       0.57      0.56      0.57     13777



In [169]:
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Join tokens back to text
    return ' '.join(tokens)

In [170]:
df['cleaned_reviewText'] = df['sentence'].apply(lambda x: preprocess_text(x) if pd.notnull(x) else '')

In [171]:
df['cleaned_reviewText'] 

0                              definitely watercolor book
0                                 paper bucked completely
0             pages honestly appear photo copies pictures
0       say bc look seal pics see tell tale line botto...
0       someone made many photocopies pages time could...
                              ...                        
9999    short volume like rest series one also pages l...
9999    necessary read earlier novels appreciate one e...
9999    dr von igelfeld nearly sympathetic character o...
9999    dr von igelfeld exploits lean towards absurd s...
9999    unlike series particularly anxious read next b...
Name: cleaned_reviewText, Length: 68881, dtype: object

In [180]:
df[['clean_text2', 'cleaned_reviewText']]

Unnamed: 0,clean_text2,cleaned_reviewText
0,definit watercolor book,definitely watercolor book
0,paper buck complet,paper bucked completely
0,page honestli appear photo copi pictur,pages honestly appear photo copies pictures
0,say bc look seal pic see tell tale line bottom...,say bc look seal pics see tell tale line botto...
0,someon made mani photocopi page time could tri...,someone made many photocopies pages time could...
...,...,...
9999,short volum like rest seri one also page long ...,short volume like rest series one also pages l...
9999,necessari read earlier novel appreci one even ...,necessary read earlier novels appreciate one e...
9999,dr von igelfeld nearli sympathet charact other...,dr von igelfeld nearly sympathetic character o...
9999,dr von igelfeld exploit lean toward absurd seri,dr von igelfeld exploits lean towards absurd s...


In [178]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['cleaned_reviewText'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [179]:
svm_model = LinearSVC(class_weight='balanced')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5594831966320679
              precision    recall  f1-score   support

           0       0.57      0.61      0.59      4744
           1       0.38      0.35      0.36      3116
           2       0.65      0.62      0.63      5917

    accuracy                           0.56     13777
   macro avg       0.53      0.53      0.53     13777
weighted avg       0.56      0.56      0.56     13777



In [202]:
import re, string, html
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Minimal cleaner as a transformer (so it’s serialized with the pipeline)
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        def clean(t):
            if not isinstance(t, str): return ""
            t = html.unescape(t).lower()
            t = re.sub(r'http\S+|www\S+', ' ', t)
            t = re.sub(r'<[^>]+>', ' ', t)
            t = re.sub(r'\d+', ' ', t)
            t = t.translate(str.maketrans('', '', string.punctuation))
            t = re.sub(r'\s+', ' ', t).strip()
            return t
        return np.array([clean(x) for x in X])

pipeline = Pipeline([
    ("clean", TextCleaner()),
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

# X_text: your raw text column; y: labels (negative/neutral/positive)
pipeline.fit(df['sentence'], y)
joblib.dump(pipeline, "sentiment_pipeline.pkl")

['sentiment_pipeline.pkl']

In [246]:
pipeline

0,1,2
,steps,"[('clean', ...), ('tfidf', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000
