# Project: Sentiment Analysis on Product Reviews

Dataset: [Women’s Clothing E-Commerce Reviews](https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews)

## Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import kagglehub

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


## Config / Hyperparameters

In [None]:
# Hugging Face Hub checkpoint id; the tokenizer, config and encoder weights
# are all fetched from it with `from_pretrained(MODEL_NAME)`.
MODEL_NAME = "LiYuan/amazon-review-sentiment-analysis"

# This notebook uses 3-class sentiment labels (0=neg, 1=neutral, 2=pos).
# The checkpoint's own classification head is not reused: the model cell sets
# `config.num_labels = 3`, builds a fresh 3-output head and fine-tunes it.

# Number of examples per batch in the tf.data pipelines.
BATCH_SIZE = 16

# Maximum token length passed to the tokenizer (longer inputs are truncated).
MAX_LEN = 256

# Random seed used by the data-splitting cells.
SEED = 42


## Load the Dataset

In [None]:
# Download (or reuse the cached copy of) the Kaggle dataset; `path` is the
# local directory that kagglehub returns.
path = kagglehub.dataset_download("nicapotato/womens-ecommerce-clothing-reviews")

# The dataset ships as a single CSV inside that directory.
csv_file = f"{path}/Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(csv_file)

df.head()


Using Colab cache for faster access to the 'womens-ecommerce-clothing-reviews' dataset.


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


## Preprocessing

In [None]:
# Columns that play no role in the text -> sentiment task.
# "Recommended IND" is kept for now; add it here if it turns out to be unneeded.
unused_columns = [
    "Unnamed: 0",
    "Clothing ID",
    "Age",
    "Positive Feedback Count",
    "Division Name",
    "Department Name",
    "Class Name",
]
df = df.drop(columns=unused_columns)

In [None]:
# Keep a row only if it carries some usable text: a non-blank Title OR a
# non-blank Review Text. Missing values are treated as empty strings so that
# NaN and whitespace-only cells are handled the same way.
#
# The previous version combined the two "is not blank" checks with `&`, which
# dropped a row as soon as EITHER field was a blank string, and it let a
# NaN + whitespace-only pair through because `NaN != ""` evaluates to True.
# (Stricter alternative: require Review Text itself to be non-blank.)
has_title = df["Title"].fillna("").str.strip() != ""
has_review = df["Review Text"].fillna("").str.strip() != ""

# .copy() gives later cells an independent frame, so adding columns to it does
# not raise SettingWithCopyWarning.
df = df[has_title | has_review].copy()

In [None]:
# Build the model input from title and review body.
# Missing values become empty strings and surrounding whitespace is removed.
title_clean = df["Title"].fillna("").str.strip()
review_clean = df["Review Text"].fillna("").str.strip()

# Only insert the ". " separator when BOTH parts are present. Unconditionally
# joining produced texts such as ". Absolutely wonderful ..." for reviews
# without a title; that stray "." is fed to the model and also counts as a
# word in the later minimum-length filter.
both_present = (title_clean != "") & (review_clean != "")
joined = title_clean + ". " + review_clean
single_part = title_clean + review_clean  # at most one side is non-empty here

# assign() returns a new frame instead of writing into a possibly filtered view.
df = df.assign(text=joined.where(both_present, single_part))

In [None]:
def rating_to_sentiment(r):
    """Map a star rating to a 3-class sentiment id.

    Returns 0 (negative) for ratings of 2 or less, 1 (neutral) for a rating
    of exactly 3, and 2 (positive) for everything else.
    """
    if r <= 2:
        return 0  # negative
    if r == 3:
        return 1  # neutral
    return 2  # positive

# Derive the 3-class label from the star rating, then show the class counts.
sentiment_labels = df["Rating"].apply(rating_to_sentiment)
df["sentiment"] = sentiment_labels
df["sentiment"].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
2,17449
1,2823
0,2370


In [None]:
# Drop reviews with fewer than MIN_WORDS whitespace-separated tokens.
# (To inspect what gets removed, look at df[word_counts < MIN_WORDS].)
MIN_WORDS = 5
word_counts = df["text"].str.split().str.len()
df = df[word_counts >= MIN_WORDS]

In [None]:
# Preview the two columns used for modelling.
preview_columns = ["text", "sentiment"]
df[preview_columns].head()

Unnamed: 0,text,sentiment
0,. Absolutely wonderful - silky and sexy and co...,2
1,. Love this dress! it's sooo pretty. i happe...,2
2,Some major design flaws. I had such high hopes...,1
3,"My favorite buy!. I love, love, love this jump...",2
4,Flattering shirt. This shirt is very flatterin...,2


In [None]:
# Share of each sentiment class (0=neg, 1=neutral, 2=pos) among the remaining reviews.
df["sentiment"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
sentiment,Unnamed: 1_level_1
2,0.77058
1,0.124696
0,0.104724


In [None]:
# Class balance after the short-review filter.
# The text/sentiment preview that used to sit on the first line of this cell
# was a non-final bare expression, so it rendered nothing; it has been removed.
df["sentiment"].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
sentiment,Unnamed: 1_level_1
2,0.77058
1,0.124696
0,0.104724


## Train / Validation / Test Split

In [None]:
# Model inputs (raw review strings) and integer sentiment targets as arrays.
X, y = df["text"].values, df["sentiment"].values

In [None]:
# Plain (non-stratified) 80/10/10 shuffle split.
# NOTE: X_train/X_val/X_test and y_* assigned here are overwritten by the
# stratified train_test_split in the following cell.
np.random.seed(SEED)

n_samples = len(X)
indices = np.arange(n_samples)
np.random.shuffle(indices)

# Cut points at 80% and 90% of the shuffled order.
train_end, val_end = int(0.8 * n_samples), int(0.9 * n_samples)
train_idx, val_idx, test_idx = np.split(indices, [train_end, val_end])

X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the data (stratified by label); 80% stays for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Stage 2: halve the held-out part, giving 10% validation and 10% test
# overall (0.5 * 0.2), again stratified by label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=SEED
)


In [None]:
def show_distribution(name, labels):
    """Print `name` followed by a dict mapping each distinct label to its count."""
    classes, frequencies = np.unique(labels, return_counts=True)
    print(name, {cls: freq for cls, freq in zip(classes, frequencies)})

# Report the label counts of every split, in train / validation / test order.
for split_name, split_labels in (
    ("Train", y_train),
    ("Validation", y_val),
    ("Test", y_test),
):
    show_distribution(split_name, split_labels)


Train {np.int64(0): np.int64(1896), np.int64(1): np.int64(2257), np.int64(2): np.int64(13951)}
Validation {np.int64(0): np.int64(237), np.int64(1): np.int64(282), np.int64(2): np.int64(1744)}
Test {np.int64(0): np.int64(237), np.int64(1): np.int64(283), np.int64(2): np.int64(1744)}


## Tokenizer + Model Skeleton

In [None]:
# Load the tokenizer published with the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Tokenizing

In [None]:
def tokenize(text):
    """Tokenize an iterable of strings into padded/truncated TensorFlow tensors.

    `text` is converted to a list first; inputs are truncated to MAX_LEN
    tokens and padded, and the encodings are returned as "tf" tensors.
    """
    batch = list(text)
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": MAX_LEN,
        "return_tensors": "tf",
    }
    return tokenizer(batch, **tokenizer_options)

# Encode the three splits in train / validation / test order.
train_encodings, val_encodings, test_encodings = [
    tokenize(split_texts) for split_texts in (X_train, X_val, X_test)
]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


### TensorFlow Datasets

In [None]:
def _make_dataset(encodings, labels, shuffle=False):
    """Wrap tokenizer encodings and labels in a batched tf.data.Dataset.

    Args:
        encodings: tokenizer output; converted with dict() so each example is
            a dict of input tensors.
        labels: integer class ids aligned with `encodings`.
        shuffle: when True, shuffle examples before batching.

    Returns:
        A dataset yielding (inputs_dict, labels) batches of BATCH_SIZE.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    if shuffle:
        # A buffer covering the whole split gives a uniform shuffle (a
        # 1000-element buffer only mixes neighbouring examples), and the seed
        # makes the batch order reproducible.
        dataset = dataset.shuffle(buffer_size=len(labels), seed=SEED)
    return dataset.batch(BATCH_SIZE)


# Only the training split is shuffled; validation and test keep their order.
train_ds = _make_dataset(train_encodings, y_train, shuffle=True)
val_ds = _make_dataset(val_encodings, y_val)
test_ds = _make_dataset(test_encodings, y_test)


### Load Model

In [None]:
# NOTE(review): `tensorflow` and `TFAutoModelForSequenceClassification` are
# already imported in the notebook's first cell; only `AutoConfig` and
# `TFAutoModel` are new here. Consider moving them to the top import cell.
from transformers import AutoConfig, TFAutoModelForSequenceClassification, TFAutoModel
import tensorflow as tf

# Step 1: fetch the checkpoint's configuration (architecture description).
config = AutoConfig.from_pretrained(MODEL_NAME)

# Step 2: request 3 output labels (0=neg, 1=neutral, 2=pos) so that the
# classification head built from this config has 3 outputs.
config.num_labels = 3

# Step 3: load only the encoder (no classification head) with its pretrained
# weights. `from_pt=True` converts the PyTorch checkpoint to TensorFlow.
# NOTE(review): `ignore_mismatched_sizes=True` is presumably a no-op for the
# headless base model — confirm and drop if so.
base_model = TFAutoModel.from_pretrained(MODEL_NAME, from_pt=True, ignore_mismatched_sizes=True)

# Step 4: build the full sequence-classification model from the modified
# config. `from_config` constructs the architecture only, so at this point the
# encoder and the new 3-label head are both untrained.
model = TFAutoModelForSequenceClassification.from_config(config)

# Step 5: swap in the pretrained encoder; the 3-label head stays freshly
# initialised and is learned during fine-tuning.
# NOTE(review): this relies on both objects exposing the encoder as `.bert`,
# i.e. it only works for BERT-style checkpoints (the load log reports
# TFBertModel). Verify before changing MODEL_NAME.
model.bert = base_model.bert


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['classifier.bias', 'bert.embeddings.position_ids', 'classifier.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Compile and Fine-Tune the Model

In [None]:
import datetime
import os
import tensorflow as tf

# ---------------------------------------------------------------
# Optimizer, loss and metric, then compile
# ---------------------------------------------------------------
INITIAL_LR = 2e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=INITIAL_LR)
# from_logits=True: the loss is fed the model's raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

# ---------------------------------------------------------------
# Timestamped run folder holding this run's checkpoints
# ---------------------------------------------------------------
run_id = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
run_dir = f"runs/run_{run_id}"
ckpt_dir = os.path.join(run_dir, "checkpoints")
os.makedirs(ckpt_dir, exist_ok=True)

# ---------------------------------------------------------------
# Callbacks, all driven by validation loss
# ---------------------------------------------------------------
# Keep only the checkpoint with the lowest val_loss.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(ckpt_dir, "best_model.keras"),
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate after one epoch without improvement.
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=1,
    verbose=1,
)
# Stop after two epochs without improvement and roll back to the best weights.
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
    verbose=1,
)
callbacks = [checkpoint_cb, reduce_lr_cb, early_stop_cb]

# ---------------------------------------------------------------
# Fine-tune
# ---------------------------------------------------------------
EPOCHS = 3

print(f"Fine-tuning for up to {EPOCHS} epochs...")
print("=" * 60)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
)

print("\nFine-tuning complete!")

Fine-tuning for up to 3 epochs...
Epoch 1/3
Epoch 1: val_loss improved from inf to 0.31165, saving model to runs/run_20251231_200750/checkpoints/best_model.keras




Epoch 2/3
Epoch 2: val_loss did not improve from 0.31165

Epoch 2: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 3/3
Epoch 3: val_loss did not improve from 0.31165

Epoch 3: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-06.
Epoch 3: early stopping
Restoring model weights from the end of the best epoch: 1.

Fine-tuning complete!


### Fine-Tune the Model

In [None]:
# No additional training happens in this section: model.fit(...) was already
# run in the preceding cell. The previous message claimed training was skipped
# altogether, which contradicted that cell.
print("Fine-tuning already ran in the previous cell; nothing more to train here.")


Skipping model.fit(...) because we are using a pretrained model.


### Evaluate on the Test Set - One time

In [None]:
# Test-set evaluation for the 3-class task (0=neg, 1=neutral, 2=pos):
# run every batch through the model and compare the arg-max class to the label.

y_true, y_pred = [], []

for features, labels in test_ds:
    batch_logits = model(**features).logits
    # Index of the highest logit per example -> predicted class id in 0..2.
    y_pred += tf.argmax(batch_logits, axis=1).numpy().tolist()
    y_true += labels.numpy().tolist()

y_true, y_pred = np.array(y_true), np.array(y_pred)

# Accuracy = fraction of positions where prediction and label agree.
test_acc = np.mean(y_true == y_pred)
print(f"Test accuracy: {test_acc:.4f}")


Test accuracy: 0.8587


### Save the Model

In [None]:
# Optional: uncomment the two lines below to write the model and tokenizer to
# the local folder "liyuan_finetuned_reviews". Nothing is saved by default.
# model.save_pretrained("liyuan_finetuned_reviews")
# tokenizer.save_pretrained("liyuan_finetuned_reviews")
print("Done (model loaded online).")


Done (model loaded online).


### Prediction



In [None]:
# Class id -> human-readable sentiment name (ids follow the 0/1/2 encoding).
label_map = dict(enumerate(("negative", "neutral", "positive")))

def predict_sentiment(text: str):
    """Classify a single review string and print the result.

    Tokenizes `text` (truncated to MAX_LEN), runs the model, converts the
    logits to probabilities with softmax, and prints the input text, the
    predicted label and the per-class probabilities. Returns nothing.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LEN,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

    # Softmax over the class axis; [0] selects the only example in the batch.
    class_probs = tf.nn.softmax(model(**encoded).logits, axis=1).numpy()[0]
    best_class = int(np.argmax(class_probs))  # 0..2

    probabilities = {label_map[i]: float(class_probs[i]) for i in range(3)}

    print("Text:", text)
    print("Prediction:", label_map[best_class])
    print("Probabilities:", probabilities)

# Example: smoke-test the helper on a clearly positive sentence.
predict_sentiment("This product is amazing and works perfectly!")


Text: This product is amazing and works perfectly!
Prediction: positive
Probabilities: {'negative': 0.0009640577482059598, 'neutral': 0.00115761358756572, 'positive': 0.9978783130645752}


### Testing It

In [None]:
# A lukewarm, mixed-opinion sentence.
predict_sentiment("This product was okay, but nothing special.")


Text: This product was okay, but nothing special.
Prediction: neutral
Probabilities: {'negative': 0.38883596658706665, 'neutral': 0.5433573722839355, 'positive': 0.0678066611289978}


In [None]:
# A strongly negative sentence.
predict_sentiment("Absolutely terrible quality. Waste of money.")

Text: Absolutely terrible quality. Waste of money.
Prediction: negative
Probabilities: {'negative': 0.9156205654144287, 'neutral': 0.08040796965360641, 'positive': 0.003971491940319538}


In [None]:
# A positive sentence phrased like the clothing reviews in the dataset.
predict_sentiment("I love this dress, it fits perfectly.")

Text: I love this dress, it fits perfectly.
Prediction: positive
Probabilities: {'negative': 0.0006500277668237686, 'neutral': 0.0012372034834697843, 'positive': 0.9981127977371216}
