In [9]:
import swifter
import numpy as np
import pandas as pd
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

In [2]:
df = pd.read_excel('data/sephora_review (5 products).xlsx', sheet_name='sephora_review', engine='openpyxl')
df = df[df['review_text'].notna()]
df.head()

Unnamed: 0,product_id,review_title,review_text,rating,age_range,skin_type,skin_tone,eye_color,reviewer_username,tags,review_id
0,P38217,Worth the money,"Sometimes I stray from this cleanser, but I al...",5,,normal,light,,katechatte,"{foamy,exfoliating}",6611717f-2636-4756-bf36-66c81cc267a7
1,P38217,Great,I am a 41 year old African American woman with...,5,,combination,deep,,snook41,{foamy},e7d3307e-02ff-45a1-8fc3-6bd628bedd86
2,P38217,Great Product,I'm really enjoying this product. Received a s...,5,,combination,olive,,wahinewarrior,"{foamy,milky,exfoliating}",4188d728-fde6-4d06-984e-164cca2b8781
3,P38217,"Nice, but not great for combination skin","I tried this cleanser at a friends house, and ...",3,,combination,fair,,jenlines22,"{hydrating,creamy}",248c904c-6e30-4929-8228-87b03ad7a921
4,P38217,great moisturizer,leaves the skin feeling fresh and revived... j...,5,,dry,light,,jessea,{exfoliating},654bdb99-9371-4440-a540-0dd2a73da339


We will generate sentiment labels from the data using the ratings given by each user. The reviews that have a rating of less than 3 are given negative sentiment labels and those having 3 or more than 3 stars are given positive sentiment labels. For training purposes, we map 'positive' and 'negative' labels to 1 and 0 respectively.

In [3]:
df['sentiment'] = df['rating'].swifter.apply(lambda score: "positive" if score >= 3 else "negative")
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df = df[["review_text", "sentiment"]]
df.sample(frac=1).reset_index(inplace=True)
df.head()

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,review_text,sentiment
0,"Sometimes I stray from this cleanser, but I al...",1
1,I am a 41 year old African American woman with...,1
2,I'm really enjoying this product. Received a s...,1
3,"I tried this cleanser at a friends house, and ...",1
4,leaves the skin feeling fresh and revived... j...,1


In [4]:
df['sentiment'].value_counts()

1    891
0    109
Name: sentiment, dtype: int64

Observing the distribution of labels in the data, we see that there are far more positive reviews than negative reviews. During training, this might induce bias into the model. To counter this, we will apply a text augmentation method using which we will generate new texts from a given text which has the same semantics and meaning but will just have a different framing. We apply this text augmentation to only the negative reviewed texts since there are very less number of negative reviewed texts as compared to positive reviewed texts. For this we use a back translation technique, which essentialy converts English text to German and then back to English.

In [5]:
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de', 
    to_model_name='facebook/wmt19-de-en'
)
    
def generate_augmented_text(text):
    augmented_text = back_translation_aug.augment(text)
    return augmented_text

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/849k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/849k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/849k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/849k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315k [00:00<?, ?B/s]

In [37]:
# Example of text augmentation:

text = df['review_text'][2]
print("Original Text: ", text)
print()
print("Augmented Text: ", generate_augmented_text(text))

Original Text:  I'm really enjoying this product. Received a sample trio of the Murad products to try first, then purchased the cleanser. Great deal at $35 as a little goes a long way! I use it nightly with my Clarisonic Mia and in the morning by itself. Leaves my skin feeling clean and smooth and appears to be helping to even out my skin tone.

Augmented Text:  I got a sample trio of Murad products to try first, then I bought the cleanser. A lot for $35, because a bit much works! I use it every night with my Clarisonic Mia and in the morning alone. It leaves my skin feeling clean and smooth and seems to help balance my skin tone.


We can see that the augmented text basically means the same as the original text. Only the framing of the sentence is changed a bit in the augmented text. We will apply this technique to all the negatively reviewed sentences. 

In [6]:
neg_reviews = df[df['sentiment'] == 0]['review_text'].values

for text in tqdm(neg_reviews, desc="Augmenting text data. This might take a while..."):
    row = {'review_text': generate_augmented_text(text), 'sentiment': 0}
    df = df.append(row, ignore_index=True)
    
df['sentiment'].value_counts()

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)
100%|██████████| 109/109 [07:07<00:00,  3.93s/it]


1    891
0    218
Name: sentiment, dtype: int64

After performing text augmentation on the negatively reviewed texts, we see that the number of texts with negative(i.e. 0 labels) has doubled. This will certainly help in training the sentiment analysis model better.

Next, we will simply use the data that we have processed and generated to train a sentiment analysis model. We will fine-tune a DistilBERT pre-trained model according to our usecase using the huggingface transformers library.

In [7]:
df.to_csv('data/augmented_data.csv', index=False)

# df = pd.read_csv('data/augmented_data.csv')

In [8]:
reviews = df['review_text'].values.tolist()
labels = df['sentiment'].tolist()

In [10]:
train_sents, val_sents, train_labels, val_labels = train_test_split(reviews, labels, test_size=.2)

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
print("Text: ", train_sents[0])
print()
print("Tokenized Text: ", tokenizer([train_sents[0]], truncation=True, padding=True, max_length=128))

{'input_ids': [[101, 1045, 2031, 2042, 2478, 2023, 2005, 2055, 1037, 3204, 1998, 1045, 2293, 2009, 1012, 2009, 2003, 2200, 11052, 9496, 6774, 1998, 26018, 3436, 2006, 2026, 5257, 3096, 1012, 1045, 2293, 2129, 2009, 5683, 1999, 1996, 2851, 2043, 1045, 5256, 2039, 1011, 2145, 4550, 1998, 2025, 3514, 2100, 1006, 2144, 1045, 2175, 3442, 2000, 1996, 9726, 2302, 12699, 2026, 2227, 1007, 1012, 1045, 2097, 3613, 2000, 2224, 2023, 2004, 2146, 2004, 2009, 7906, 2026, 3096, 3110, 3730, 1010, 11052, 28405, 1010, 1998, 4550, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [17]:
train_encodings = tokenizer(train_sents, truncation=True, padding=True, max_length=128)

val_encodings = tokenizer(val_sents, truncation=True, padding=True, max_length=128)

train_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(train_encodings),
                            train_labels
                            ))

val_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(val_encodings),
                            val_labels
                            ))

In [34]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)
model.summary()

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_119']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_distil_bert_for_sequence_classification_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_119 (Dropout)        multiple                  0         
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [35]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-07)

model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

model.fit(train_dataset.shuffle(100).batch(16),
          epochs=10,
          batch_size=32,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f13541bc6d0>

In [46]:
model.save_pretrained('models/sent_analyzer')
loaded_model = TFDistilBertForSequenceClassification.from_pretrained('models/sent_analyzer')

Some layers from the model checkpoint at models/sent_analyzer were not used when initializing TFDistilBertForSequenceClassification: ['dropout_119']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at models/sent_analyzer and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Predicting on a sample text

In [47]:
test_sentence = "I have dry skin and I got a sample of this product, I tried it, but it didn't work. I really wanted to love it because I heard a lot of good things about it, but unfortunately it wasn't for me."
predict_input = tokenizer.encode(test_sentence,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

In [50]:
tf_output = loaded_model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative', 'Positive']
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print("TEXT: ", test_sentence)
print()
print("SENTIMENT: ", labels[label[0]])

TEXT:  I have dry skin and I got a sample of this product, I tried it, but it didn't work. I really wanted to love it because I heard a lot of good things about it, but unfortunately it wasn't for me.

SENTIMENT:  Negative
