In [2]:
import swifter
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

In [23]:
# Path of the review workbook, relative to the notebook's working directory.
# NOTE(review): the file name suggests it covers five products — presumably one sheet per product; confirm.
excel_file = 'data/sephora_review (5 products).xlsx'
# Open the workbook once; get_product_sentiment() reads individual sheets from this handle.
xls = pd.ExcelFile(excel_file)

In [6]:
# The tokenizer is loaded from the 'distilbert-base-uncased-finetuned-sst-2-english' checkpoint, while the
# classifier weights are loaded from the local 'models/sent_analyzer' directory.
# NOTE(review): presumably the local model was trained with this same tokenizer/vocabulary — confirm.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = TFDistilBertForSequenceClassification.from_pretrained('models/sent_analyzer')
# Label names by list position: index 0 is 'Negative', index 1 is 'Positive'.
# NOTE(review): this list is not referenced by the cells visible here; get_product_sentiment() works with
# the raw 0/1 indices directly.
sentiment_labels = ['Negative', 'Positive']

Some layers from the model checkpoint at models/sent_analyzer were not used when initializing TFDistilBertForSequenceClassification: ['dropout_119']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at models/sent_analyzer and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
def analyze_sentiment(text):
    """Classify a single text and return the index of the highest-scoring class.

    The text is encoded with the module-level ``tokenizer`` (truncated to at
    most 128 tokens) and passed through the module-level ``model``. The first
    element of the prediction result is soft-maxed along axis 1 and the
    arg-max along that axis is taken.

    Args:
        text: The text to classify.

    Returns:
        The first entry of the arg-max array, i.e. the predicted class index
        for ``text`` (presumably 0 = negative, 1 = positive, matching the
        order of ``sentiment_labels`` — TODO confirm against the trained model).
    """
    encoded = tokenizer.encode(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="tf",
    )
    # Element 0 of the predict() result is used as the class scores.
    scores = model.predict(encoded)[0]
    probabilities = tf.nn.softmax(scores, axis=1)
    winners = tf.argmax(probabilities, axis=1).numpy()
    # A single text was passed in, so only the first prediction is returned.
    return winners[0]

In [39]:
def get_product_sentiment(product: str, skin_types=('dry', 'oily', 'normal', 'combination')):
    """Print the share of positive and negative reviews per skin type for one product.

    Reads the sheet named ``product`` from the module-level workbook ``xls``,
    drops rows without review text, classifies every remaining review with
    ``analyze_sentiment`` and prints one summary line per skin type.

    Args:
        product: Name of the sheet in ``xls`` to analyse.
        skin_types: Values of the ``skin_type`` column to report on, in
            printing order. Defaults to the four skin types the notebook
            originally hard-coded.

    Returns:
        None. The summary is written to stdout.
    """
    df = pd.read_excel(xls, product)
    # Take an explicit copy so that adding the 'sentiment' column below writes to an
    # independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
    reviews = df[df['review_text'].notna()].copy()
    reviews['sentiment'] = reviews['review_text'].swifter.apply(analyze_sentiment)

    for skin_type in skin_types:
        skin_df = reviews[reviews['skin_type'] == skin_type]
        sentiment_dict = skin_df['sentiment'].value_counts().to_dict()
        total = sum(sentiment_dict.values())

        if total == 0:
            # No classified reviews for this skin type: report it rather than dividing by zero.
            print(f"For {product}, no reviews were found for people with {skin_type} skin type.")
            print()
            continue

        # Sentiment index 1 is counted as positive, index 0 as negative.
        pos_percent = round((sentiment_dict.get(1, 0) / total) * 100, 2)
        neg_percent = round((sentiment_dict.get(0, 0) / total) * 100, 2)
        print(f"For {product}, out of {len(skin_df)} people with {skin_type} skin type, {pos_percent}% "
              f"posted positive reviews while {neg_percent}% posted negative reviews.")
        print()

In [41]:
# Print the per-skin-type sentiment summary for the workbook sheet named "Product 3".
get_product_sentiment(product="Product 3")

Pandas Apply:   0%|          | 0/120 [00:00<?, ?it/s]

For Product 3, out of 43 people, 90.7% of people with dry skin type posted positive reviews while 9.3% posted negative reviews.

For Product 3, out of 10 people, 100.0% of people with oily skin type posted positive reviews while 0.0% posted negative reviews.

For Product 3, out of 13 people, 84.62% of people with normal skin type posted positive reviews while 15.38% posted negative reviews.

For Product 3, out of 54 people, 90.74% of people with combination skin type posted positive reviews while 9.26% posted negative reviews.

