In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
import numpy as np
from sklearn.ensemble import IsolationForest
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("data/All_beauty.csv")
df.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,product_title,average_rating,rating_number,store
0,5,This really works!,"At first when I saw this, I wasn't sure what t...",B07GDQPG12,B07GDQPG12,AFSKPY37N3C43SOI5IEXEK5JSIYA,2019-01-15 22:04:03.451,0,False,All Beauty,Spa Grade Jade Roller for Face with Gua Sha | ...,4.3,139,Jane Cosmetics
1,5,Nice manicure set for men or women,This a really cute kit which would make for a ...,B07J3GH1W1,B07J3GH1W1,AFSKPY37N3C43SOI5IEXEK5JSIYA,2019-01-15 21:55:56.557,0,False,All Beauty,Manicure and Pedicure Nail Clipper from POWERG...,4.1,35,POWERGROOMING
2,5,Great combo pack. Wish I had been using this y...,"I love this combo package, particularly the fl...",B01M7UMAUG,B01M7UMAUG,AFSKPY37N3C43SOI5IEXEK5JSIYA,2017-10-23 14:57:04.887,0,False,All Beauty,Philips Sonicare Essence+ Gum Health & Airflos...,4.5,235,Philips Sonicare
3,5,Five Stars,So many great diverse colors,B01ETWL5B2,B01ETWL5B2,AG35BKPUEUMX7LV5YLOQ5YCQ3GOA,2016-10-06 12:17:06.000,0,True,All Beauty,Beauty Brags Mini Nail Polish 10-Piece Collect...,3.9,10,Beauty Brags
4,3,just ok,its OK not as good as the original Wet Brush...,B077SRDVG9,B077SRDVG9,AHGAOIZVODNHYMNCBV4DECZH42UQ,2018-03-26 19:17:04.726,0,True,All Beauty,Original Detangler Hair Brush,3.2,7,Fine Touch


# Isolation Forest on textual data

## title

In [3]:
data = df[~df.title.isna()].title.to_list()

In [4]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data)

In [5]:
iso_forest = IsolationForest(contamination=0.2, random_state=42)
iso_forest.fit(tfidf_matrix)

In [6]:
# Predict anomalies (-1 for outliers, 1 for inliers)
predictions = iso_forest.predict(tfidf_matrix)

# Combine results into a DataFrame
df_iso_fr = pd.DataFrame({'Review': data, 'Anomaly': predictions})
print(df_iso_fr)

                                                   Review  Anomaly
0                                      This really works!       -1
1                      Nice manicure set for men or women        1
2       Great combo pack. Wish I had been using this y...        1
3                                              Five Stars        1
4                                                 just ok        1
...                                                   ...      ...
302550                                Not a good product.        1
302551                                           One Star        1
302552                                           One Star        1
302553                                         Four Stars        1
302554                                             Pretty        1

[302555 rows x 2 columns]


In [7]:
df_iso_fr[df_iso_fr.Anomaly == -1]

Unnamed: 0,Review,Anomaly
0,This really works!,-1
9,It does work,-1
11,Great for sensitive skin!,-1
12,"Great value, stay power is good",-1
16,Not 100% sure how safe it is particularly for ...,-1
...,...,...
302523,I ordered 2 and received 4 tubes Don't like it...,-1
302524,by morning my skin is super soft!,-1
302529,This is the best eyeliner I have ever used,-1
302532,"If you’re looking for a reason to buy these, h...",-1


In [12]:
print("Pourcentage d'anomalie détectée" , round(df_iso_fr[df_iso_fr.Anomaly == -1].shape[0]*100/df.shape[0]), "%")

Pourcentage d'anomalie détectée 20 %


### Bert for embedding

In [4]:
from tqdm import tqdm
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate BERT embeddings for a list of sentences
def get_bert_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list):
        # Tokenize the text and add special tokens
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get hidden states from BERT
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use the mean of the token embeddings from the last hidden layer
        last_hidden_states = outputs.last_hidden_state
        embeddings.append(last_hidden_states.mean(dim=1).squeeze().numpy())
    
    return np.array(embeddings)

# Generate embeddings
title_embeddings = get_bert_embeddings(data)

100%|██████████| 302555/302555 [1:19:14<00:00, 63.63it/s]


In [5]:
import pickle
with open("data/bert_title_embedding.pkl", "wb") as file:
    pickle.dump(title_embeddings, file)

## Text

In [9]:
data = df[~df.text.isna()].text.to_list()

In [7]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data)

In [12]:
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(tfidf_matrix)

In [13]:
# Predict anomalies (-1 for outliers, 1 for inliers)
predictions = iso_forest.predict(tfidf_matrix)

# Combine results into a DataFrame
df_iso_fr = pd.DataFrame({'Review': data, 'Anomaly': predictions})
print(df_iso_fr)

                                                   Review  Anomaly
0       At first when I saw this, I wasn't sure what t...       -1
1       This a really cute kit which would make for a ...        1
2       I love this combo package, particularly the fl...        1
3                            So many great diverse colors        1
4       its OK  not as good as the  original Wet Brush...        1
...                                                   ...      ...
302485  The scent doesn't last,it seems to be watered ...        1
302486  Cant do anything with it. No Heat, cant get it...        1
302487  Cant do anything with it. No Heat, cant get it...        1
302488    Conditioner is great  shampoo not as I expected        1
302489  Did not work! Used the whole bottle and my hai...        1

[302490 rows x 2 columns]


In [14]:
df_iso_fr[df_iso_fr.Anomaly == -1]

Unnamed: 0,Review,Anomaly
0,"At first when I saw this, I wasn't sure what t...",-1
24,My daughter received a UV Light Manicure Light...,-1
27,I am in my late 40's and started using anti-ag...,-1
28,This is the 3rd peel foot mask that I have tri...,-1
29,My daughter and I discovered facial sheet mask...,-1
...,...,...
302170,Bought this product since it stated it was hea...,-1
302209,I've used this face wash a few times now. It c...,-1
302404,This product reminds me a lot of a product I u...,-1
302415,I wanted a product that would allow me to have...,-1


In [15]:
print("Pourcentage d'anomalie détectée" , round(df_iso_fr[df_iso_fr.Anomaly == -1].shape[0]*100/df.shape[0]), "%")

Pourcentage d'anomalie détectée 1 %


### Bert for embedding

In [4]:
from tqdm import tqdm
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate BERT embeddings for a list of sentences
def get_bert_embeddings(text_list):
    embeddings = []
    for text in tqdm(text_list):
        # Tokenize the text and add special tokens
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get hidden states from BERT
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use the mean of the token embeddings from the last hidden layer
        last_hidden_states = outputs.last_hidden_state
        embeddings.append(last_hidden_states.mean(dim=1).squeeze().numpy())
    
    return np.array(embeddings)

# Generate embeddings
embeddings = get_bert_embeddings(data)

100%|██████████| 302555/302555 [1:20:12<00:00, 62.86it/s]


In [6]:
import pickle
with open("data/bert_embedding.pkl", "wb") as file:
    pickle.dump(embeddings, file)

In [12]:
iso_forest = IsolationForest(contamination=0.001, random_state=53)
iso_forest.fit(embeddings)

In [13]:
# Predict anomalies (-1 for outliers, 1 for inliers)
predictions = iso_forest.predict(embeddings)

# Combine results into a DataFrame
df_iso_fr = pd.DataFrame({'Review': data, 'Anomaly': predictions})
print(df_iso_fr)

                                                   Review  Anomaly
0                                      This really works!        1
1                      Nice manicure set for men or women        1
2       Great combo pack. Wish I had been using this y...        1
3                                              Five Stars        1
4                                                 just ok        1
...                                                   ...      ...
302550                                Not a good product.        1
302551                                           One Star        1
302552                                           One Star        1
302553                                         Four Stars        1
302554                                             Pretty        1

[302555 rows x 2 columns]


In [6]:
pd.options.display.max_rows = 20

In [18]:
display(df_iso_fr[df_iso_fr.Anomaly == -1])

Unnamed: 0,Review,Anomaly
315,Love it!!!!!!!!,-1
2212,*****,-1
4903,Uuuhhhhmmmmmmm.,-1
5225,*****Formerly Skeptical*****,-1
9636,OUTSTANDING!!!!!!!!!!!!,-1
10134,"Love, love, love, love, love!",-1
11207,Not sure yet.........,-1
13232,No siempre funciona bien,-1
13247,???,-1
14943,???,-1


In [15]:
print("Pourcentage d'anomalie détectée" , round(df_iso_fr[df_iso_fr.Anomaly == -1].shape[0]*100/df.shape[0]), "%")

Pourcentage d'anomalie détectée 0 %


## Product title

In [7]:
data = df[~df.product_title.isna()].product_title.to_list()

In [9]:
prod_title_embeddings = get_bert_embeddings(data)

100%|██████████| 302599/302599 [2:00:39<00:00, 41.80it/s]  


In [10]:
import pickle
with open("data/product_title_bert_embedding.pkl", "wb") as file:
    pickle.dump(prod_title_embeddings, file)