In [62]:
# Import Libraries
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import jaxlib
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Topic Modelling

- create a topic modelling to sort through the reviews to get an exact topic

In [189]:
raw_df = pd.read_csv('dog_supplement_no_na.csv')
raw_df.head()

Unnamed: 0,name,date,title,rating,review,country,sentiment
0,Blayne,2023-02-17,please avoid,1.0,you try to do right by your dog by getting the...,United States,negative
1,Jake Morrison,2023-02-17,dog was allergic to these,1.0,i have an 80lb weimaraner and started him off ...,United States,negative
2,le’MamaAmazon Customer,2021-02-01,these made my dog sick,1.0,poor little guy apparently i have one of those...,United States,negative
3,Captain,2023-03-04,12 yo italian grey hound very sick,1.0,i have two dogs and my boxer mix doesnt seem t...,United States,negative
4,Dio I,2022-05-28,not for every dog,1.0,i wanted to take my time before i write this n...,United States,negative


In [190]:
raw_df.isna().sum()

name         0
date         0
title        0
rating       0
review       0
country      0
sentiment    0
dtype: int64

In [191]:
# concatenate the review and title into a single column
raw_df['total_text'] = raw_df['title'] + ' ' + raw_df['review']

In [192]:
# check df
raw_df.head()

Unnamed: 0,name,date,title,rating,review,country,sentiment,total_text
0,Blayne,2023-02-17,please avoid,1.0,you try to do right by your dog by getting the...,United States,negative,please avoid you try to do right by your dog b...
1,Jake Morrison,2023-02-17,dog was allergic to these,1.0,i have an 80lb weimaraner and started him off ...,United States,negative,dog was allergic to these i have an 80lb weima...
2,le’MamaAmazon Customer,2021-02-01,these made my dog sick,1.0,poor little guy apparently i have one of those...,United States,negative,these made my dog sick poor little guy apparen...
3,Captain,2023-03-04,12 yo italian grey hound very sick,1.0,i have two dogs and my boxer mix doesnt seem t...,United States,negative,12 yo italian grey hound very sick i have two ...
4,Dio I,2022-05-28,not for every dog,1.0,i wanted to take my time before i write this n...,United States,negative,not for every dog i wanted to take my time bef...


In [193]:
# dropping some rows that don't give additional information
raw_df = raw_df.drop(columns=['name', 'date', 'country'])
raw_df.head()

Unnamed: 0,title,rating,review,sentiment,total_text
0,please avoid,1.0,you try to do right by your dog by getting the...,negative,please avoid you try to do right by your dog b...
1,dog was allergic to these,1.0,i have an 80lb weimaraner and started him off ...,negative,dog was allergic to these i have an 80lb weima...
2,these made my dog sick,1.0,poor little guy apparently i have one of those...,negative,these made my dog sick poor little guy apparen...
3,12 yo italian grey hound very sick,1.0,i have two dogs and my boxer mix doesnt seem t...,negative,12 yo italian grey hound very sick i have two ...
4,not for every dog,1.0,i wanted to take my time before i write this n...,negative,not for every dog i wanted to take my time bef...


In [194]:
# create samples with a balanced dataset
# Select 20 negative samples
negative_df = raw_df[raw_df['sentiment'] == 'negative'].sample(20)

# Select 30 positive samples
positive_df = raw_df[raw_df['sentiment'] == 'positive'].sample(30)

# Concatenate the two subsets
pred_df = pd.concat([negative_df, positive_df])
pred_df.head()

Unnamed: 0,title,rating,review,sentiment,total_text
352,my dog keeps on throwing it all up,1.0,so ive started giving a small dosage less than...,negative,my dog keeps on throwing it all up so ive star...
520,dont buy this product,1.0,this product made my dogs very sick and almost...,negative,dont buy this product this product made my dog...
841,dogs dont like it,2.0,the vision bites are perfect but the dogs 2 di...,negative,dogs dont like it the vision bites are perfect...
869,dogs would not eat,2.0,no idea if these will work or not the dogs re...,negative,dogs would not eat no idea if these will work ...
964,not a good buy,2.0,made my dog sick,negative,not a good buy made my dog sick


In [195]:
# topic modelling using pipeline
clf = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', tokenizer='facebook/bart-large-mnli')

# some positive and negative topics chosen from WordCloud and some concern topics
candidate_labels = [
    'good product',
    'bad product', 
    'highly recommended',
    'concern: joint', 
    'concern: skin',
    'concern: coat',
    'concern: heart',
    'concern: gut',
    'concern: liver',
    'made dog sick',
    'vomited',
    'returned product',
    'diarrhea',
    'allergic to product'
]

# applying to sample df
pred_df['prediction'] = pred_df['total_text'].apply(lambda x: clf(x, candidate_labels)['labels'][0])

In [196]:
# reset the index
pred_df = pred_df.reset_index()

# drop index row
pred_df = pred_df.drop(columns='index')

In [197]:
# check topics
pred_df

Unnamed: 0,title,rating,review,sentiment,total_text,prediction
0,my dog keeps on throwing it all up,1.0,so ive started giving a small dosage less than...,negative,my dog keeps on throwing it all up so ive star...,made dog sick
1,dont buy this product,1.0,this product made my dogs very sick and almost...,negative,dont buy this product this product made my dog...,bad product
2,dogs dont like it,2.0,the vision bites are perfect but the dogs 2 di...,negative,dogs dont like it the vision bites are perfect...,made dog sick
3,dogs would not eat,2.0,no idea if these will work or not the dogs re...,negative,dogs would not eat no idea if these will work ...,bad product
4,not a good buy,2.0,made my dog sick,negative,not a good buy made my dog sick,made dog sick
5,bad idea,1.0,i bought this for our dog molly who is turning...,negative,bad idea i bought this for our dog molly who i...,bad product
6,product seemed to be old smelled and crumbled,1.0,the product smelled bad my dog would not even ...,negative,product seemed to be old smelled and crumbled ...,bad product
7,would not eat,1.0,my puppy disliked tryed mixing with other food...,negative,would not eat my puppy disliked tryed mixing w...,concern: joint
8,i would pass,2.0,didnt do much make my dogs poop loose,negative,i would pass didnt do much make my dogs poop l...,bad product
9,dogs didnt like taste,2.0,dogs were reluctant to eat it,negative,dogs didnt like taste dogs were reluctant to e...,returned product


In [198]:
# checking value count for predicted topics
pred_df['prediction'].value_counts()

good product          26
made dog sick          7
bad product            7
returned product       5
concern: joint         3
highly recommended     1
concern: heart         1
Name: prediction, dtype: int64