In [2]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification 

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import softmax

import torch
import re


In [3]:
# Load the review dataset; later cells read its "review" and "rating" columns.
dataFrame = pd.read_csv(filepath_or_buffer="data/clean.csv")

In [4]:
# Number of reviews per star rating.
dataFrame.loc[:, "rating"].value_counts()

rating
5    12540
4     4908
3     2823
2     1549
1      821
Name: count, dtype: int64

In [None]:
# Hugging Face Hub checkpoint shared by the tokenizer and the classifier.
# (Plain string: the original f-string had no placeholders to interpolate.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the classifier onto whichever device was selected.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

Sentiment score


In [6]:
# Lenient mapping: for each predicted label, the star ratings treated as
# agreeing with it. A review whose rating is not listed for its predicted
# label is recorded as a mismatch in `outlires1`.
labelRating1 = dict(
    neg=[1, 2, 3],
    neu=[4, 2, 3],
    pos=[3, 5, 4],
)



In [7]:
# Stricter mapping: a 3-star rating agrees only with a "neu" prediction.
# A review whose rating is not listed for its predicted label is recorded
# as a mismatch in `outlires2`.
labelRating2 = dict(
    neg=[1, 2],
    neu=[4, 2, 3],
    pos=[5, 4],
)

In [8]:
# Score every review with the sentiment model and collect:
#   sentimentRes - one record per row: softmax scores plus the winning label
#   outlires1    - rows whose rating disagrees with the label under labelRating1
#   outlires2    - rows whose rating disagrees with the label under labelRating2
sentimentRes = []
outlires1 = []
outlires2 = []

# Upper bound on tokens fed to the model; RoBERTa-base checkpoints accept at
# most 512 tokens per sequence, and a longer input would abort the whole run.
MAX_TOKENS = 512

for i, row in tqdm(dataFrame.iterrows(), total=len(dataFrame)):
    review = row["review"]
    rating = row["rating"]

    # Truncate explicitly so one over-long review cannot crash the loop.
    tokenReview = tokenizer(
        review,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    ).input_ids.to(device)

    # Inference only: skip building an autograd graph for every review.
    with torch.no_grad():
        output = model(tokenReview)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].cpu().numpy())
    scoreDict = {
        "neg": scores[0],
        "neu": scores[1],
        "pos": scores[2],
    }
    max_key = max(scoreDict, key=scoreDict.get)

    # Record a mismatch (as an independent dict per list) for each mapping
    # whose accepted ratings for the predicted label exclude this rating.
    for mapping, bucket in ((labelRating1, outlires1), (labelRating2, outlires2)):
        if rating not in mapping[max_key]:
            bucket.append({
                "index": i,
                "review": review,
                "rating": rating,
                "result": max_key,
            })

    sentimentRes.append({"id": i, **scoreDict, "result": max_key})


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 22641/22641 [03:49<00:00, 98.51it/s] 


In [9]:
# One row per scored review; the "id" column is discarded and the default
# positional index is kept.
sentimentDataframe = pd.DataFrame(sentimentRes)
sentimentDataframe = sentimentDataframe.drop("id", axis="columns")
sentimentDataframe

Unnamed: 0,neg,neu,pos,result
0,0.002245,0.010576,0.987180,pos
1,0.001609,0.007436,0.990955,pos
2,0.366803,0.380140,0.253057,neu
3,0.002247,0.004622,0.993131,pos
4,0.001239,0.006733,0.992028,pos
...,...,...,...,...
22636,0.001105,0.006981,0.991914,pos
22637,0.129048,0.323072,0.547880,pos
22638,0.046035,0.150881,0.803084,pos
22639,0.390427,0.373043,0.236531,neg


In [10]:
# Attach the sentiment columns to the reviews; `join` aligns the two frames
# on their index and keeps every row of `dataFrame`.
mergedDF = dataFrame.join(other=sentimentDataframe, how="left")
mergedDF

Unnamed: 0,review,rating,neg,neu,pos,result
0,Absolutely wonderful - silky and sexy and comf...,4,0.002245,0.010576,0.987180,pos
1,Love this dress! it's sooo pretty. i happene...,5,0.001609,0.007436,0.990955,pos
2,I had such high hopes for this dress and reall...,3,0.366803,0.380140,0.253057,neu
3,"I love, love, love this jumpsuit. it's fun, fl...",5,0.002247,0.004622,0.993131,pos
4,This shirt is very flattering to all due to th...,5,0.001239,0.006733,0.992028,pos
...,...,...,...,...,...,...
22636,I was very happy to snag this dress at such a ...,5,0.001105,0.006981,0.991914,pos
22637,"It reminds me of maternity clothes. soft, stre...",3,0.129048,0.323072,0.547880,pos
22638,"This fit well, but the top was very see throug...",3,0.046035,0.150881,0.803084,pos
22639,I bought this dress for a wedding i have this ...,3,0.390427,0.373043,0.236531,neg


In [11]:
# Persist reviews together with their sentiment scores (index not written).
mergedDF.to_csv(path_or_buf="data/roberta.csv", index=False)

In [12]:
# Number of reviews whose rating disagrees with the prediction under labelRating1.
print(len(outlires1))
# Spot-check a single mismatch by eye.
print(outlires1[86])
# Row labels of all mismatches.
outlireIndex1 = [mismatch["index"] for mismatch in outlires1]

1349
{'index': 1691, 'review': "I ordered this in my usual size 0 and couldn't finish zipping up the top, but also couldn't fill in the boobs. i am 5'7, 116 lbs, 32a. it is very cute, but the top was pretty stiff and unless it draped on you correctly, could probably be bothersome. i was disappointed in the quality of the skirt- no lining, simple cotton/polyester. i will probably return, try the size up, but was disappointed compared to how excited i was for it. i did like that it had pockets, and would be a fun dress to wear f", 'rating': 4, 'result': 'neg'}


In [13]:
# Row count before and after removing the labelRating1 mismatches,
# then save the remaining rows.
print(len(mergedDF))
filtered1 = mergedDF.drop(labels=outlireIndex1, axis="index")
print(len(filtered1))
filtered1.to_csv(path_or_buf="data/robertaOutlires1.csv", index=False)

22641
21292


In [14]:
# Number of reviews whose rating disagrees with the prediction under labelRating2.
print(len(outlires2))
# Spot-check a single mismatch by eye.
print(outlires2[86])
# Row labels of all mismatches.
outlireIndex2 = [mismatch["index"] for mismatch in outlires2]

3758
{'index': 569, 'review': "My store had this and i was so intrigued with its nearly $400 price tag that i decided to play dress up. the colors in person match the colors online, so that was nice to see consistency. overall though, i feel like a dress of that price would need to make me feel like a million bucks, which it did not. most items that fit my slender frame are too short, and this was no exception. the xs fit but i wouldn't be doing much moving, walking or dancing in this dress, which brings me to the question of", 'rating': 3, 'result': 'pos'}


In [15]:
# Row count before and after removing the labelRating2 mismatches,
# then save the remaining rows.
print(len(mergedDF))
filtered2 = mergedDF.drop(labels=outlireIndex2, axis="index")
print(len(filtered2))
filtered2.to_csv(path_or_buf="data/robertaOutlires2.csv", index=False)

22641
18883
