# Step 0: Load Libraries

In [1]:
import pandas as pd
import json
import requests
import csv
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

import pickle as pk

import os
from pathlib import Path


# Project folders are located relative to the notebook's working directory:
# each one is a sibling of the folder the notebook is run from.
folderpath = Path.cwd()
data_folderpath = folderpath.parent / "Data"
amazon_model_folderpath = folderpath.parent / "SentimentModels" / "Inference" / "Full_Amazon"
shopee_model_folderpath = folderpath.parent / "SentimentModels" / "Inference" / "Shopee_Aspect"
wordnet_folderpath = folderpath.parent / "WordNet"

# Echo the resolved locations so a wrong working directory is spotted immediately.
for resolved_folder in (folderpath,
                        data_folderpath,
                        amazon_model_folderpath,
                        shopee_model_folderpath,
                        wordnet_folderpath):
    print(resolved_folder)

C:\Users\onnwe\Desktop\ISS_PLP_Project\Sentiment Analysis with Aspect
C:\Users\onnwe\Desktop\ISS_PLP_Project\Data
C:\Users\onnwe\Desktop\ISS_PLP_Project\SentimentModels\Inference\Full_Amazon
C:\Users\onnwe\Desktop\ISS_PLP_Project\SentimentModels\Inference\Shopee_Aspect
C:\Users\onnwe\Desktop\ISS_PLP_Project\WordNet


# Step 1: Reading Data

In [2]:
# Product metadata, read as line-delimited JSON (one record per line).
filename = "meta_AMAZON_FASHION.json"
filepath = data_folderpath / filename

meta = pd.read_json(filepath, lines=True)

In [3]:
# Show the metadata frame as the cell output.
meta

Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1
0,Slime Time Fall Fest [With CDROM and Collector...,Group Publishing (CO),[Product Dimensions:\n \n8....,"13,052,976inClothing,Shoesamp;Jewelry(",8.70 inches,0764443682,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
1,XCC Qi promise new spider snake preparing men'...,,,"11,654,581inClothing,Shoesamp;Jewelry(",5 star,1291691480,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
2,Magical Things I Really Do Do Too!,Christopher Manos,[Package Dimensions:\n \n8....,"19,308,073inClothing,ShoesJewelry(",5 star,1940280001,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,[For the professional or amateur magician. Ro...,,,,,,,
3,"Ashes to Ashes, Oranges to Oranges",Flickerlamp Publishing,[Package Dimensions:\n \n8....,"19,734,184inClothing,ShoesJewelry(",5 star,1940735033,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
4,Aether & Empire #1 - 2016 First Printing Comic...,,[Package Dimensions:\n \n10...,"10,558,646inClothing,Shoesamp;Jewelry(",5 star,1940967805,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$4.50,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186632,JT Women's Elegant Off Shoulder Chiffon Maxi L...,JT,,"9,835,890inClothing,ShoesJewelry(",5 star,B01HJGXL4O,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
186633,Microcosm Retro Vintage Black Crochet Lace One...,Microcosm,[Package Dimensions:\n \n7....,"11,390,771inClothing,ShoesJewelry(",5 star5 star (0%),B01HJHF97K,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,,,,,
186634,Lookatool Classic Plain Vintage Army Military ...,Lookatool,"[Cotton+Polyester, Imported, Item type:Basebal...","972,275inClothing,ShoesJewelry(",5 star,B01HJGJ9LS,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,$8.53,"[B00XLECZMS, B0018MQAOY, B00N833I4Q, B074DQSPP...","[B07BHQ1FXL, B00XLECZMS, B07CJWM5WY, B07CS97C1...","class=""a-normal a-align-center a-spacing-smal...",,,
186635,Edith Windsor Women's Deep V-neck Beaded Sequi...,Edith Windsor,[Product Dimensions:\n \n9....,"1,964,585inClothing,ShoesJewelry(",5 star,B01HJHTH5U,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,,,,[B077ZLGMJ3],,,,


In [4]:
# Number of review records per chunk requested from the reader below.
chunksize = 10000

filename = 'AMAZON_FASHION.json'
filepath = data_folderpath.joinpath(filename)

# With `chunksize` set, read_json returns an iterator of DataFrame chunks instead of one frame,
# so it can only be consumed once per execution of this cell.
# NOTE(review): `iter` shadows Python's built-in iter(); the name is kept because the next cell loops over it.
iter = pd.read_json(filepath, chunksize = chunksize, lines = True)

In [None]:
# Join every review chunk with the product metadata and concatenate the results once.
# The previous version grew the frame with DataFrame.append, which was removed in pandas 2.0
# and re-copied the whole accumulated frame on every chunk.
merged_chunks = []

for chunk in iter:
    # NOTE(review): every chunk is written to the same path, so only the last chunk remains on disk.
    chunk.to_json("Sample_Review", orient='records', lines=True)

    merged = pd.merge(chunk, meta, how="left", on="asin")
    # Replace double quotes with spaces and drop every non-ASCII character from the review text.
    merged["reviewText"] = merged["reviewText"].replace('\"',' ', regex=True).str.encode('ascii', 'ignore').str.decode('ascii')
    merged_chunks.append(merged)

# Number of chunks processed (same meaning as the former loop counter).
count = len(merged_chunks)

if merged_chunks:
    # Like the former append() calls, the original per-chunk index labels are kept.
    review_with_meta = pd.concat(merged_chunks)
else:
    # The chunk reader is single-use: re-running this cell without re-creating `iter` yields nothing.
    print("No review chunks were read; re-run the cell that creates `iter` first.")

In [None]:
# Show the merged review + metadata frame as the cell output.
review_with_meta

# Step 2: Loading Sentiment Analysis Model (Self-trained)

In [None]:
# Load the pickled vectorizer and logistic-regression classifier into the
# `vectorizer` / `loaded_model` globals that the sentiment functions read.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from a trusted source.
filename = 'vectorizer_full_amazon.pk'
filepath = amazon_model_folderpath.joinpath(filename)
with open(filepath, 'rb') as artifact_file:   # the context manager closes the handle after loading
    vectorizer = pk.load(artifact_file)

filename = 'logreg_model_full_amazon.sav'
filepath = amazon_model_folderpath.joinpath(filename)
with open(filepath, 'rb') as artifact_file:
    loaded_model = pk.load(artifact_file)

In [None]:
# Snapshot the artifacts that are loaded when this cell runs. A later cell rebinds the
# `vectorizer` / `loaded_model` globals to the SVM artifacts; reading the globals at call
# time made logreg_model() silently predict with the SVM classifier from then on.
# Run this cell directly after the logistic-regression loading cell.
_logreg_vectorizer = vectorizer
_logreg_classifier = loaded_model


def logreg_model(TEST_REVIEW, negation_tag):
    """Classify a phrase as "Positive" or "Negative" with the logistic-regression model.

    Args:
        TEST_REVIEW: Text to classify; it is vectorised as a single document.
        negation_tag: When truthy, the predicted polarity is inverted.

    Returns:
        "Positive" when the classifier predicts the label 1.0 (or "Negative" if
        `negation_tag` is truthy); any other predicted label maps the opposite way.
    """
    test_vectors = _logreg_vectorizer.transform([TEST_REVIEW])
    is_positive = list(_logreg_classifier.predict(test_vectors)) == [1.0]
    if negation_tag:
        is_positive = not is_positive
    return "Positive" if is_positive else "Negative"

In [None]:
# Quick check of the classifier on three short phrases, all scored without the negation flag.
for phrase in ("This shirt is good", "This shirt is bad", "This shirt is not bad"):
    print(logreg_model(phrase, False))

In [None]:
# Load the pickled vectorizer and SVM classifier.
# NOTE(review): this rebinds the same `vectorizer` / `loaded_model` globals that were filled
# with the logistic-regression artifacts earlier in the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from a trusted source.
filename = 'vectorizer_full_amazon.pk'
filepath = amazon_model_folderpath.joinpath(filename)
with open(filepath, 'rb') as artifact_file:   # the context manager closes the handle after loading
    vectorizer = pk.load(artifact_file)

filename = 'SVM_model_full_amazon.sav'
filepath = amazon_model_folderpath.joinpath(filename)
with open(filepath, 'rb') as artifact_file:
    loaded_model = pk.load(artifact_file)

In [None]:
# Snapshot the artifacts that are loaded when this cell runs, so that re-running another
# loading cell later (which rebinds `vectorizer` / `loaded_model`) cannot silently change
# which classifier SVM_model() uses. Run this cell directly after the SVM loading cell.
_svm_vectorizer = vectorizer
_svm_classifier = loaded_model


def SVM_model(TEST_REVIEW, negation_tag):
    """Classify a phrase as "Positive" or "Negative" with the SVM model.

    Args:
        TEST_REVIEW: Text to classify; it is vectorised as a single document.
        negation_tag: When truthy, the predicted polarity is inverted.

    Returns:
        "Positive" when the classifier predicts the label 1.0 (or "Negative" if
        `negation_tag` is truthy); any other predicted label maps the opposite way.
    """
    test_vectors = _svm_vectorizer.transform([TEST_REVIEW])
    is_positive = list(_svm_classifier.predict(test_vectors)) == [1.0]
    if negation_tag:
        is_positive = not is_positive
    return "Positive" if is_positive else "Negative"

In [None]:
# Quick check of the classifier on three short phrases, all scored without the negation flag.
for phrase in ("This shirt is good", "This shirt is bad", "This shirt is not bad"):
    print(SVM_model(phrase, False))

In [None]:
#Trained Spacy Model
def spacy_model(TEST_REVIEW, negation_tag):
    """Classify a phrase with the sentiment model served over HTTP.

    Args:
        TEST_REVIEW: Text to classify; sent as the `input` query parameter.
        negation_tag: When truthy, a 'Positive'/'Negative' answer is inverted.

    Returns:
        The value of the "Predicted sentiment" field of the JSON response, inverted
        when `negation_tag` is truthy. A label other than 'Positive'/'Negative' is
        returned unchanged (this case previously raised UnboundLocalError).

    Raises:
        requests.RequestException: On connection problems, timeout, or an HTTP error status.
    """
    # Let requests build the query string so that characters such as '&', '#' or '+'
    # inside the review are percent-encoded instead of corrupting the URL.
    r = requests.get('https://danieltanhx.pythonanywhere.com/',
                     params={'input': str(TEST_REVIEW)},
                     timeout=30)   # never hang the notebook on an unresponsive service
    r.raise_for_status()
    pred = json.loads(r.text)["Predicted sentiment"]

    if negation_tag:
        flipped = {'Positive': 'Negative', 'Negative': 'Positive'}
        return flipped.get(pred, pred)

    return pred

In [None]:
# Quick check of the remote classifier on three short phrases, all scored without the negation flag.
for phrase in ("This shirt is good", "This shirt is bad", "This shirt is not bad"):
    print(spacy_model(phrase, False))

# Step 3: Determine Nouns and Adjective Keywords For Different Aspects

In [9]:
# mystopwords = stopwords.words("english")
# WNlemma = nltk.WordNetLemmatizer()

# def pre_process(text):
#     text_pos = nltk.pos_tag(nltk.word_tokenize(str(text)))
#     tokens   = [ t for t in lemmaNVAR(text_pos) ]
#     tokens   = [ t for t in tokens if t not in mystopwords]
#     tokens   = [ t for t in tokens if len(t) >= 3 ]
#     return(tokens)

# def lemmaNVAR(wpos):
#     lemmas = []
#     for w, pos in wpos:
#         if pos == 'NNS':
#             lemmas.append(WNlemma.lemmatize(w.lower(), pos = pos[0].lower()))
#     return lemmas

# reviewText = review_with_meta["reviewText"].apply(pre_process)

In [10]:
# my_stopwords =  stopwords.words('english') + ['#']

# def preprocess_noun(doc):
#     doc = nlp(str(doc))
#     noun_list = [token for token in doc if token.pos_== "NOUN" and not token.text.isnumeric()]
#     noun_list = [noun.lemma_ for noun in noun_list if noun.text not in my_stopwords]

#     return noun_list

# def preprocess_adjective(doc):
#     doc = nlp(str(doc))
#     adjective_list = [token for token in doc if token.pos_ == "ADJ" and not token.text.isnumeric()]
#     adjective_list = [adjective.lemma_ for adjective in adjective_list if adjective.text not in my_stopwords]
    
#     return adjective_list

In [11]:
# noun_list = preprocess_noun(review_with_meta["reviewText"].iloc[1])
# adjective_list = preprocess_adjective(review_with_meta["reviewText"].iloc[1])

# print(noun_list)
# print(adjective_list)

['review', 'opening', 'hook', 'earring', 'end', 'price']
['small', 'expensive', 'high']


In [12]:
# import time
# start_time = time.time()
# noun_list = review_with_meta["reviewText"].apply(preprocess_noun)
# end_time = time.time()
# print(f"Time taken {end_time-start_time}")

Time taken 5065.516253948212


In [13]:
# import time
# start_time = time.time()
# adjective_list = review_with_meta["reviewText"].apply(preprocess_adjective)
# end_time = time.time()
# print(f"Time taken {end_time-start_time}")

Time taken 4935.9662482738495


In [14]:
# noun_toks = [toks for tokens in noun_list for toks in tokens]
# fdist = FreqDist(noun_toks)
# fdist.most_common(100)

[('size', 147658),
 ('quality', 84732),
 ('color', 69443),
 ('dress', 68854),
 ('shirt', 62614),
 ('price', 59233),
 ('material', 58669),
 ('product', 57751),
 ('time', 53015),
 ('picture', 40356),
 ('bit', 38459),
 ('top', 37026),
 ('fit', 34854),
 ('one', 33885),
 ('day', 32619),
 ('review', 32392),
 ('ring', 29979),
 ('way', 29466),
 ('shoe', 29085),
 ('year', 28858),
 ('daughter', 28344),
 ('fabric', 28222),
 ('hat', 28190),
 ('pair', 26899),
 ('bag', 26738),
 ('lot', 25455),
 ('item', 25403),
 ('gift', 23940),
 ('thing', 22321),
 ('month', 21587),
 ('medium', 21358),
 ('pant', 20744),
 ('belt', 18589),
 ('money', 17995),
 ('pocket', 17696),
 ('compliment', 17355),
 ('bottom', 17296),
 ('side', 17235),
 ('son', 16899),
 ('waist', 16800),
 ('star', 16697),
 ('foot', 16277),
 ('wallet', 16069),
 ('design', 15851),
 ('style', 15704),
 ('length', 15430),
 ('purchase', 15363),
 ('week', 15323),
 ('piece', 15192),
 ('suit', 14916),
 ('strap', 14601),
 ('problem', 13849),
 ('necklace', 13

In [15]:
# adjective_toks = [toks for tokens in adjective_list for toks in tokens]
# fdist = FreqDist(adjective_toks)
# fdist.most_common(100)

[('great', 153509),
 ('small', 131651),
 ('good', 112526),
 ('nice', 96258),
 ('little', 71422),
 ('large', 69101),
 ('perfect', 67577),
 ('cute', 63769),
 ('big', 61944),
 ('comfortable', 50356),
 ('beautiful', 40741),
 ('cheap', 34467),
 ('fit', 32519),
 ('soft', 29777),
 ('long', 28413),
 ('short', 26075),
 ('tight', 25992),
 ('happy', 25985),
 ('old', 25805),
 ('thin', 23655),
 ('different', 19120),
 ('well', 18450),
 ('first', 16898),
 ('many', 16612),
 ('warm', 16389),
 ('easy', 16097),
 ('sure', 15922),
 ('high', 15475),
 ('worth', 15422),
 ('bad', 15386),
 ('pretty', 15037),
 ('black', 14962),
 ('light', 14061),
 ('right', 13950),
 ('loose', 13863),
 ('true', 13239),
 ('disappointed', 13079),
 ('excellent', 12646),
 ('thick', 12443),
 ('new', 11834),
 ('fine', 11758),
 ('much', 11705),
 ('awesome', 11179),
 ('white', 11047),
 ('able', 10926),
 ('comfy', 10532),
 ('extra', 10447),
 ('heavy', 10443),
 ('amazing', 10356),
 ('hard', 10091),
 ('pleased', 10024),
 ('cool', 9154),
 ('

# Step 4: Aspect Detection Model

In [None]:
# Aspect topics handled by the model. The order matters: match_topics() returns the
# first topic in this list whose keywords match.
topic_list = ['size', 'comfort', 'appearance', 'quality', 'price', 'delivery']

In [None]:
# Aspect Types from common Amazon data
# Seed nouns per topic. 'color' is listed next to 'colour' because the Amazon reviews use
# the US spelling (it is among the most frequent nouns in the frequency table above).
noun_keywords = { 'size'          : ['size', 'fit', 'length'],
                  'comfort'       : [],
                  'appearance'    : ['colour', 'color', 'picture', 'design', 'style', 'photo'],
                  'quality'       : ['quality', 'material', 'fabric', 'leather'],
                  'price'         : ['price', 'money'],
                  'delivery'      : ['time', 'day', 'seller', 'shipping', 'order']}

# Extend each seed list with the rows tagged 'noun' in <WordNet>/<topic>/<topic>.csv.
for topic in topic_list:
    filename = topic+'.csv'
    filepath = wordnet_folderpath.joinpath(topic).joinpath(filename)

    # The csv module expects files opened with newline='' so it can handle line endings itself.
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
    # Rows with fewer than two columns (e.g. blank lines) are skipped instead of raising IndexError.
    nouns = noun_keywords[topic] + [pair[0] for pair in data if len(pair) > 1 and pair[1] == 'noun']
    # dict.fromkeys removes duplicates while keeping first-seen order.
    noun_keywords[topic] = list(dict.fromkeys(nouns))
    print(noun_keywords[topic])

In [None]:
# Aspect Types from common Amazon data
# Seed adjectives per topic ('gorgeous' was previously misspelled and could never match a lemma).
adjective_keywords = { 'size'          : ['small', 'large', 'little', 'big', 'fit', 'long', 'short', 'tight', 'loose', 'medium', 'tiny', 'huge'],
                       'comfort'       : ['comfortable', 'uncomfortable', 'soft', 'lightweight','comfy'],
                       'appearance'    : ['beautiful', 'stylish', 'flattering', 'gorgeous', 'lovely', 'sexy', 'adorable', 'cool'],
                       'quality'       : ['durable', 'sturdy', 'heavy', 'thick', 'new', 'old', 'hard'],
                       'price'         : ['cheap', 'expensive', 'honest', 'worth'],
                       'delivery'      : ['fast', 'quick']}

# Extend each seed list with the rows tagged 'adj' in <WordNet>/<topic>/<topic>.csv.
for topic in topic_list:
    filename = topic+'.csv'
    filepath = wordnet_folderpath.joinpath(topic).joinpath(filename)

    # The csv module expects files opened with newline='' so it can handle line endings itself.
    with open(filepath, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
    # Rows with fewer than two columns (e.g. blank lines) are skipped instead of raising IndexError.
    adjectives = adjective_keywords[topic] + [pair[0] for pair in data if len(pair) > 1 and pair[1] == 'adj']
    # dict.fromkeys removes duplicates while keeping first-seen order.
    adjective_keywords[topic] = list(dict.fromkeys(adjectives))
    print(adjective_keywords[topic])

In [None]:
def find_noun_head(token):
    """Climb the dependency tree from `token` to its nearest noun-like ancestor.

    Follows `.head` links, starting at `token.head`, and returns the first ancestor
    whose part of speech is NOUN or PRON. If none is met, the ancestor whose
    dependency label is ROOT is returned instead, so callers must check the
    returned token's `pos_`.

    The original condition tested "NOUN" twice; the second test is now "PRON",
    matching the NOUN/PRON check the caller applies to the result.
    """
    head = token.head
    # Iterative walk; it always terminates because every chain of heads reaches the ROOT token.
    while head.pos_ not in ("NOUN", "PRON") and head.dep_ != "ROOT":
        head = head.head
    return head
    
def find_negation_tag(adjective):
    """Tell whether the adjective's syntactic head marks it as negated.

    Returns True when the head itself carries the `neg` dependency label, or when
    the head is an auxiliary (AUX) that has a child labelled `neg`. Every other
    configuration yields False.
    """
    governor = adjective.head
    if governor.dep_ == "neg":
        return True
    if governor.pos_ != "AUX":
        return False
    return any(dependent.dep_ == 'neg' for dependent in governor.children)

def find_span_start(sent, token):
    """Find where the punctuation-free run of tokens containing `token` begins.

    Walks left from `token` until the sentence start or a PUNCT token is reached.

    `token.i`, `sent.start` and `sent.end` are document-level positions, while
    indexing or slicing `sent` is relative to the sentence. The original code mixed
    the two (`sent[token.i-1]`), which picked the wrong token or raised IndexError
    for every sentence after the first. Neighbours are now read from `sent.doc`.

    Returns:
        The start offset relative to `sent`, ready to use in `sent[start:end]`.
        For the first sentence of a document this equals the previous return value.
    """
    doc = sent.doc
    position = token.i
    while position > sent.start and doc[position - 1].pos_ != "PUNCT":
        position -= 1
    return position - sent.start

def find_span_end(sent, token):
    """Find where the punctuation-free run of tokens containing `token` ends.

    Walks right from `token` until the sentence end or a PUNCT token is reached.
    Neighbours are read from `sent.doc` with document-level positions, for the same
    reason as in find_span_start.

    Returns:
        The exclusive end offset relative to `sent`, ready to use in `sent[start:end]`.
    """
    doc = sent.doc
    position = token.i + 1
    while position < sent.end and doc[position].pos_ != "PUNCT":
        position += 1
    return position - sent.start
    
def match_topics(noun_token, adjective_token, topic_list, noun_keywords, adjective_keywords):
    """Return the first topic whose keyword lists contain one of the two lemmas.

    Topics are tried in `topic_list` order. Within a topic the noun lemma is checked
    against `noun_keywords[topic]` first, then the adjective lemma against
    `adjective_keywords[topic]`. Returns None when no topic matches.
    """
    matching_topics = (
        candidate
        for candidate in topic_list
        if noun_token.lemma_ in noun_keywords[candidate]
        or adjective_token.lemma_ in adjective_keywords[candidate]
    )
    return next(matching_topics, None)

In [None]:
def _subject_phrase(sent, noun, adjective):
    """Pick the text that stands for the thing `adjective` describes.

    Returns the first noun chunk of `sent` whose root is `noun` and that does not
    already contain `adjective`; falls back to `noun` itself when no chunk qualifies.
    """
    for chunk in sent.noun_chunks:
        if chunk.root == noun and adjective not in chunk:
            return chunk
    return noun


def adsa_model(review, DEBUG = False, SENTI_MODEL = 'logreg'):
    """Predict a sentiment label for each aspect topic mentioned in a review.

    For every adjective in every sentence the function looks for the noun it
    describes, maps the noun/adjective pair to a topic with match_topics(), and
    scores the phrase "<adverbs> <adjective> <subject>" with the chosen sentiment
    model. When no subject is found, the adjective alone decides the topic and the
    topic name is used in place of the subject.

    Args:
        review: Review text to analyse.
        DEBUG: When True, print per-sentence diagnostics and render the dependency parse.
        SENTI_MODEL: Sentiment backend: 'logreg', 'svm' or 'spacy'.

    Returns:
        Dict with the keys 'size', 'comfort', 'appearance', 'quality', 'price' and
        'delivery'. A value is None when the topic was not detected, otherwise the
        label returned by the sentiment backend. If several adjectives map to the
        same topic, the last one wins.

    Raises:
        ValueError: If `SENTI_MODEL` is not one of the supported names (previously this
            surfaced as an unbound `prediction` once a topic was found).
    """
    # Resolve the backend up front; the functions are looked up lazily so only the chosen one must exist.
    if SENTI_MODEL == 'logreg':
        predict_sentiment = logreg_model
    elif SENTI_MODEL == 'svm':
        predict_sentiment = SVM_model
    elif SENTI_MODEL == 'spacy':
        predict_sentiment = spacy_model
    else:
        raise ValueError(f"Unknown SENTI_MODEL {SENTI_MODEL!r}; expected 'logreg', 'svm' or 'spacy'")

    #Model
    doc = nlp(review)

    topic_prediction = {'size'          : None,
                        'comfort'       : None,
                        'appearance'    : None,
                        'quality'       : None,
                        'price'         : None,
                        'delivery'      : None}

    # Stand-in noun token for adjectives without a subject; parsed at most once per call
    # instead of once per adjective.
    placeholder_noun = None

    sent_count = 0
    for sent in doc.sents:
        sent_count += 1

        descriptor_pair = []
        adjectives = [tok for tok in sent if tok.pos_ == "ADJ"]

        if DEBUG:
            pronouns = [tok for tok in sent if tok.pos_ == "PRON"]
            nouns = [tok for tok in sent if tok.pos_ == "NOUN"]
            negations = [tok for tok in sent if tok.dep_ == "neg"]
            print(f"Sentence {sent_count}: {sent}")
            displacy.render(sent, style="dep")
            print(f"Adjectives: {adjectives}")
            print(f"Nouns: {nouns}")
            print(f"Pronouns: {pronouns}")
            print(f"Negations: {negations}")

        for adjective in adjectives:
            # Reset all per-adjective state so nothing leaks over from the previous adjective
            # when the lookup below fails part-way.
            isFound = False
            topic = None
            noun = None
            noun_subj = None
            descriptor = adjective.text
            negation_tag = False
            trace = ""

            try:
                # Adverb modifiers are prepended to the adjective, e.g. "very" + "small".
                adverbs = [child.text for child in adjective.children if child.pos_ == "ADV"]
                descriptor = " ".join(adverbs + [adjective.text])

                negation_tag = find_negation_tag(adjective)

                #Direct (i.e. The dress has a beautiful colour)
                noun = find_noun_head(adjective)
                if noun.pos_ == "NOUN" or noun.pos_ == "PRON":
                    trace = f"Direct Reference Detected for Adjective: {adjective}"
                    # Previously a later, non-matching noun chunk overwrote a matching one.
                    noun_subj = _subject_phrase(sent, noun, adjective)
                    isFound = True

                #Passive Voice (i.e. Colour of the dress was beautiful)
                elif adjective.head.pos_ == "AUX" or adjective.head.pos_ == "VERB":
                    trace = f"Indirect Reference Detected for Adjective: {adjective}"
                    # As before, the last NOUN/PRON child of the verb is the one that is kept.
                    for child in adjective.head.children:
                        if child.pos_ == "NOUN" or child.pos_ == "PRON":
                            noun = child
                            noun_subj = _subject_phrase(sent, child, adjective)
                            isFound = True

                #Guessing when improper grammar is used (i.e. Colour of the dress was beautiful vs also dress colour beautiful)
                else:
                    trace = f"Guessing for Adjective: {adjective}\n"
                    start = find_span_start(sent, adjective)
                    end = find_span_end(sent, adjective)
                    extract = sent[start:end]
                    trace += f"Extract: {extract}"

                    for token in extract:
                        if token.pos_ == "NOUN" and token.dep_ == "nsubj":
                            isFound = True
                            # Use the guessed subject for topic matching too; previously `noun`
                            # still pointed at the non-noun head found above.
                            noun = token
                            noun_subj = token
                        if token.dep_ == "neg":
                            negation_tag = True
            except Exception as exc:
                # Best effort: a failed subject lookup falls back to adjective-only matching.
                isFound = False
                if DEBUG:
                    print(f"Subject lookup failed for adjective {adjective}: {exc!r}")

            if DEBUG and trace:
                print(trace)

            if isFound:
                topic = match_topics(noun, adjective, topic_list, noun_keywords, adjective_keywords)
            else:
                noun_subj = None
                if placeholder_noun is None:
                    placeholder_noun = nlp("I")[0]
                topic = match_topics(placeholder_noun, adjective, topic_list, noun_keywords, adjective_keywords)

            if topic is not None:
                # Without a subject the topic name itself completes the phrase to score.
                target = topic if noun_subj is None else noun_subj
                prediction = predict_sentiment(str(descriptor) + " " + str(target), negation_tag)

                descriptor_pair.append((descriptor, noun_subj, negation_tag, topic, prediction))
                topic_prediction[topic] = prediction

            if DEBUG:
                print(f"Descriptor_pair: {descriptor_pair}")
                print("")

    print(f"Review: {review}")
    print(f"Topic_prediction: {topic_prediction}")
    print("---------------------------------------------------------------")
    print("")

    return topic_prediction

# Step 5a: Evaluation Functions

In [None]:
def process_evaluation(spans):
    """Turn a row's annotated spans into a per-topic target dict.

    Every topic starts as None; each span marks the topic named by its lower-cased
    'label' as "Found". A float input (how a missing `spans` cell shows up) leaves
    all topics at None.
    """
    topic_prediction = dict.fromkeys(
        ['size', 'comfort', 'appearance', 'quality', 'price', 'delivery'])

    if isinstance(spans, float):
        return topic_prediction

    for annotated_span in spans:
        label = annotated_span['label'].lower()
        topic_prediction[label] = "Found"

    return topic_prediction

In [None]:
def accuracy(series):
    """Share of topics on which target and prediction agree about presence.

    `series` must provide 'topics_target' and 'topics_predicted', each a mapping from
    topic name to a value or None. A topic counts as correct when both entries are
    None or both are not None; the predicted label itself is not compared.
    """
    topic_list = ['size', 'comfort', 'appearance', 'quality', 'price', 'delivery']

    agreements = sum(
        1
        for topic in topic_list
        if (series['topics_target'][topic] is None) == (series['topics_predicted'][topic] is None)
    )

    return agreements / len(topic_list)

In [None]:
def to_list(dict):
    """Encode a per-topic dict as a 0/1 list in the fixed topic order.

    Each position holds 1 when the topic's value is not None and 0 otherwise.
    The parameter name shadows the built-in `dict`; it is kept for compatibility.
    """
    topic_list = ['size', 'comfort', 'appearance', 'quality', 'price', 'delivery']

    return [0 if dict[topic] is None else 1 for topic in topic_list]

# Step 5b: Apply Model to Evaluation Dataset (Amazon)

In [None]:
# Annotated Amazon evaluation set, read as line-delimited JSON.
filename = 'amazon_adsa_ner.jsonl'
filepath = data_folderpath / filename

amazon_test_data = pd.read_json(filepath, lines=True)

In [None]:
# Convert each row's annotated spans into a per-topic target dict.
amazon_test_data['topics_target'] = amazon_test_data['spans'].apply(process_evaluation)

In [None]:
# Run the aspect model on every review text with its defaults (DEBUG off, 'logreg' backend).
amazon_test_data['topics_predicted'] = amazon_test_data['text'].apply(adsa_model)

In [None]:
# Row-wise share of topics whose presence/absence was predicted correctly.
amazon_test_data['accuracy'] = amazon_test_data[['topics_target', 'topics_predicted']].apply(accuracy, axis=1)

In [None]:
# Encode both topic dicts as 0/1 indicator lists (one position per topic).
amazon_test_data['target_list'] = amazon_test_data['topics_target'].apply(to_list)
amazon_test_data['predicted_list'] = amazon_test_data['topics_predicted'].apply(to_list)

In [None]:
# Pull the per-row indicator lists out of the frame as plain Python lists of lists.
amazon_target_list = amazon_test_data['target_list'].tolist()
amazon_predicted_list = amazon_test_data['predicted_list'].tolist()

In [None]:
# NOTE(review): these imports sit mid-notebook rather than in the import cell at the top.
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

# Multi-label topic-detection metrics on the Amazon set, computed from the 0/1 indicator lists.
print(multilabel_confusion_matrix(amazon_target_list, amazon_predicted_list))
print(classification_report(amazon_target_list, amazon_predicted_list))

In [None]:
# Show the rows whose accuracy is not exactly 1.0, i.e. at least one topic was missed or wrongly detected.
amazon_test_data[amazon_test_data['accuracy'] != 1.0]

In [None]:
# Print one evaluation row in full for manual inspection; change `index` to look at another row.
index = 7
inspected_row = amazon_test_data.iloc[index]

for column in ('text', 'topics_target', 'topics_predicted', 'accuracy'):
    print(inspected_row[column])

In [None]:
# NOTE(review): numpy is already imported as `np` in the first cell, so this re-import is redundant.
import numpy as np

# Mean per-review accuracy over the Amazon evaluation set.
np.average(amazon_test_data['accuracy'])

# Step 6: Apply Model to Evaluation Dataset (Shopee)

In [None]:
# Annotated Shopee evaluation set, read as line-delimited JSON.
filename = 'shopee_adsa_ner.jsonl'
filepath = data_folderpath / filename

shopee_test_data = pd.read_json(filepath, lines=True)

In [None]:
# Convert each row's annotated spans into a per-topic target dict.
shopee_test_data['topics_target'] = shopee_test_data['spans'].apply(process_evaluation)

In [None]:
# Run the aspect model on every review text with its defaults (DEBUG off, 'logreg' backend).
shopee_test_data['topics_predicted'] = shopee_test_data['text'].apply(adsa_model)

In [None]:
# Row-wise share of topics whose presence/absence was predicted correctly.
shopee_test_data['accuracy'] = shopee_test_data[['topics_target', 'topics_predicted']].apply(accuracy, axis=1)

In [None]:
# Encode both topic dicts as 0/1 indicator lists (one position per topic).
shopee_test_data['target_list'] = shopee_test_data['topics_target'].apply(to_list)
shopee_test_data['predicted_list'] = shopee_test_data['topics_predicted'].apply(to_list)

In [None]:
# Pull the per-row indicator lists out of the frame as plain Python lists of lists.
shopee_target_list = shopee_test_data['target_list'].tolist()
shopee_predicted_list = shopee_test_data['predicted_list'].tolist()

In [None]:
# NOTE(review): these imports repeat the ones in the Amazon evaluation cell above.
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

# Multi-label topic-detection metrics on the Shopee set, computed from the 0/1 indicator lists.
print(multilabel_confusion_matrix(shopee_target_list, shopee_predicted_list))
print(classification_report(shopee_target_list, shopee_predicted_list))

In [None]:
# Mean per-review accuracy over the Shopee evaluation set.
np.average(shopee_test_data['accuracy'])

# Step 7: Apply Model to Selected Brands

In [None]:
# Brand to analyse; the comparison below is an exact, case-sensitive string match.
brand_name = "adidas"

# List the metadata rows (products) of that brand.
print(meta[meta["brand"]==brand_name])

In [None]:
# Keep only the reviews whose merged metadata carries the selected brand.
reduced_review_with_meta = review_with_meta[review_with_meta["brand"]==brand_name]

In [None]:
# Show the brand-filtered reviews as the cell output.
reduced_review_with_meta

In [None]:
# Print every review of the selected brand with its context, then run the aspect model on it
# (adsa_model prints its own prediction summary).
for index in range(len(reduced_review_with_meta)):
    review_row = reduced_review_with_meta.iloc[index]
    # str() also turns missing values into text, so adsa_model always receives a string.
    brand = str(review_row["brand"])
    product = str(review_row["title"])
    review = str(review_row["reviewText"])
    rating = str(review_row["overall"])

    for label, value in (("Index", index),
                         ("Brand", brand),
                         ("Product", product),
                         ("Review Text", review),
                         ("Product Rating", rating)):
        print(f"{label}: {value}")
    adsa_model(review)