### Clothes

In [1]:
import ast
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import torch
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict, Counter
from matplotlib.cm import get_cmap
from matplotlib.colors import to_hex
from tqdm import tqdm

In [2]:
# Dataset selection and the closed vocabularies used to validate extracted triplets.
dataset_name = "Clothes"
dataset_dir = f"/data/common/RecommendationDatasets/{dataset_name}_Amazon14/topics/"

# Accepted topic short names.
topics = [
    "fit", "material", "comfort", "appearance", "construction",
    "price", "care", "functionality", "shipping", "service",
]

# Accepted sentiment labels.
sentiments = ["positive", "negative", "neutral"]

In [3]:
# Load the per-review table from the dataset directory and preview its first rows.
statement_csv_path = os.path.join(dataset_dir, "statement.csv")
data_df = pd.read_csv(statement_csv_path)
data_df.head()

Unnamed: 0,helpful,reviewTime,rating,timestamp,review,user_name,user_id,item_id,review_title,statements
0,"[0, 0]","02 12, 2011",5.0,1297468800,This is a great tutu and at a really great pri...,"Amazon Customer ""cameramom""",A1KLRMWW2FWPL4,31887,Great tutu- not cheaply made,[{'statement': 'has a low price relative to ex...
1,"[0, 0]","01 19, 2013",5.0,1358553600,I bought this for my 4 yr old daughter for dan...,Amazon Customer,A2G5TCU2WDFZ65,31887,Very Cute!!,[{'statement': 'is suitable for 4-year-old chi...
2,"[0, 0]","01 4, 2013",5.0,1357257600,What can I say... my daughters have it in oran...,Carola,A1RLQXYNCMWRWN,31887,I have buy more than one,"[{'statement': 'comes in multiple colors', 'to..."
3,"[0, 0]","04 27, 2014",5.0,1398556800,"We bought several tutus at once, and they are ...",Caromcg,A8U3FAMSJVHS5,31887,"Adorable, Sturdy","[{'statement': 'has high reviews', 'topic': 's..."
4,"[0, 0]","03 15, 2014",5.0,1394841600,Thank you Halo Heaven great product for Little...,CJ,A3GEOILWLK86XM,31887,Grammy's Angels Love it,"[{'statement': 'is well-made', 'topic': 'const..."


In [4]:
# One entry per review: the content of the "statements" column as a plain list.
statement_triplets = data_df["statements"].to_list()
print(statement_triplets[:5])

["[{'statement': 'has a low price relative to expectations', 'topic': 'price', 'sentiment': 'positive'}, {'statement': 'does not look cheap', 'topic': 'appearance', 'sentiment': 'positive'}, {'statement': 'is not poorly made', 'topic': 'construction', 'sentiment': 'positive'}]", "[{'statement': 'is suitable for 4-year-old children', 'topic': 'fit', 'sentiment': 'neutral'}, {'statement': 'is considered adorable by a teacher', 'topic': 'appearance', 'sentiment': 'positive'}, {'statement': 'matches color of a light blue long sleeve leotard', 'topic': 'appearance', 'sentiment': 'positive'}, {'statement': 'has a competitive price compared to similar products', 'topic': 'price', 'sentiment': 'positive'}]", "[{'statement': 'comes in multiple colors', 'topic': 'appearance', 'sentiment': 'positive'}, {'statement': 'is comfortable to wear', 'topic': 'comfort', 'sentiment': 'positive'}, {'statement': 'looks great', 'topic': 'appearance', 'sentiment': 'positive'}, {'statement': 'is easy to wear', 

In [6]:
# Preview the extracted triplets review by review; the loop stops after the
# first review that brings the number of printed triplets above 40.
n_shown = 0
for raw_triplets in statement_triplets:
    for triplet in ast.literal_eval(raw_triplets):
        statement, topic, sentiment = [
            triplet.get(field).lower().strip()
            for field in ("statement", "topic", "sentiment")
        ]

        print(statement.center(60), topic.center(20), sentiment.center(10), sep="|\t\t\t")
        n_shown += 1

    if n_shown > 40:
        break

          has a low price relative to expectations          |			       price        |			 positive 
                    does not look cheap                     |			     appearance     |			 positive 
                     is not poorly made                     |			    construction    |			 positive 
            is suitable for 4-year-old children             |			        fit         |			 neutral  
            is considered adorable by a teacher             |			     appearance     |			 positive 
     matches color of a light blue long sleeve leotard      |			     appearance     |			 positive 
    has a competitive price compared to similar products    |			       price        |			 positive 
                  comes in multiple colors                  |			     appearance     |			 positive 
                   is comfortable to wear                   |			      comfort       |			 positive 
                        looks great                         |			     appearance     |			 positive 
          

In [5]:
def _normalize_field(value, unify_separators=False):
    """Return `value` lowercased and stripped, or None if it is empty or not a string."""
    if not value or not isinstance(value, str):
        return None
    text = value.lower()
    if unify_separators:
        # "-" and "_" are treated as spaces so that spelling variants collapse
        text = text.replace("-", " ").replace("_", " ")
    return text.strip()


def _count_triplet(bucket, topic, sentiment, statement):
    """Increment bucket[topic][sentiment][statement], creating missing levels."""
    by_statement = bucket.setdefault(topic, {}).setdefault(sentiment, {})
    by_statement[statement] = by_statement.get(statement, 0) + 1


def process_triplets(statement_triplets, topics, sentiments):
    """Count extracted triplets, split by whether their topic/sentiment is accepted.

    Args:
        statement_triplets: iterable with one entry per review. Each entry is
            either a list of triplet dicts (keys "statement", "topic",
            "sentiment") or the string literal of such a list.
        topics: collection of accepted topic names.
        sentiments: collection of accepted sentiment labels.

    Returns:
        A 4-tuple of nested dicts ``{topic: {sentiment: {statement: count}}}``:
        ``(correct_triplets, wrong_topics, wrong_sentiments,
        wrong_topics_and_sentiments)``.

    Entries that are empty or cannot be parsed into a list, triplets that are
    not dicts, and triplets with a missing or non-string field are skipped.
    """
    accepted_topics = set(topics)
    accepted_sentiments = set(sentiments)

    correct_triplets = {}
    wrong_topics = {}
    wrong_sentiments = {}
    wrong_topics_and_sentiments = {}

    for triplet_list in statement_triplets:
        if not triplet_list:
            continue
        if not isinstance(triplet_list, list):
            try:
                triplet_list = ast.literal_eval(triplet_list)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # the exceptions literal_eval documents for malformed input
                # (this includes NaN cells, which arrive as floats)
                continue
            if not isinstance(triplet_list, (list, tuple)):
                continue

        for triplet in triplet_list:
            if not triplet or not isinstance(triplet, dict):
                continue

            topic = _normalize_field(triplet.get("topic"), unify_separators=True)
            sentiment = _normalize_field(triplet.get("sentiment"))
            statement = _normalize_field(triplet.get("statement"), unify_separators=True)
            if topic is None or sentiment is None or statement is None:
                continue

            accepted_topic = topic in accepted_topics
            accepted_sentiment = sentiment in accepted_sentiments

            if accepted_topic and accepted_sentiment:
                bucket = correct_triplets
            elif accepted_sentiment:
                bucket = wrong_topics
            elif accepted_topic:
                bucket = wrong_sentiments
            else:
                bucket = wrong_topics_and_sentiments
            _count_triplet(bucket, topic, sentiment, statement)

    return correct_triplets, wrong_topics, wrong_sentiments, wrong_topics_and_sentiments

In [7]:
# Split all extracted triplets into four buckets: accepted, wrong topic,
# wrong sentiment, and wrong topic + wrong sentiment.
correct_triplets, wrong_topics, wrong_sentiments, wrong_topics_and_sentiments = process_triplets(
    statement_triplets, topics, sentiments
)

In [8]:
# Number of distinct topic keys in each of the four buckets.
len(correct_triplets), len(wrong_topics), len(wrong_sentiments), len(wrong_topics_and_sentiments)

(10, 156, 4, 1)

In [9]:
def build_topic_stats_df(topics):
    """Tabulate a ``{topic: {sentiment: {statement: count}}}`` mapping.

    The result has one row per topic (sorted) and, for every sentiment found
    anywhere in the mapping (sorted) plus a trailing "TOTAL" group, two
    columns: "unique" (number of distinct statements) and "total" (sum of
    their counts). All cells are integers.
    """
    topic_names = sorted(topics)
    sentiment_names = sorted({sent for per_sentiment in topics.values() for sent in per_sentiment})

    header = [(sent, kind) for sent in sentiment_names for kind in ("unique", "total")]
    header += [("TOTAL", "unique"), ("TOTAL", "total")]

    table = []
    for name in topic_names:
        per_sentiment = topics[name]
        line = []
        n_unique = 0
        n_total = 0
        for sent in sentiment_names:
            counts = per_sentiment.get(sent, {})
            distinct = len(counts)
            occurrences = int(sum(counts.values())) if counts else 0
            line += [distinct, occurrences]
            n_unique += distinct
            n_total += occurrences
        line += [n_unique, n_total]
        table.append(line)

    stats = pd.DataFrame(table, index=topic_names, columns=pd.MultiIndex.from_tuples(header))
    return stats.fillna(0).astype(int)

In [10]:
# Per-topic unique/total statement counts for the accepted triplets.
correct_triplets_df_stats = build_topic_stats_df(correct_triplets)
print(correct_triplets_df_stats)

              negative         neutral        positive           TOTAL        
                unique   total  unique  total   unique   total  unique   total
appearance       19273   26988   10310  12251    62575  153030   92158  192269
care              6262    7645    5588   6429     9408   17864   21258   31938
comfort          18423   25253    4636   5625    44529  130134   67588  161012
construction     28632   35486    6638   7327    25351   57713   60621  100526
fit              59483  100116   35525  45372    67915  141357  162923  286845
functionality    21441   24315   14137  15163    96933  143768  132511  183246
material         19044   28719   11904  15966    33158   82619   64106  127304
price             6596   11888    4046   5300    16909   57990   27551   75178
service           4992    6634    2239   2466    10679   19152   17910   28252
shipping          3598    4832    2693   3211     5630   18952   11921   26995


In [11]:
# Column-wise sums over all accepted topics.
correct_triplets_df_stats.sum(axis=0)

negative  unique     187744
          total      271876
neutral   unique      97716
          total      119110
positive  unique     373087
          total      822579
TOTAL     unique     658547
          total     1213565
dtype: int64

In [12]:
# Triplets whose topic is accepted but whose sentiment label is not.
wrong_sentiments

{'comfort': {'mixed': {'is comfortable except for a specific issue': 1,
   'are comfortable except for toe touch': 1}},
 'functionality': {'suggestion': {'have zip off pant legs': 1}},
 'service': {'cautionary': {'require caution': 1}},
 'care': {'unknown': {'has not shrunk after washing': 1}}}

In [13]:
# Per-topic unique/total statement counts for the triplets with an unaccepted topic.
wrong_topics_df_stats = build_topic_stats_df(wrong_topics)
print(wrong_topics_df_stats)

                negative       neutral       positive        TOTAL      
                  unique total  unique total   unique total unique total
accessories            1     1       0     0        1     1      2     2
accuracy               8     8       9     9        8     8     25    25
assortment             0     0       0     0        1     1      1     1
authenticity           0     0       0     0        3     3      3     3
availability         151   173     106   115       53    56    310   344
...                  ...   ...     ...   ...      ...   ...    ...   ...
wear                   0     0       2     2        0     0      2     2
wear and tear          1     1       0     0        0     0      1     1
weather                1     1       2     2        1     1      4     4
weight                76   129      81   124      106   171    263   424
wind resistance        0     0       0     0        1     1      1     1

[156 rows x 8 columns]


In [14]:
# Column-wise sums over all unaccepted topics.
wrong_topics_df_stats.sum(axis=0)

negative  unique     766
          total      903
neutral   unique     726
          total      806
positive  unique    1905
          total     2326
TOTAL     unique    3397
          total     4035
dtype: int64

In [15]:
# Share (in %) of wrong-topic counts among wrong-topic + accepted counts, per column.
wrong_col_totals = wrong_topics_df_stats.sum(axis=0)
correct_col_totals = correct_triplets_df_stats.sum(axis=0)
(wrong_col_totals / (wrong_col_totals + correct_col_totals)) * 100

negative  unique    0.406344
          total     0.331037
neutral   unique    0.737490
          total     0.672137
positive  unique    0.508011
          total     0.281972
TOTAL     unique    0.513185
          total     0.331390
dtype: float64

In [16]:
# Number of unique statements per unaccepted topic, as a plain dict.
wrong_topics_stats = wrong_topics_df_stats[("TOTAL", "unique")].to_dict()
print(wrong_topics_stats)

{'accessories': 2, 'accuracy': 25, 'assortment': 1, 'authenticity': 3, 'availability': 310, 'battery': 3, 'battery life': 1, 'brand': 238, 'brand loyalty': 12, 'brand preference': 2, 'brand reputation': 18, 'breathability': 3, 'capacity': 3, 'cold resistance': 1, 'collection': 1, 'color': 59, 'color accuracy': 32, 'color options': 1, 'color retention': 2, 'colorfastness': 7, 'comparison': 16, 'compatibility': 1, 'concept': 1, 'condition': 3, 'culture': 1, 'description': 5, 'design': 7, 'desire': 3, 'display': 1, 'durability': 492, 'ecodrive': 1, 'emotional value': 1, 'experience': 5, 'eyelash skirt': 1, 'fragrance': 1, 'frequency': 3, 'frequency of use': 2, 'frequency of wear': 1, 'freshness': 1, 'general': 150, 'general opinion': 2, 'general quality': 1, 'gift giving': 1, 'health': 14, 'height': 2, 'history': 8, 'identification': 1, 'information': 8, 'inseam': 4, 'insoles': 1, 'insulation': 3, 'inventory': 1, 'label': 1, 'labeling': 3, 'labor practices': 3, 'language': 1, 'length': 16

In [17]:
# Keep only the unaccepted topics that carry at least this many unique statements.
min_unique_statements = 10
remain_wrong_topics = [
    topic
    for topic, n_unique in wrong_topics_stats.items()
    if n_unique >= min_unique_statements
]
print(remain_wrong_topics)
print(len(remain_wrong_topics))

['accuracy', 'availability', 'brand', 'brand loyalty', 'brand reputation', 'color', 'color accuracy', 'comparison', 'durability', 'general', 'health', 'length', 'none', 'overall', 'overall quality', 'packaging', 'product', 'purchase', 'quality', 'quantity', 'recommendation', 'shopping', 'size', 'sound', 'style', 'temperature', 'unknown', 'usage', 'value', 'water resistance', 'waterproof', 'waterproofing', 'weight']
33


In [18]:
# try to assign correct topic if possible
# For every unaccepted topic, print up to four example statements per sentiment.
# NOTE(review): the ascending sort by count surfaces the *least* frequent
# statements as examples -- confirm this is intended rather than the most frequent.
for topic, by_sentiment in wrong_topics.items():
    print(topic)
    for sent, counts in by_sentiment.items():
        ranked = sorted(counts.items(), key=lambda item: item[1])
        statements = [text for text, _ in ranked[:4]]
        print("\t", sent, ":", "\n\t\t".join(statements))

durability
	 negative : may not withstand heavy use
		would not last more than 6 months for construction work
		only gets a few wears
		has a limited lifespan
	 neutral : will probably last as long as his attention
		may not last long
		gets about 1 year out of a pair
		are expected to last one night
	 positive : are worn until they are in poor condition
		expect to need a new pair soon due to frequent wear
		can be worn for a year or more
		lasts for a year with heavy use
language
	 neutral : no english spoken, only italian
size
	 positive : is small and lightweight
		are small enough to wear to work
		is small and easy to carry
		is smaller than most ditty bags
	 negative : is very small
		is large
		is not big enough for all products
		is too big for personal use
	 neutral : comes in size 32x31
		is available in kids' size 1
		is very big
		bag is large
weight
	 positive : is not as heavy as other coats
		strap is not heavy
		remains under weight limit
		is heavy, in a positive way


**Prompt** : ```The extraction of triplets (atomic statement, topic, sentiment) from reviews produced topics that are not in the accepted topics list (represented by their short name). I will give you the list of incorrect topics, with a few example statements per sentiment. Analyze the statements and estimate the closest accepted topic it can be assigned to. If no statement is provided, do not assign anything; and if an assignment is impossible, do not assign one either. Here are the examples [EXAMPLES]. Here is the list of accepted topics for the Amazon Clothes dataset:
ACCEPTED TOPICS (short name (long name): description) [TOPICS]
You must output a JSON object; keys are the incorrect topics and their values are the short name of the accepted topic they’re assigned to. Use None for incorrect topics that are not assigned.```

In [19]:
# Proposed re-assignment of each unaccepted topic to an accepted topic.
# None means the topic is left unassigned.
# NOTE(review): presumably GPT's answer to the prompt above -- inferred from
# the variable name only; confirm provenance and model version.
gpt_assignation = {
    "durability": "construction",
    "language": None,
    "size": "fit",
    "weight": "comfort",
    "usage": None,
    "availability": None,
    "overall": None,
    "brand reputation": None,
    "brand": None,
    "history": None,
    "recommendation": None,
    "frequency": None,
    "quantity": None,
    "color accuracy": "appearance",
    "none": None,
    "color": "appearance",
    "shopping": None,
    "water resistance": "functionality",
    "manufacturing": None,
    "temperature": "functionality",
    "general": None,
    "value": "price",
    "sound": None,
    "overall quality": None,
    "quality": None,
    "colorfastness": "care",
    "accuracy": "functionality",
    "desire": None,
    "packaging": "shipping",
    "length": "fit",
    "production location": None,
    "waterproofing": "functionality",
    "labeling": None,
    "product": None,
    "manufacturing location": None,
    "wear": None,
    "color retention": "care",
    "style": "appearance",
    "name": None,
    "product lifecycle": None,
    "lume": "functionality",
    "gift giving": None,
    "overall impression": None,
    "brand loyalty": None,
    "waterproof": "functionality",
    "waterproofness": "functionality",
    "noise": None,
    "warmth": "functionality",
    "temperature regulation": "functionality",
    "authenticity": None,
    "visibility": "functionality",
    "accessories": "functionality",
    "comparison": None,
    "product overall": None,
    "product line": None,
    "health": "comfort",
    "size and portability": "functionality",
    "concept": None,
    "security": "functionality",
    "target audience": None,
    "inventory": None,
    "stability": "functionality",
    "warranty": "service",
    "design": "functionality",
    "unknown": None,
    "description": "service",
    "purchase": None,
    "self perception": None,
    "occasion": None,
    "personal preference": None,
    "pain": "comfort",
    "water exposure": "functionality",
    "identification": None,
    "general opinion": None,
    "insoles": "construction",
    "longevity": "construction",
    "temperature tolerance": "functionality",
    "luxury lane products": None,
    "condition": "shipping",
    "product name": None,
    "insulation": "functionality",
    "cold resistance": "functionality",
    "slipperiness": "functionality",
    "culture": None,
    "information": None,
    "reliability": "construction",
    "wear and tear": "construction",
    "location": None,
    "purchase intention": None,
    "marketing": None,
    "capacity": "functionality",
    "collection": None,
    "meaningfulness": None,
    "ecodrive": "functionality",
    "timekeeping": "functionality",
    "organization": "functionality",
    "battery life": "functionality",
    "portability": "functionality",
    "seller": None,
    "sentiment": None,
    "seasonal usage": "functionality",
    "breathability": "comfort",
    "safety": "functionality",
    "seasonality": "functionality",
    "shopping experience": None,
    "experience": None,
    "eyelash skirt": None,
    "weather": "functionality",
    "light": "functionality",
    "returns": "service",
    "frequency of use": None,
    "order": None,
    "memory": None,
    "height": "fit",
    "assortment": None,
    "label": None,
    "return": "service",
    "temperature suitability": "functionality",
    "freshness": None,
    "labor practices": None,
    "inseam": "fit",
    "traction": "functionality",
    "brand preference": None,
    "subjective": None,
    "purpose": None,
    "product quality": None,
    "water repellency": "functionality",
    "uniqueness": "appearance",
    "social impact": None,
    "emotional value": None,
    "frequency of wear": None,
    "thickness": "material",
    "seasonal suitability": "functionality",
    "origin": None,
    "general quality": None,
    "popularity": None,
    "display": "functionality",
    "battery": "functionality",
    "temperature retention": "functionality",
    "wind resistance": "functionality",
    "support": "comfort",
    "waist": "fit",
    "sleeves": "appearance",
    "topic": None,
    "personality": None,
    "ownership": None,
    "overall satisfaction": None,
    "pity": None,
    "production": None,
    "research": None,
    "sleeve length": "fit",
    "purchase quantity": None,
    "fragrance": None,
    "color options": "appearance",
    "product knowledge": None,
    "compatibility": "functionality"
}

In [20]:
# Proposed re-assignment of each unaccepted topic to an accepted topic.
# None means the topic is left unassigned.
# NOTE(review): presumably Claude's answer to the prompt above -- inferred from
# the variable name only; confirm provenance and model version.
claude_assignation = {
  "durability": "construction",
  "language": None,
  "size": "fit",
  "weight": "material",
  "usage": None,
  "availability": None,
  "overall": None,
  "brand reputation": None,
  "brand": None,
  "history": None,
  "recommendation": None,
  "frequency": None,
  "quantity": None,
  "color accuracy": "appearance",
  "none": None,
  "color": "appearance",
  "shopping": None,
  "water resistance": "functionality",
  "manufacturing": None,
  "temperature": "functionality",
  "general": None,
  "value": "price",
  "sound": None,
  "overall quality": None,
  "quality": None,
  "colorfastness": "care",
  "accuracy": None,
  "desire": None,
  "packaging": "shipping",
  "length": "fit",
  "production location": None,
  "waterproofing": "functionality",
  "labeling": None,
  "product": None,
  "manufacturing location": None,
  "wear": None,
  "color retention": "care",
  "style": "appearance",
  "name": None,
  "product lifecycle": None,
  "lume": None,
  "gift giving": None,
  "overall impression": None,
  "brand loyalty": None,
  "waterproof": "functionality",
  "waterproofness": "functionality",
  "noise": None,
  "warmth": "functionality",
  "temperature regulation": "functionality",
  "authenticity": None,
  "visibility": None,
  "accessories": None,
  "comparison": None,
  "product overall": None,
  "product line": None,
  "health": None,
  "size and portability": "fit",
  "concept": None,
  "security": "functionality",
  "target audience": None,
  "inventory": None,
  "stability": None,
  "warranty": None,
  "design": "appearance",
  "unknown": None,
  "description": None,
  "purchase": None,
  "self perception": None,
  "occasion": None,
  "personal preference": None,
  "pain": "comfort",
  "water exposure": None,
  "identification": None,
  "general opinion": None,
  "insoles": "comfort",
  "longevity": "construction",
  "temperature tolerance": "functionality",
  "luxury lane products": None,
  "condition": None,
  "product name": None,
  "insulation": "functionality",
  "cold resistance": "functionality",
  "slipperiness": None,
  "culture": None,
  "information": None,
  "reliability": "construction",
  "wear and tear": "construction",
  "location": None,
  "purchase intention": None,
  "marketing": None,
  "capacity": "functionality",
  "collection": None,
  "meaningfulness": None,
  "ecodrive": None,
  "timekeeping": None,
  "organization": None,
  "battery life": None,
  "portability": "functionality",
  "seller": "service",
  "sentiment": None,
  "seasonal usage": "functionality",
  "breathability": "comfort",
  "safety": None,
  "seasonality": "functionality",
  "shopping experience": None,
  "experience": None,
  "eyelash skirt": None,
  "weather": "functionality",
  "light": None,
  "returns": "service",
  "frequency of use": None,
  "order": None,
  "memory": None,
  "height": "fit",
  "assortment": None,
  "label": None,
  "return": "service",
  "temperature suitability": "functionality",
  "freshness": None,
  "labor practices": None,
  "inseam": "fit",
  "traction": "functionality",
  "brand preference": None,
  "subjective": None,
  "purpose": None,
  "product quality": None,
  "water repellency": "functionality",
  "uniqueness": None,
  "social impact": None,
  "emotional value": None,
  "frequency of wear": None,
  "thickness": "material",
  "seasonal suitability": "functionality",
  "origin": None,
  "general quality": None,
  "popularity": None,
  "display": None,
  "battery": None,
  "temperature retention": "functionality",
  "wind resistance": "functionality",
  "support": "comfort",
  "waist": "fit",
  "sleeves": "fit",
  "topic": None,
  "personality": None,
  "ownership": None,
  "overall satisfaction": None,
  "pity": None,
  "production": None,
  "research": None,
  "sleeve length": "fit",
  "purchase quantity": None,
  "fragrance": None,
  "color options": "appearance",
  "product knowledge": None,
  "compatibility": "functionality"
}

In [21]:
# Proposed re-assignment of each unaccepted topic to an accepted topic.
# None means the topic is left unassigned.
# NOTE(review): presumably Gemini's answer to the prompt above -- inferred from
# the variable name only; confirm provenance and model version.
gemini_assignation = {
  "durability": "construction",
  "language": None,
  "size": "fit",
  "weight": "material",
  "usage": None,
  "availability": None,
  "overall": None,
  "brand reputation": None,
  "brand": None,
  "history": None,
  "recommendation": None,
  "frequency": None,
  "quantity": None,
  "color accuracy": "appearance",
  "none": None,
  "color": "appearance",
  "shopping": None,
  "water resistance": "functionality",
  "manufacturing": None,
  "temperature": "functionality",
  "general": None,
  "value": "price",
  "sound": None,
  "overall quality": "construction",
  "quality": "construction",
  "colorfastness": "care",
  "accuracy": None,
  "desire": None,
  "packaging": "shipping",
  "length": "fit",
  "production location": None,
  "waterproofing": "functionality",
  "labeling": None,
  "product": None,
  "manufacturing location": None,
  "wear": None,
  "color retention": "care",
  "style": "appearance",
  "name": None,
  "product lifecycle": None,
  "lume": None,
  "gift giving": None,
  "overall impression": None,
  "brand loyalty": None,
  "waterproof": "functionality",
  "waterproofness": "functionality",
  "noise": None,
  "warmth": "functionality",
  "temperature regulation": "functionality",
  "authenticity": None,
  "visibility": "functionality",
  "accessories": None,
  "comparison": None,
  "product overall": None,
  "product line": None,
  "health": None,
  "size and portability": "fit",
  "concept": None,
  "security": "functionality",
  "target audience": None,
  "inventory": None,
  "stability": "construction",
  "warranty": "service",
  "design": None,
  "unknown": None,
  "description": None,
  "purchase": None,
  "self perception": None,
  "occasion": None,
  "personal preference": None,
  "pain": None,
  "water exposure": None,
  "identification": None,
  "general opinion": None,
  "insoles": "construction",
  "longevity": "construction",
  "temperature tolerance": "functionality",
  "luxury lane products": None,
  "condition": None,
  "product name": None,
  "insulation": "functionality",
  "cold resistance": "functionality",
  "slipperiness": "functionality",
  "culture": None,
  "information": None,
  "reliability": "construction",
  "wear and tear": "construction",
  "location": None,
  "purchase intention": None,
  "marketing": None,
  "capacity": "functionality",
  "collection": None,
  "meaningfulness": None,
  "ecodrive": "functionality",
  "timekeeping": "functionality",
  "organization": "functionality",
  "battery life": "functionality",
  "portability": "fit",
  "seller": None,
  "sentiment": None,
  "seasonal usage": "functionality",
  "breathability": "comfort",
  "safety": "construction",
  "seasonality": "functionality",
  "shopping experience": None,
  "experience": None,
  "eyelash skirt": None,
  "weather": "functionality",
  "light": "functionality",
  "returns": "service",
  "frequency of use": None,
  "order": None,
  "memory": None,
  "height": "fit",
  "assortment": None,
  "label": None,
  "return": "service",
  "temperature suitability": "functionality",
  "freshness": None,
  "labor practices": None,
  "inseam": "fit",
  "traction": "functionality",
  "brand preference": None,
  "subjective": None,
  "purpose": None,
  "product quality": "construction",
  "water repellency": "functionality",
  "uniqueness": None,
  "social impact": None,
  "emotional value": None,
  "frequency of wear": None,
  "thickness": "material",
  "seasonal suitability": "functionality",
  "origin": None,
  "general quality": "construction",
  "popularity": None,
  "display": "functionality",
  "battery": "functionality",
  "temperature retention": "functionality",
  "wind resistance": "functionality",
  "support": "construction",
  "waist": "fit",
  "sleeves": "fit",
  "topic": None,
  "personality": None,
  "ownership": None,
  "overall satisfaction": None,
  "pity": None,
  "production": None,
  "research": None,
  "sleeve length": "fit",
  "purchase quantity": None,
  "fragrance": None,
  "color options": "appearance",
  "product knowledge": None,
  "compatibility": "functionality"
}

In [22]:
# Keep a re-assignment only when all three proposals name the same accepted topic.
final_assignation = {}
for topic in wrong_topics:
    votes = {
        gpt_assignation.get(topic),
        claude_assignation.get(topic),
        gemini_assignation.get(topic),
    }
    # a single distinct vote means the three proposals are identical
    if len(votes) != 1:
        continue
    (winner,) = votes
    if winner is not None:
        final_assignation[topic] = winner

In [23]:
# Unaccepted topics for which all three proposals agree on the same accepted topic.
final_assignation

{'durability': 'construction',
 'size': 'fit',
 'color accuracy': 'appearance',
 'color': 'appearance',
 'water resistance': 'functionality',
 'temperature': 'functionality',
 'value': 'price',
 'colorfastness': 'care',
 'packaging': 'shipping',
 'length': 'fit',
 'waterproofing': 'functionality',
 'color retention': 'care',
 'style': 'appearance',
 'waterproof': 'functionality',
 'waterproofness': 'functionality',
 'warmth': 'functionality',
 'temperature regulation': 'functionality',
 'security': 'functionality',
 'longevity': 'construction',
 'temperature tolerance': 'functionality',
 'insulation': 'functionality',
 'cold resistance': 'functionality',
 'reliability': 'construction',
 'wear and tear': 'construction',
 'capacity': 'functionality',
 'seasonal usage': 'functionality',
 'breathability': 'comfort',
 'seasonality': 'functionality',
 'weather': 'functionality',
 'returns': 'service',
 'height': 'fit',
 'return': 'service',
 'temperature suitability': 'functionality',
 'inse

In [24]:
# Merge the counts of the re-assigned topics into the accepted-topic counts.
# The copy is built level by level: dict.copy() is shallow, so the nested
# sentiment/statement dicts would still be shared with `correct_triplets`;
# the merge would then modify `correct_triplets` too, and re-running this
# cell would add the same counts a second time.
extend_correct_triplets = {
    topic: {sent: dict(counts) for sent, counts in by_sentiment.items()}
    for topic, by_sentiment in correct_triplets.items()
}
for wrong, correct in final_assignation.items():
    # setdefault: the target topic may have no accepted triplets of its own
    target = extend_correct_triplets.setdefault(correct, {})
    for sent, counts in wrong_topics[wrong].items():
        merged = target.setdefault(sent, {})
        for statement, n_occurrences in counts.items():
            merged[statement] = merged.get(statement, 0) + n_occurrences

In [25]:
# Per-topic unique/total statement counts after merging the re-assigned topics.
extend_correct_triplets_df_stats = build_topic_stats_df(extend_correct_triplets)
print(extend_correct_triplets_df_stats)

              negative         neutral        positive           TOTAL        
                unique   total  unique  total   unique   total  unique   total
appearance       19294   27012   10325  12266    62620  153106   92239  192384
care              6266    7649    5589   6430     9411   17868   21266   31947
comfort          18424   25254    4636   5625    44530  130136   67590  161015
construction     28740   35683    6686   7380    25558   58050   60984  101113
fit              59530  100201   35603  45489    68037  141573  163170  287263
functionality    21519   24397   14166  15193    97147  144134  132832  183724
material         19044   28719   11905  15967    33158   82619   64107  127305
price             6599   11895    4051   5306    16943   58051   27593   75252
service           4993    6636    2239   2466    10680   19153   17912   28255
shipping          3600    4834    2697   3216     5640   18962   11937   27012


In [26]:
# Column-wise sums over all accepted topics after the merge.
extend_correct_triplets_df_stats.sum(axis=0)

negative  unique     188009
          total      272280
neutral   unique      97897
          total      119338
positive  unique     373724
          total      823652
TOTAL     unique     659630
          total     1215270
dtype: int64

In [27]:
def process_dataset(data_df, topics, corrected_topics, sentiments):
    """Clean the ``statements`` column and drop interactions left without usable data.

    Each cell of ``data_df["statements"]`` is parsed with ``ast.literal_eval``
    and is expected to evaluate to a list of dicts with the keys
    ``"statement"``, ``"topic"`` and ``"sentiment"``. For every triplet:

    * the topic is lower-cased, ``-``/``_`` are replaced by spaces and it is
      stripped; it is kept only if it is one of ``topics`` or a key of
      ``corrected_topics`` (in which case it is replaced by the mapped value);
    * the sentiment is lower-cased and stripped and must be one of ``sentiments``;
    * the statement is normalised like the topic and must be non-empty in the
      raw data.

    Parameters
    ----------
    data_df : pd.DataFrame
        Frame with at least the columns ``user_id``, ``item_id``,
        ``timestamp``, ``rating`` and ``statements``. It is not modified.
    topics : list of str
        Canonical topic names.
    corrected_topics : dict
        Maps a non-canonical topic name to the canonical topic replacing it.
    sentiments : list of str
        Accepted sentiment labels.

    Returns
    -------
    pd.DataFrame
        A new frame with every original column, ``statements`` replaced by the
        cleaned list of triplets, and without the rows that have a missing
        ``user_id``, ``item_id``, ``timestamp``, ``rating`` or no valid triplet.
    """
    # Built once: the original re-created `topics + corrected_topics` as a list
    # for every single triplet.
    valid_topics = set(topics) | set(corrected_topics)

    def normalise(text):
        return text.lower().replace("-", " ").replace("_", " ").strip()

    def clean_cell(raw_cell):
        """Return the cleaned triplet list for one cell, or None if nothing is usable."""
        try:
            triplet_list = ast.literal_eval(raw_cell)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Exactly the errors literal_eval documents for malformed input
            # (this also covers NaN cells). A bare `except:` would additionally
            # swallow KeyboardInterrupt.
            return None
        if not isinstance(triplet_list, (list, tuple)):
            return None

        cleaned = []
        for triplet in triplet_list:
            if not isinstance(triplet, dict):
                continue

            topic = triplet.get("topic")
            if not topic or not isinstance(topic, str):
                continue
            topic = normalise(topic)
            if topic not in valid_topics:
                continue
            # Non-canonical labels are folded into their canonical topic.
            topic = corrected_topics.get(topic, topic)

            sentiment = triplet.get("sentiment")
            if not isinstance(sentiment, str):
                continue
            sentiment = sentiment.lower().strip()
            if sentiment not in sentiments:
                continue

            statement = triplet.get("statement")
            if not statement or not isinstance(statement, str):
                continue

            cleaned.append({
                "statement": normalise(statement),
                "topic": topic,
                "sentiment": sentiment,
            })

        return cleaned or None

    cleaned_statements = []
    for index, raw_cell in enumerate(data_df["statements"].tolist(), start=1):
        cleaned_statements.append(clean_cell(raw_cell))
        if index % 10_000 == 0:
            print("10000 samples processed...")

    print("Done!")
    # A "fail" is any row left without a single valid triplet, whether the cell
    # could not be parsed or every triplet was filtered out.
    n_none = cleaned_statements.count(None)
    n_rows = len(data_df)
    print("Number of fails:", n_none)
    print("% of fails:", n_none / n_rows if n_rows else 0.0)

    # `assign` returns a new frame, so the caller's `data_df` keeps its raw
    # statements (pd.DataFrame(data_df) followed by column assignment can write
    # through to the original frame).
    new_data_df = data_df.assign(statements=cleaned_statements)

    columns = ["user_id", "item_id", "timestamp", "rating", "statements"]
    return new_data_df.dropna(subset=columns)

In [28]:
# Clean the raw statements. `final_assignation` (mis-labelled topic -> canonical
# topic) is passed as the `corrected_topics` mapping.
new_data_df = process_dataset(data_df, topics, final_assignation, sentiments)
print(new_data_df.head())

10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
10000 samples processed...
Done!
Number of fails: 3903
% of fails: 0.01400546151996756
  helpful   reviewTime  rating   timestamp  \
0  [0, 0]  02 12, 2011     5.0  1297468800   
1  [0, 0]  01 19, 2013     5.0  1358553600   
2  [0, 0]   01 4, 2013     5.0  1357257600   
3  [0, 0]  04 27, 2014     

In [29]:
# Rebind `statement_triplets` (first set from the raw `data_df` near the top of
# the notebook) to the cleaned per-interaction triplet lists.
statement_triplets = new_data_df["statements"].to_list()
len(statement_triplets)

274774

In [None]:
# Re-run `process_triplets` (defined earlier in the notebook, not shown in this
# section) on the cleaned statements. This rebinds `correct_triplets` and
# `wrong_topics`, which the merge cell above was using.
correct_triplets, wrong_topics, wrong_sentiments, wrong_topics_and_sentiments = process_triplets(
    statement_triplets, topics, sentiments
)

In [None]:
# Sizes of the four collections returned by `process_triplets` for the cleaned data.
len(correct_triplets), len(wrong_topics), len(wrong_sentiments), len(wrong_topics_and_sentiments)

In [None]:
# Statistics table for the cleaned triplets; it is the input of the bar chart
# drawn by plot_topic_sentiment_bars further down.
final_correct_triplets_df_stats = build_topic_stats_df(correct_triplets)
print(final_correct_triplets_df_stats)

In [None]:
def plot_topic_sentiment_bars(
    df: pd.DataFrame,
    save_pdf_path: str | None = None,
    figsize=(11, 6.5)
):
    """Draw grouped horizontal bars of statement counts per topic and sentiment.

    For every row (topic) of ``df`` six bars are drawn: unique and total counts
    for the negative, neutral and positive sentiments. Columns are looked up
    either in a two-level ``(sentiment, kind)`` MultiIndex or in flat column
    names containing both words; a series that cannot be found is plotted as
    zeros. "Total" summary columns are dropped before plotting.

    Note that the global ``mpl.rcParams`` are modified, so figures created
    afterwards inherit this styling as well.

    Parameters
    ----------
    df : pd.DataFrame
        Statistics table indexed by topic.
    save_pdf_path : str or None
        When given, the figure is also written to this path.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    """
    mpl.rcParams.update({
        'axes.prop_cycle': plt.cycler(color=[
            "#8da0cb", "#fc8d62", "#66c2a5", "#e78ac3", "#a6d854"
        ]),
        'grid.linestyle': ":",
        'grid.linewidth': 0.6,
        'grid.alpha': 0.7,
        'font.family': 'serif',
        'font.size': 12,
        'legend.fontsize': 10,
        'legend.frameon': False,
        'axes.spines.top': False,
        'axes.spines.right': False,
    })

    def pick_column(frame, sent, kind):
        """Return the (sentiment, kind) series of `frame`, or zeros when absent."""
        columns = frame.columns
        if isinstance(columns, pd.MultiIndex):
            if (sent, kind) in columns:
                return frame[(sent, kind)]
            # Fall back to a case-insensitive match on both levels.
            match = next(
                ((a, b) for a, b in columns
                 if str(a).lower() == sent and str(b).lower() == kind),
                None,
            )
            if match is not None:
                return frame[match]
        else:
            exact_names = (
                f"{sent} {kind}",
                f"{sent}_{kind}",
                f"{sent}.{kind}",
                f"{sent.capitalize()} {kind}",
                f"{sent.capitalize()}_{kind}",
            )
            for name in exact_names:
                if name in columns:
                    return frame[name]
            # Last resort: any column mentioning both words.
            for name in columns:
                lowered = str(name).lower()
                if sent in lowered and kind in lowered:
                    return frame[name]
        return pd.Series(0, index=frame.index, dtype=float)

    def without_totals(frame):
        """Drop the summary "total" column group / "total ..." columns."""
        if isinstance(frame.columns, pd.MultiIndex):
            kept = [c for c in frame.columns if str(c[0]).lower() != "total"]
            return frame.loc[:, kept]
        return frame[[c for c in frame.columns if not str(c).lower().startswith("total ")]]

    # Reverse-alphabetical index order, as barh draws the first row at the bottom.
    df = without_totals(df).copy().sort_index(ascending=False)

    NEG = "#d73027"
    NEU = "#377eb8"
    POS = "#1a9850"

    # (colour, opacity, legend label, values) in drawing order; the "unique"
    # bars use the lighter shade of their sentiment colour.
    bars = [
        (NEG, 0.55, "Negative · Unique", pick_column(df, "negative", "unique").astype(float)),
        (NEG, 1.00, "Negative · Total", pick_column(df, "negative", "total").astype(float)),
        (NEU, 0.55, "Neutral · Unique", pick_column(df, "neutral", "unique").astype(float)),
        (NEU, 1.00, "Neutral · Total", pick_column(df, "neutral", "total").astype(float)),
        (POS, 0.55, "Positive · Unique", pick_column(df, "positive", "unique").astype(float)),
        (POS, 1.00, "Positive · Total", pick_column(df, "positive", "total").astype(float)),
    ]

    fig, ax = plt.subplots(figsize=figsize)

    topic_labels = df.index.to_list()
    y = np.arange(len(topic_labels))

    group_height = 0.78
    bar_h = group_height / len(bars)

    first_bar = y - group_height / 2 + 0.05
    for position, (colour, opacity, label, values) in enumerate(bars):
        ax.barh(
            first_bar + position * bar_h,
            values.values,
            height=bar_h * 0.92,
            label=label,
            color=colour,
            alpha=opacity,
            edgecolor="none",
        )

    ax.set_yticks(y)
    ax.set_yticklabels(topic_labels)
    ax.set_xlabel("Count of statements")

    ax.grid(axis="x", which="both")
    ax.set_axisbelow(True)

    # Legend order: all "Unique" entries first, then all "Total" entries,
    # keeping only the first handle of any repeated label.
    handles, labels = ax.get_legend_handles_labels()
    ordered_handles = handles[0::2] + handles[1::2]
    ordered_labels = labels[0::2] + labels[1::2]
    legend_entries = {}
    for handle, label in zip(ordered_handles, ordered_labels):
        legend_entries.setdefault(label, handle)
    ax.legend(
        list(legend_entries.values()),
        list(legend_entries.keys()),
        ncols=2,
        loc="lower right",
        bbox_to_anchor=(1.0, 0.0),
        frameon=True,
    )

    plt.tight_layout()

    if save_pdf_path:
        plt.savefig(save_pdf_path, dpi=300, bbox_inches="tight")
    plt.show()


In [None]:
# Make sure the output directory exists, then draw the chart and save it as a
# PDF whose name is derived from `dataset_name`.
os.makedirs("plots", exist_ok=True)
plot_topic_sentiment_bars(
    final_correct_triplets_df_stats, 
    os.path.join("plots", dataset_name + "_topic_sentiment_statement_distribution.pdf")
)

In [None]:
# Persist the cleaned interactions.
# NOTE(review): the last cell of this notebook writes `final_df` to this same
# path, so on a full run this intermediate file is overwritten.
new_data_df.to_csv(os.path.join(dataset_dir, "processed_dataset.csv"))

In [None]:
# Flatten the nested topic -> sentiment -> {statement: frequency} mapping into
# four parallel lists. They stay module-level on purpose: `all_topics` is
# reused by the UMAP scatter-plot cell further down.
all_statements, all_topics, all_sentiments, all_freq = [], [], [], []

for topic, topic_data in correct_triplets.items():
    for sent, sent_data in topic_data.items():
        first_in_group = True
        for statement, freq in sent_data.items():
            all_statements.append(statement)
            all_topics.append(topic)
            all_sentiments.append(sent)
            all_freq.append(freq)
            if first_in_group:
                # Print one example per (topic, sentiment) group as a sanity check.
                print((statement, topic, sent), freq)
                first_in_group = False

# One row per distinct (statement, topic, sentiment) triplet; the row position
# is later used by prepare_dataset as the statement id.
statement_topic_sentiment_freq_df = pd.DataFrame(
    dict(
        statement=all_statements,
        topic=all_topics,
        sentiment=all_sentiments,
        frequency=all_freq,
    )
)

print(len(statement_topic_sentiment_freq_df))

In [None]:
# Eyeball ten random rows; no random_state is given, so the sample differs between runs.
print(statement_topic_sentiment_freq_df.sample(n=10))

In [None]:
# Persist the triplet table. With to_csv's defaults the row index is written as
# the first column; that index is the statement id assigned by prepare_dataset.
statement_topic_sentiment_freq_df.to_csv(os.path.join(dataset_dir, "statement_topic_sentiment_freq.csv"))

In [None]:
# Directory holding the all-MiniLM-L6-v2 sentence embeddings of the statements.
embeddings_dir = os.path.join(dataset_dir, "vectors", "all-MiniLM-L6-v2")

# NOTE(review): the embedding + UMAP computation below is commented out, so the
# next cell depends on `embeddings_umap.pt` already existing on disk.
# Presumably it was produced by an earlier run of this code; confirm that it
# still matches the current content and order of `all_statements`.

# from sentence_transformers import SentenceTransformer
# import umap

# model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = model.encode(all_statements, normalize_embeddings=True)
# embeddings = torch.from_numpy(embeddings)
# os.makedirs(embeddings_dir, exist_ok=True)
# torch.save(embeddings, os.path.join(embeddings_dir, "embeddings.pt"))

# reducer = umap.UMAP(random_state=42, metric="cosine", n_components=2)
# embeddings_umap = reducer.fit_transform(embeddings.numpy())
# embeddings_umap = torch.from_numpy(embeddings_umap)
# torch.save(embeddings_umap, os.path.join(embeddings_dir, "embeddings_umap.pt"))

In [None]:
# Load the cached 2-D UMAP projection as a NumPy array.
# NOTE(review): plot_statements_2d indexes these points with a mask built from
# `all_topics`, so the number of rows is expected to equal len(all_topics).
embeddings_umap = torch.load(os.path.join(embeddings_dir, "embeddings_umap.pt")).numpy()
embeddings_umap.shape

In [None]:
def plot_statements_2d(
    points,
    topics,
    save_pdf_path=None,
    figsize=(8.5, 6.0),
    point_size=5.0,
    alpha= 0.55,
    title=None
):
    """Scatter-plot 2-D points coloured by their topic.

    Parameters
    ----------
    points : array-like
        Indexed as ``points[mask, 0]`` / ``points[mask, 1]``, i.e. one row per
        statement with the two plot coordinates in the first two columns.
    topics : iterable
        Topic label of every row of ``points``, in the same order.
    save_pdf_path : str or None
        When given, the figure is also saved to this path as a PDF.
    figsize : tuple
        Figure size handed to ``plt.subplots``.
    point_size : float
        Marker size (``s`` of ``Axes.scatter``).
    alpha : float
        Marker opacity.
    title : str or None
        Optional axes title.

    Raises
    ------
    ValueError
        If ``topics`` and ``points`` do not have the same length.
    """
    topics = list(topics)
    if len(topics) != len(points):
        # Fail early with a clear message, e.g. when cached embeddings on disk
        # no longer match the current statement list.
        raise ValueError(
            f"points has {len(points)} rows but {len(topics)} topic labels were given"
        )

    unique_topics = sorted(set(topics))
    n_topics = len(unique_topics)

    # plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    if n_topics <= 10:
        base_cmap = plt.get_cmap("tab10")
        palette = [to_hex(base_cmap(i)) for i in range(n_topics)]
    else:
        ext = plt.get_cmap("tab20")
        palette = [to_hex(ext(i % ext.N)) for i in range(n_topics)]

    color_map = {topic: palette[i] for i, topic in enumerate(unique_topics)}

    fig, ax = plt.subplots(figsize=figsize)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(True, linestyle=":", linewidth=0.6, alpha=0.6)
    ax.set_axisbelow(True)

    # Built once so every per-topic mask is a single vectorised comparison
    # instead of a Python loop over all labels for each topic.
    topic_array = np.array(topics, dtype=object)

    handles = []
    labels = []
    for topic in unique_topics:
        mask = topic_array == topic
        h = ax.scatter(
            points[mask, 0],
            points[mask, 1],
            s=point_size,
            c=color_map[topic],
            alpha=alpha,
            linewidths=0.0,
            label=str(topic),
        )
        handles.append(h)
        labels.append(str(topic))

    ax.set_xlabel("UMAP-1")
    ax.set_ylabel("UMAP-2")
    if title:
        ax.set_title(title)
    ax.legend(
        handles,
        labels,
        loc="best",
        frameon=True,
        # NOTE(review): this gives a single column when there are more than 12
        # topics and two columns otherwise; confirm that is the intended direction.
        ncols=1 if len(unique_topics) > 12 else 2,
        fontsize=9
    )
    ax.set_aspect("equal")

    plt.tight_layout()

    # Save if requested
    if save_pdf_path:
        plt.savefig(save_pdf_path, format="pdf", bbox_inches="tight", dpi=300)

    plt.show()


In [None]:
# The output name is derived from `dataset_name`, like the bar-chart cell does.
# The previous hard-coded "plots/Toys_umap_topics.pdf" stored this dataset's
# figure under another dataset's name.
plot_statements_2d(
    embeddings_umap,
    all_topics,
    save_pdf_path=os.path.join("plots", dataset_name + "_umap_topics.pdf"),
    figsize=(9, 6),
)

In [None]:
def analyze_statements_dataset(df):
    """Print a descriptive statistical report about a statement dataset.

    The report covers general counts, statements per user / item /
    interaction (overall, per sentiment and per topic), sentiment and topic
    distributions, a sentiment x topic cross matrix, a combined summary table,
    the top contributors, topic/sentiment diversity and statement frequency
    and length.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``user_id``, ``item_id`` and ``statements``.
        Only ``statements`` cells that are lists are used; every dict in such a
        list contributes one statement via its ``statement``, ``topic`` and
        ``sentiment`` keys (a missing key becomes ``''``). The frame index is
        used as the interaction id.

    Returns
    -------
    None
        Everything is printed; nothing is returned. If no statement can be
        extracted, a message is printed and the function returns early.
    """
    rule = "=" * 80

    def print_section(title):
        """Print a section banner preceded by a blank line."""
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("STATEMENT DATASET STATISTICAL ANALYSIS")
    print(rule)

    # General statistics
    print("\nGENERAL STATISTICS")
    print("-" * 40)
    print(f"Unique users: {df['user_id'].nunique():,}")
    print(f"Unique items: {df['item_id'].nunique():,}")
    print(f"Total interactions: {len(df):,}")

    # Expand to one row per statement. Iterating the three needed columns
    # directly avoids building a Series per row as iterrows() does.
    expanded_rows = []
    for interaction_id, user_id, item_id, statements in zip(
        df.index, df['user_id'], df['item_id'], df['statements']
    ):
        if not isinstance(statements, list):
            continue
        for stmt_dict in statements:
            if isinstance(stmt_dict, dict):
                expanded_rows.append({
                    'user_id': user_id,
                    'item_id': item_id,
                    'interaction_id': interaction_id,
                    'statement': stmt_dict.get('statement', ''),
                    'topic': stmt_dict.get('topic', ''),
                    'sentiment': stmt_dict.get('sentiment', '')
                })

    expanded_df = pd.DataFrame(expanded_rows)

    if expanded_df.empty:
        print("No valid statements found in the dataset")
        return

    n_unique_statements = expanded_df['statement'].nunique()
    print(f"Total statements: {len(expanded_df):,}")
    print(f"Unique statements: {n_unique_statements:,}")

    sentiments = sorted(expanded_df['sentiment'].unique())
    topics = sorted(expanded_df['topic'].unique())

    print(f"Sentiments: {sentiments}")
    print(f"Topics ({len(topics)}): {topics}")

    def statement_counts(data, group_by, unique):
        """Number of (unique or all) statements in every `group_by` group."""
        grouped = data.groupby(group_by)['statement']
        return grouped.nunique() if unique else grouped.count()

    # 1-3. USER / ITEM / INTERACTION STATISTICS
    def print_entity_statistics(entity, key):
        """Print the statements-per-`entity` table (overall, by sentiment, by topic)."""
        rows = []

        def add_rows(metric, data, group_by, sentiment='All', topic='All'):
            if data.empty:
                return
            for unique in (True, False):
                label = "Unique" if unique else "With frequency"
                counts = statement_counts(data, group_by, unique)
                rows.append({
                    'Metric': f'{metric} ({label})',
                    'Sentiment': sentiment,
                    'Topic': topic,
                    'Average': f"{counts.mean():.2f}",
                    'Maximum': f"{counts.max():.0f}",
                    'Minimum': f"{counts.min():.0f}",
                    'Median': f"{counts.median():.2f}",
                    'Std_Dev': f"{counts.std():.2f}"
                })

        print_section(f"{entity.upper()} STATISTICS")

        # Global (all sentiments)
        add_rows(f'Statements per {entity}', expanded_df, key)
        # By sentiment
        for sentiment in sentiments:
            add_rows(
                f'Statements per {entity}',
                expanded_df[expanded_df['sentiment'] == sentiment],
                key,
                sentiment=sentiment,
            )
        # Per entity/topic pair
        add_rows(f'Statements per {entity}/topic', expanded_df, [key, 'topic'])
        # Per entity, restricted to one topic at a time
        for topic in topics:
            add_rows(
                f'Statements per {entity} for topic "{topic}"',
                expanded_df[expanded_df['topic'] == topic],
                key,
                topic=topic,
            )

        print(pd.DataFrame(rows).to_string(index=False))

    print_entity_statistics('user', 'user_id')
    print_entity_statistics('item', 'item_id')
    print_entity_statistics('interaction', 'interaction_id')

    # 4. SENTIMENT AND TOPIC DISTRIBUTION
    print_section("SENTIMENT AND TOPIC DISTRIBUTION")

    def distribution_table(column, label):
        """Total/unique statement counts and percentages per value of `column`."""
        total_counts = expanded_df[column].value_counts()
        # groupby sorts its keys alphabetically whereas value_counts sorts by
        # frequency: realign the unique counts on the value_counts order so
        # that every number of a row refers to the same sentiment/topic.
        unique_counts = (
            expanded_df.groupby(column)['statement'].nunique()
            .reindex(total_counts.index, fill_value=0)
        )
        return pd.DataFrame({
            label: total_counts.index,
            'Total_Count': total_counts.values,
            'Unique_Count': unique_counts.values,
            'Total_Percentage': total_counts.values / len(expanded_df) * 100,
            'Unique_Percentage': unique_counts.values / n_unique_statements * 100
        })

    print("Sentiment distribution:")
    print(distribution_table('sentiment', 'Sentiment').round(2).to_string(index=False))

    print("\nTopic distribution:")
    print(distribution_table('topic', 'Topic').round(2).to_string(index=False))

    # 5. SENTIMENT x TOPIC CROSS-MATRIX
    print_section("SENTIMENT x TOPIC CROSS-MATRIX")

    # With frequency
    cross_matrix_freq = pd.crosstab(expanded_df['sentiment'], expanded_df['topic'], margins=True)
    print("Matrix with frequency:")
    print(cross_matrix_freq)

    # Unique statements
    cross_matrix_unique = pd.crosstab(
        expanded_df['sentiment'], expanded_df['topic'],
        values=expanded_df['statement'], aggfunc='nunique', margins=True
    )
    print("\nMatrix with unique statements:")
    print(cross_matrix_unique.fillna(0).astype(int))

    # 6. COMPREHENSIVE STATISTICS SUMMARY
    print_section("COMPREHENSIVE STATISTICS SUMMARY")

    summary_stats = []

    def add_stats(group_type, group_cols, sentiment_filter=None, topic_filter=None):
        """Append the unique/frequency summary rows for one grouping."""
        data = expanded_df
        if sentiment_filter:
            data = data[data['sentiment'] == sentiment_filter]
        if topic_filter:
            data = data[data['topic'] == topic_filter]

        if data.empty:
            return

        for unique in (True, False):
            grouped = statement_counts(data, group_cols, unique)
            summary_stats.append({
                'Grouping': group_type,
                'Sentiment': sentiment_filter or 'All',
                'Topic': topic_filter or 'All',
                'Count_Type': "Unique" if unique else "Frequency",
                'Average': round(grouped.mean(), 2),
                'Maximum': int(grouped.max()),
                'Minimum': int(grouped.min()),
                'Median': round(grouped.median(), 2),
                'Std_Dev': round(grouped.std(), 2)
            })

    entity_groupings = [
        ('Per user', 'user_id'),
        ('Per item', 'item_id'),
        ('Per interaction', 'interaction_id'),
    ]

    # Overall, alone and crossed with the topic
    for group_type, key in entity_groupings:
        add_stats(group_type, [key])
        add_stats(f'{group_type}/topic', [key, 'topic'])

    # By sentiment
    for sentiment in sentiments:
        for group_type, key in entity_groupings:
            add_stats(group_type, [key], sentiment_filter=sentiment)

    # By topic
    for topic in topics:
        for group_type, key in entity_groupings:
            add_stats(group_type, [key], topic_filter=topic)

    print("Comprehensive statistics summary:")
    print(pd.DataFrame(summary_stats).to_string(index=False))

    # 7. TOP CONTRIBUTORS
    print_section("TOP CONTRIBUTORS")

    leading_blank = ""
    for plural, key in (('users', 'user_id'), ('items', 'item_id')):
        per_entity = expanded_df.groupby(key)['statement']
        top_freq = per_entity.count().nlargest(5)
        top_unique = per_entity.nunique().nlargest(5)

        print(f"{leading_blank}Top 5 {plural} (by statement frequency):")
        for entity_id, count in top_freq.items():
            print(f"  {entity_id}: {count} statements")

        print(f"\nTop 5 {plural} (by unique statements):")
        for entity_id, count in top_unique.items():
            print(f"  {entity_id}: {count} unique statements")

        # Every block after the first one is separated by an empty line.
        leading_blank = "\n"

    # 8. DIVERSITY STATISTICS
    print_section("DIVERSITY STATISTICS")

    diversity_stats = []
    for entity, key in (('user', 'user_id'), ('item', 'item_id')):
        for facet, column in (('Topics', 'topic'), ('Sentiments', 'sentiment')):
            # Number of distinct topics / sentiments seen for each user / item.
            diversity = expanded_df.groupby(key)[column].nunique()
            diversity_stats.append({
                'Metric': f'{facet} per {entity}',
                'Average': round(diversity.mean(), 2),
                'Maximum': int(diversity.max()),
                'Minimum': int(diversity.min()),
                'Median': round(diversity.median(), 2)
            })

    print("Diversity statistics:")
    print(pd.DataFrame(diversity_stats).to_string(index=False))

    # 9. STATEMENT FREQUENCY ANALYSIS
    print_section("STATEMENT FREQUENCY ANALYSIS")

    # Most frequent statements (truncated to 50 characters for display)
    statement_freq = expanded_df['statement'].value_counts()
    print("Top 10 most frequent statements:")
    for i, (stmt, count) in enumerate(statement_freq.head(10).items(), 1):
        print(f"  {i:2d}. '{stmt[:50]}{'...' if len(stmt) > 50 else ''}' (count: {count})")

    # Statement length statistics (in characters)
    length_stats = expanded_df['statement'].str.len().describe()

    print("\nStatement length statistics:")
    print(f"  Average length: {length_stats['mean']:.1f} characters")
    print(f"  Maximum length: {length_stats['max']:.0f} characters")
    print(f"  Minimum length: {length_stats['min']:.0f} characters")
    print(f"  Median length: {length_stats['50%']:.1f} characters")

    print("\n" + rule)
    print("ANALYSIS COMPLETED")

In [None]:
# Print the full statistical report for the cleaned interactions.
analyze_statements_dataset(new_data_df)

In [None]:
def prepare_dataset(data_df, statement_topic_sentiment_df, topics):
    """Attach integer ids for statements, topics and sentiments to every interaction.

    Parameters
    ----------
    data_df : pd.DataFrame
        Frame whose ``statements`` column holds, per row, a list of dicts with
        the keys ``statement``, ``topic`` and ``sentiment``. It is not modified.
    statement_topic_sentiment_df : pd.DataFrame
        Table with the columns ``statement``, ``topic`` and ``sentiment``; the
        index label of a row is used as the id of that triplet.
    topics : list of str
        Topic names; the position of a topic in this list is its id.

    Returns
    -------
    pd.DataFrame
        A new frame with three extra columns, each holding one list per row,
        aligned with that row's ``statements``: ``statement_ids``,
        ``topic_ids`` and ``sentiments`` (+1 positive, 0 neutral, -1 negative).

    Raises
    ------
    KeyError
        If a triplet is missing from ``statement_topic_sentiment_df`` or its
        sentiment is not positive/neutral/negative.
    ValueError
        If a triplet's topic is not in ``topics``.
    """
    sentiment_map = {"positive": +1, "neutral": 0, "negative": -1}

    # (statement, topic, sentiment) -> index label of the lookup table. The
    # columns are zipped directly, which is much cheaper than iterrows() on a
    # table with one row per distinct triplet.
    lookup_dict = {
        (statement, topic, sentiment): row_label
        for row_label, statement, topic, sentiment in zip(
            statement_topic_sentiment_df.index,
            statement_topic_sentiment_df["statement"],
            statement_topic_sentiment_df["topic"],
            statement_topic_sentiment_df["sentiment"],
        )
    }

    statement_idxs = []
    topic_idxs = []
    sentiment_idxs = []

    for index, triplet_list in enumerate(data_df["statements"].tolist(), start=1):
        interaction_statement_idxs = []
        interaction_topic_idxs = []
        interaction_sentiment_idxs = []

        for triplet in triplet_list:
            topic = triplet.get("topic")
            sentiment = triplet.get("sentiment")
            statement = triplet.get("statement")

            interaction_topic_idxs.append(topics.index(topic))
            interaction_sentiment_idxs.append(sentiment_map[sentiment])
            interaction_statement_idxs.append(lookup_dict[(statement, topic, sentiment)])

        statement_idxs.append(interaction_statement_idxs)
        topic_idxs.append(interaction_topic_idxs)
        sentiment_idxs.append(interaction_sentiment_idxs)

        if index % 10_000 == 0:
            print("10000 samples processed...")

    print("Done!")

    # `assign` builds a new frame, so the caller's `data_df` is left untouched
    # (pd.DataFrame(data_df) followed by column assignment can write through to
    # the original frame).
    return data_df.assign(
        statement_ids=statement_idxs,
        topic_ids=topic_idxs,
        sentiments=sentiment_idxs,
    )

In [None]:
# Freeze the topic order alphabetically (in place) before it is saved:
# prepare_dataset derives every topic id from the position in this list.
topics.sort()
print(topics)
topics_json_path = os.path.join(dataset_dir, "topics.json")
with open(topics_json_path, "w") as topics_file:
    json.dump(topics, topics_file)

In [None]:
# Attach statement/topic/sentiment ids, using the row index of the triplet
# table as statement id and the sorted `topics` list for topic ids.
final_df = prepare_dataset(new_data_df, statement_topic_sentiment_freq_df, topics)

In [None]:
# Quick visual check of the first ten prepared interactions.
final_df.head(n=10)

In [None]:
# Persist the final dataset. This is the same path the cleaned `new_data_df`
# was written to earlier, so that intermediate file is replaced here.
final_df.to_csv(os.path.join(dataset_dir, "processed_dataset.csv"))