In [1]:
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet
from datasets import Dataset
from getpass import getpass
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import nltk
import os

In [2]:
# Read the Hugging Face access token interactively (getpass does not echo the
# input) and export it under HUGGINGFACEHUB_API_TOKEN.  The upload cells at the
# end of the notebook also pass access_token explicitly.
access_token = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = access_token

In [3]:
# Load the sentence-embedding model once.  ST is a notebook-wide global: both
# the synonym similarity filter and embed() read it.
ST = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# UDA (Unsupervised Data Augmentation)

In [4]:
# Download the NLTK data packages needed for the nltk.corpus.wordnet lookups
# below.  Packages that are already present are reported as up to date.
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /Users/cookie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cookie/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Hand-curated synonyms that are merged with whatever WordNet returns for a
# token.  Keys are matched case-sensitively against the exact token.  Built
# once here instead of on every call.
# NOTE(review): "hammok" looks like a typo but presumably mirrors the spelling
# used in the category codes — confirm before changing it.
_FALLBACK_SYNONYMS = {
    "construction": ["building", "structure", "development"],
    "tools": ["equipment", "gear"],
    "light": ["illumination", "lighting", "brightness"],
    "welding": ["fusing", "melting", "fusion"],
    "pump": ["compressor", "dispenser", "motor"],
    "generator": ["alternator", "engine", "dynamo"],
    "saw": ["cutter", "blade", "handsaw"],
    "drill": ["borer", "perforator", "auger"],
    "painting": ["decorating", "coating", "artwork"],
    "screw": ["fastener", "bolt", "pin"],
    "soldering": ["brazing", "tinning", "sealing"],
    "components": ["parts", "modules", "units"],
    "faucet": ["tap", "valve", "spigot"],
    "appliances": ["devices", "gadgets", "machines"],
    "personal": ["individual", "private", "self-care"],
    "massager": ["relaxer", "kneader", "stimulator"],
    "hair_cutter": ["trimmer", "clipper", "razor"],
    "scales": ["weighing_machine", "balance", "measuring_tool"],
    "environment": ["surroundings", "conditions", "ecosystem"],
    "vacuum": ["cleaner", "suction", "hoover"],
    "air_heater": ["warmer", "furnace", "radiator"],
    "air_conditioner": ["cooler", "AC"],
    "climate": ["weather", "atmosphere", "temperature"],
    "water_heater": ["geyser", "boiler"],
    "fan": ["ventilator", "propeller"],
    "kitchen": ["cooking", "culinary"],
    "washer": ["laundry_machine", "washing_machine"],
    "refrigerators": ["freezers", "chillers"],
    "oven": ["stove", "baking_device"],
    "kettle": ["teapot"],
    "toaster": ["browner", "griller", "toasting_device"],
    "grill": ["barbecue", "broiler"],
    "hood": ["vent", "extractor_hood"],
    "mixer": ["beater"],
    "juicer": ["squeezer", "press"],
    "blender": ["liquidizer"],
    "dishwasher": ["dish_cleaner", "cleaning_machine"],
    "steam_cooker": ["steamer", "pressure_cooker"],
    "meat_grinder": ["mincer", "chopper"],
    "coffee_grinder": ["mill", "bean_grinder"],
    "hob": ["cooktop", "range"],
    "coffee_machine": ["espresso_machine", "brewer", "coffee_maker"],
    "fryer": ["deep_fryer", "roaster"],
    "microwave": ["heater"],
    "notebook": ["laptop", "netbook", "ultrabook"],
    "power_supply": ["PSU", "adapter", "charger"],
    "memory": ["RAM", "storage", "cache"],
    "cooler": ["chiller", "fan"],
    "motherboard": ["mainboard", "system_board", "logic_board"],
    "videocards": ["GPUs", "graphics_cards", "video_adapters"],
    "hdd": ["hard_drive", "storage_device", "disk"],
    "cpu": ["chip", "microprocessor"],
    "sound_card": ["audio_adapter", "DAC", "audio_processor"],
    "peripherals": ["add-ons"],
    "printer": ["plotter", "copier", "scanner"],
    "mouse": ["pointer", "trackpad"],
    "monitor": ["screen", "display", "visualizer"],
    "keyboard": ["keypad", "typewriter"],
    "electronics": ["technology"],
    "camera": ["photo_camera", "video_camera", "recorder"],
    "tv": ["television"],
    "projector": ["beamer"],
    "audio": ["sound", "stereo", "music"],
    "headphone": ["earphones", "headset"],
    "music_tools": ["instruments", "musical_devices"],
    "subwoofer": ["woofer", "bass_system"],
    "microphone": ["mic", "audio_device"],
    "shoes": ["footwear", "boots"],
    "sandals": ["flip_flops", "open_shoes"],
    "espadrilles": ["casual_shoes", "canvas_shoes"],
    "ballet_shoes": ["flats", "dancing_shoes", "ballet_slippers"],
    "furniture": ["fixtures", "home_decor", "furnishings"],
    "living_room": ["lounge", "sitting-room", "parlor"],
    "chair": ["armchair"],
    "cabinet": ["cupboard", "wardrobe", "storage_unit"],
    "sofa": ["couch", "settee", "lounger"],
    "shelving": ["racks", "shelves"],
    "bedroom": ["sleeping_area"],
    "bed": ["cot", "mattress", "bunk"],
    "pillow": ["cushion", "headrest", "bolster"],
    "blanket": ["quilt", "duvet", "cover"],
    "table": ["desk", "counter"],
    "bathroom": ["restroom", "washroom", "shower-room"],
    "bath": ["bathtub", "soak", "tub"],
    "toilet": ["lavatory", "commode"],
    "auto": ["automobile", "vehicle"],
    "compressor": ["air_compressor"],
    "radar": ["detector", "sensor"],
    "videoregister": ["dash-cam", "Car-DVR"],
    "anti_freeze": ["coolant", "defroster"],
    "winch": ["hoist", "pulley", "crane"],
    "parktronic": ["parking_sensor", "assistant"],
    "alarm": ["alert", "warning", "buzzer"],
    "player": ["media_player", "entertainer"],
    "kids": ["children", "youth", "infants"],
    "diapers": ["nappies", "pants", "huggies"],
    "medicine": ["healthcare", "pharmacy", "treatment"],
    "tonometer": ["pressure_meter", "gauge"],
    "country_yard": ["garden", "backyard", "farm"],
    "hammok": ["swing", "hammock"],
    "bench": ["long_chair"],
}

# A candidate is kept only if its cosine similarity to the original word is
# strictly above this value.
_SYNONYM_SIMILARITY_THRESHOLD = 0.7

# word -> tuple of accepted synonyms.  The same few tokens recur across all
# rows, so each one is scored by the embedding model only once.  Re-running
# this cell resets the cache (do so after changing ST or the fallback table).
_SYNONYM_CACHE = {}


def get_relevant_synonyms(word):
    """Return synonyms of ``word`` that the embedding model rates as close to it.

    Candidates come from two sources that are always merged: single-word
    WordNet lemmas (excluding ``word`` itself, compared case-insensitively)
    and the hand-curated ``_FALLBACK_SYNONYMS`` entry for ``word``, if any.
    Each candidate is then scored against ``word`` with the global ``ST``
    model, and only those above ``_SYNONYM_SIMILARITY_THRESHOLD`` are kept.

    Args:
        word: A single token, e.g. one dot-separated part of a category code.

    Returns:
        A new list of accepted synonyms in alphabetical order (so the result
        does not depend on set iteration order); empty when there are no
        candidates or none passes the similarity threshold.
    """
    cached = _SYNONYM_CACHE.get(word)
    if cached is not None:
        # Hand out a copy so callers cannot mutate the cached result.
        return list(cached)

    candidates = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # WordNet joins multi-word lemmas with "_"; those are skipped.
            synonym = lemma.name().replace("_", " ")
            if " " not in synonym and synonym.lower() != word.lower():
                candidates.add(synonym)

    # The curated entries are merged unconditionally, not only when WordNet
    # returns nothing.
    candidates.update(_FALLBACK_SYNONYMS.get(word, ()))

    if not candidates:
        _SYNONYM_CACHE[word] = ()
        return []

    # Sort for a stable order, then score every candidate in one batched
    # encode call instead of one model call per synonym.
    ordered = sorted(candidates)
    word_embedding = ST.encode(word, convert_to_tensor=True)
    candidate_embeddings = ST.encode(ordered, convert_to_tensor=True)
    scores = util.cos_sim(word_embedding, candidate_embeddings)[0].tolist()

    relevant = tuple(
        synonym
        for synonym, score in zip(ordered, scores)
        if score > _SYNONYM_SIMILARITY_THRESHOLD
    )
    _SYNONYM_CACHE[word] = relevant
    return list(relevant)

In [6]:
def augment_category_code(x):
    """Build an augmented copy of a space-separated category string.

    Every whitespace-delimited token is looked up with
    ``get_relevant_synonyms``.  When at least one synonym comes back, one of
    them is picked at random to stand in for the token; otherwise the token is
    kept unchanged.  The tokens are re-joined with single spaces.
    """
    def _swap(token):
        # One lookup per token; only draw from the RNG when there is a choice.
        options = get_relevant_synonyms(token)
        return random.choice(options) if options else token

    return " ".join(_swap(token) for token in x.split())

In [7]:
# Location of the input CSV.  Set CATEGORY_CSV_PATH to run the notebook on
# another machine without editing this cell; the default is the original
# local path.
category_csv_path = os.environ.get(
    'CATEGORY_CSV_PATH',
    '/Users/cookie/Desktop/Final_run/2019-Oct.clean.csv',
)
# Rows without a category_code have nothing to augment or embed, so drop them.
category_df = pd.read_csv(category_csv_path).dropna(subset=['category_code'])

In [8]:
# Category codes are dot-separated; turn every dot into a space so that each
# part becomes its own whitespace-delimited token.
category_str = [code.replace('.', ' ') for code in category_df['category_code']]

In [9]:
# Start the dataset from the user_id column only.  preserve_index=False keeps
# a non-default pandas index (which dropna leaves behind whenever it actually
# removes rows) from being stored as an extra "__index_level_0__" column.
category_dataset = Dataset.from_pandas(category_df[['user_id']], preserve_index=False)

In [10]:
# Attach the space-separated category strings next to user_id.  category_str
# was built from the same category_df rows, one entry per row.
category_dataset = category_dataset.add_column('category_code', category_str)
category_dataset  # display the resulting features and row count

Dataset({
    features: ['user_id', 'category_code'],
    num_rows: 1235606
})

In [11]:
# augment_category_code draws synonyms with the `random` module, so seed it
# here: re-running this cell then repeats the same sequence of random draws.
random.seed(42)
augmented_col = [
    augment_category_code(code)
    for code in tqdm(category_dataset['category_code'])
]

100%|██████████| 1235606/1235606 [00:14<00:00, 85181.43it/s]


In [12]:
# Store the augmented strings alongside the originals; augmented_col holds one
# entry per row of category_dataset, in the same order.
category_dataset = category_dataset.add_column('augmented_category_code', augmented_col)
category_dataset  # display the resulting features and row count

Dataset({
    features: ['user_id', 'category_code', 'augmented_category_code'],
    num_rows: 1235606
})

# Embedding

In [13]:
def embed(batch, col_name, device='mps'):
    """Encode one text column of a batch with the global ``ST`` model.

    Written to be used with ``Dataset.map(..., batched=True)``, with
    ``col_name`` (and optionally ``device``) supplied through ``fn_kwargs``.

    Args:
        batch: Mapping from column name to that column's values for the batch.
        col_name: Name of the text column to encode.
        device: Device string forwarded to ``ST.encode``.  Defaults to
            ``'mps'``, the value this function used before the parameter
            existed; pass another device (or ``None`` to use the device the
            model is already on) when not running on Apple silicon.

    Returns:
        A dict with the single key ``"embedding"`` holding the encoder output
        for ``batch[col_name]``.
    """
    return {"embedding": ST.encode(batch[col_name], device=device, show_progress_bar=True)}

In [14]:
# Embed the original category strings.  Each map call receives roughly a
# tenth of the rows, and embed() shows an encoder progress bar for each call.
product_dataset = category_dataset.map(
    embed,
    batched=True,
    batch_size=round(category_dataset.num_rows / 10),
    fn_kwargs={'col_name': 'category_code'},
)

Map:   0%|          | 0/1235606 [00:00<?, ? examples/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

In [15]:
# Embed the augmented category strings with the same batching as the
# originals (roughly a tenth of the rows per map call).
augmented_product_dataset = category_dataset.map(
    embed,
    batched=True,
    batch_size=round(category_dataset.num_rows / 10),
    fn_kwargs={'col_name': 'augmented_category_code'},
)

Map:   0%|          | 0/1235606 [00:00<?, ? examples/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

Batches:   0%|          | 0/3862 [00:00<?, ?it/s]

In [16]:
# Both mapped datasets were derived from category_dataset and so carry both
# text columns.  Keep only the original text next to its embedding here.
product_dataset = product_dataset.remove_columns('augmented_category_code')
product_dataset  # display the remaining features

Dataset({
    features: ['user_id', 'category_code', 'embedding'],
    num_rows: 1235606
})

In [17]:
# Counterpart of the previous cell: keep only the augmented text next to the
# embedding computed from it.
augmented_product_dataset = augmented_product_dataset.remove_columns('category_code')
augmented_product_dataset  # display the remaining features

Dataset({
    features: ['user_id', 'augmented_category_code', 'embedding'],
    num_rows: 1235606
})

# Calculate Average Embedding

In [18]:
def average(data):
    """Reduce a frame's embedding vectors to their element-wise mean.

    Args:
        data: DataFrame (for example one ``groupby`` group) with an
            ``embedding`` column whose cells are equal-length numeric vectors.

    Returns:
        A one-entry ``pd.Series`` whose ``average_embedding`` value is the mean
        vector taken across the rows of ``data``.
    """
    # Stack the per-row vectors into a (rows, dim) matrix and average over rows.
    stacked = np.stack(data['embedding'].tolist())
    return pd.Series({'average_embedding': stacked.mean(axis=0)})

In [19]:
# Convert both embedded datasets to pandas for the per-user groupby that
# follows.
ori_df = product_dataset.to_pandas()
aug_df = augmented_product_dataset.to_pandas()

In [20]:
# Mean embedding per user for the original text.  Selecting ['embedding']
# explicitly means apply() is not handed the grouping column, which recent
# pandas versions warn about; average() only reads 'embedding' anyway.
ori_embedding_result = (
    ori_df.groupby('user_id')[['embedding']]
    .apply(average)
)

  .apply(average)


In [21]:
# Mean embedding per user for the augmented text.  Selecting ['embedding']
# explicitly means apply() is not handed the grouping column, which recent
# pandas versions warn about; average() only reads 'embedding' anyway.
aug_embedding_result = (
    aug_df.groupby('user_id')[['embedding']]
    .apply(average)
)

  .apply(average)


In [22]:
# Plain-text preview of the per-user averages: the frame is indexed by user_id
# and has the single column average_embedding.
print(ori_embedding_result)

                                           average_embedding
user_id                                                     
260013793  [-0.033347193, 0.025289588, -0.011747345, 0.02...
277655927  [-0.045145404, 0.065376855, -0.026705384, 0.00...
315720851  [0.024227075, 0.0050282013, -0.00562369, -0.08...
340041246  [-0.034396928, 0.016468745, 0.0050060693, -0.0...
364727284  [-0.034195606, 0.019063387, -0.03365945, 0.008...
...                                                      ...
566265448  [-0.06395154, 0.042917244, -0.01117276, -0.051...
566270060  [-0.039874908, 0.052562904, -0.0016835995, -0....
566275254  [-0.079045564, 0.04183333, 0.008014798, 0.0041...
566276537  [-0.05967588, 0.04580653, -0.03182595, -0.0767...
566277436  [-0.02906748, 0.026667679, -0.057768565, 0.002...

[112846 rows x 1 columns]


In [23]:
# Convert the per-user averages back into Hugging Face datasets for upload.
# NOTE(review): the frames are indexed by user_id; the index is presumably
# carried into the dataset as a column — confirm on the uploaded revision.
original_embedding = Dataset.from_pandas(ori_embedding_result)
augmented_embedding = Dataset.from_pandas(aug_embedding_result)

# Save Result

In [24]:
# Upload the per-row embeddings of the original category text to their own
# revision of the Hub dataset repository.
print('[*] Uploading Product Information:')
product_dataset.push_to_hub(
    "CookieLyu/Category_Codes",
    token=access_token,
    revision="1000k_embedded",
)

[*] Uploading Product Information:


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/406 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CookieLyu/Category_Codes/commit/b4ded3a883c761d923d2d33d7bd8f46d2517e373', commit_message='Upload dataset', commit_description='', oid='b4ded3a883c761d923d2d33d7bd8f46d2517e373', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CookieLyu/Category_Codes', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CookieLyu/Category_Codes'), pr_revision=None, pr_num=None)

In [25]:
# Upload the per-row embeddings of the augmented category text.  The progress
# message now names this upload instead of repeating the original-text label.
print('[*] Uploading Augmented Product Information:')
augmented_product_dataset.push_to_hub(
    "CookieLyu/Category_Codes",
    revision="1000k_embedded_aug",
    token=access_token
)

[*] Uploading Product Information:


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/309 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/417 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CookieLyu/Category_Codes/commit/986f22b2357526883e0a5ed54841af0d43eb483e', commit_message='Upload dataset', commit_description='', oid='986f22b2357526883e0a5ed54841af0d43eb483e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CookieLyu/Category_Codes', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CookieLyu/Category_Codes'), pr_revision=None, pr_num=None)

In [26]:
# Upload the per-user average embeddings of the original category text.  The
# progress message now names this upload instead of the per-row product label.
print('[*] Uploading Average Embeddings:')
original_embedding.push_to_hub(
    "CookieLyu/Category_Codes",
    revision="1000k_average_embedded",
    token=access_token
)

[*] Uploading Product Information:


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/113 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/367 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CookieLyu/Category_Codes/commit/4a823bd1afe6caf1d46007a1f639d30deaa3c7a2', commit_message='Upload dataset', commit_description='', oid='4a823bd1afe6caf1d46007a1f639d30deaa3c7a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CookieLyu/Category_Codes', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CookieLyu/Category_Codes'), pr_revision=None, pr_num=None)

In [27]:
# Upload the per-user average embeddings of the augmented category text.  The
# progress message now names this upload instead of the per-row product label.
print('[*] Uploading Augmented Average Embeddings:')
augmented_embedding.push_to_hub(
    "CookieLyu/Category_Codes",
    revision="1000k_average_embedded_aug",
    token=access_token
)

[*] Uploading Product Information:


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/113 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/367 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CookieLyu/Category_Codes/commit/1110ff27dcdaeef98264dddee6f6ab9553d9577d', commit_message='Upload dataset', commit_description='', oid='1110ff27dcdaeef98264dddee6f6ab9553d9577d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CookieLyu/Category_Codes', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CookieLyu/Category_Codes'), pr_revision=None, pr_num=None)