In [33]:
from datasets import load_dataset, Dataset
import re
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score

In [55]:
# Read the local JSON Lines file of product records as a single split.
all_product_dataset = load_dataset(
    "json",
    data_files="amz_products_small.jsonl",
    split="train",
)
# Working subset: shuffle with a fixed seed, then keep the first 10,000 shuffled rows.
product_dataset = (
    all_product_dataset
    .shuffle(seed=42)
    .select(range(10000))
)

In [54]:
# Peek at one raw record (row 50000 of the full, unshuffled dataset) to see which fields it carries.
print(all_product_dataset[50000])

{'also_buy': [], 'also_view': [], 'asin': 'B00DBHAE5A', 'brand': 'Omer', 'category': ['Automotive', 'Replacement Parts', 'Engines & Engine Parts', 'Engine Parts', 'Engine Mounts'], 'description': ['Now that rock crawling has become the favorite past time for Jeepers, and we are putting our Jeeps in places that they were never designed to go, the stresses on the stock motor mounts are tremendous. With deep gearing, big tires, lockers, and mega torque, the stock mounts can be torn apart in no time at all. That is where these BombProofTM Motor Mounts come into play. These are the ultimate motor mounts that you can install in your Jeep vehicle. They will create a cross- member with the engine, improve clutch linkage operation and beef up your Jeep for serious wheelin. They bolt into stock holes in the frame and engine, and in most applications, no modifications are needed. Black polyurethane bushings insulate from steel to steel contact. All BombProofTM Motor Mounts are bare steel or zinc 

In [56]:
# Step 2: Remove unnecessary columns
# Metadata fields that are dropped; the later steps only read `description` and `main_cat`.
columns_to_remove = [
    'title', 'feature', 'brand',
    'also_buy', 'also_view', 'asin',
    'category', 'image', 'price',
]
product_dataset = product_dataset.remove_columns(columns_to_remove)

# Step 3: Flatten lists in the dataset
def flatten_lists(example):
    """Collapse every list-valued field of ``example`` into one space-joined string.

    Fields whose value is not a list are left untouched. The mapping is updated
    in place and the same object is returned.
    """
    joined_fields = {
        field: ' '.join(items)
        for field, items in example.items()
        if isinstance(items, list)
    }
    example.update(joined_fields)
    return example

# Apply flatten_lists to every example, one at a time, and rebind the working dataset to the result.
product_dataset = product_dataset.map(flatten_lists)

# Step 4: Lowercase text columns
def lowercase_text(example):
    """Lower-case the text columns (currently only ``description``) of ``example``.

    The mapping is modified in place and returned.
    """
    for column_name in ('description',):
        original_text = example[column_name]
        example[column_name] = original_text.lower()
    return example

# Lower-case the description of every example.
product_dataset = product_dataset.map(lowercase_text)

# Step 5: Clean HTML tags from description
def clean_html(example):
    """Strip anything that looks like an HTML tag from ``example['description']``.

    A "tag" is a ``<``, one or more characters other than ``>``, then ``>``; each
    match is deleted (replaced by the empty string). The mapping is modified in
    place and returned.
    """
    tag_pattern = r'<[^>]+>'
    raw_text = example['description']
    example['description'] = re.sub(tag_pattern, '', raw_text)
    return example

# Step 6: Rename main_cat into label
product_dataset = product_dataset.rename_column("main_cat", "label")

# Run the Step 5 HTML-tag cleanup over every description.
product_dataset = product_dataset.map(clean_html)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map: 100%|██████████| 10000/10000 [00:01<00:00, 6660.14 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 20533.04 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 19461.85 examples/s]


In [None]:
# Step 1: Load the tokenizer
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Step 2: Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``description`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens. The
    tokenizer's output is returned unchanged.
    """
    descriptions = examples["description"]
    encoded = tokenizer(
        descriptions,
        max_length=128,        # fixed length shared by all sequences
        truncation=True,       # cut sequences longer than max_length
        padding="max_length",  # pad shorter sequences up to max_length
    )
    return encoded

# Apply tokenization to the dataset
# batched=True hands tokenize_function a batch of examples per call instead of a single example.
tokenized_datasets = product_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 10000/10000 [00:02<00:00, 4781.38 examples/s]


In [58]:
# Print the whole first tokenized example (text, label and tokenizer outputs), not just a shape.
print(tokenized_datasets[0])

{'description': 'lt-840-6a 4ch cv dmx-pwm decoder  input signal: dmx512  input power: dc5vdc24v  max load current: 6a 4ch max 24a  max output power: 120w/288w/576w(5v/12v/24v)  dmx512 socket: xlr-3, rj45, green terminal (with signal amplification function)  dimming range: 0~100%  working temperature: -3065  dimension: l163w78h40mm  package size: l180w82h50mm  weight (g.w): 445g   package include: 1pc * lt-840-6a 4ch cv decoder ', 'label': 'Musical Instruments', 'input_ids': [101, 8318, 1011, 28122, 1011, 1020, 2050, 1018, 2818, 26226, 1040, 22984, 1011, 1052, 2860, 2213, 21933, 4063, 7953, 4742, 1024, 1040, 22984, 22203, 2475, 7953, 2373, 1024, 5887, 2629, 16872, 2278, 18827, 2615, 4098, 7170, 2783, 1024, 1020, 2050, 1018, 2818, 4098, 2484, 2050, 4098, 6434, 2373, 1024, 6036, 2860, 1013, 24841, 2860, 1013, 5401, 2575, 2860, 1006, 1019, 2615, 1013, 2260, 2615, 1013, 2484, 2615, 1007, 1040, 22984, 22203, 2475, 22278, 1024, 28712, 2099, 1011, 1017, 1010, 1054, 3501, 19961, 1010, 2665, 553

In [None]:
# Collect the category string of every example (the column renamed from main_cat to label).
main_cat_values = product_dataset['label']

# Learn the category -> integer id mapping from all of those values.
# fit() returns the encoder itself, so construction and fitting can be chained.
label_encoder = LabelEncoder().fit(main_cat_values)

# Add encoded labels to the dataset
def encode_labels(example):
    """Return ``{'label': <id>}`` where ``<id>`` is the encoder's integer for this example's label."""
    # transform() expects a sequence, so wrap the single label and unwrap the single result.
    encoded = label_encoder.transform([example['label']])
    return {'label': encoded[0]}

# Overwrite the string label of every example with its integer id (one example per call).
tokenized_datasets = tokenized_datasets.map(encode_labels)

# Verify the labels
# The first example should now carry an integer class id instead of a category name.
print(tokenized_datasets[0]['label'])


Map: 100%|██████████| 10000/10000 [00:02<00:00, 4189.79 examples/s]

15





In [41]:
# Inspect the first batch of data
batch = tokenized_datasets[:16]  # Take a batch of 16 examples
# NOTE: despite the "shape" wording in the messages, len() reports how many sequences
# are in the slice, not the length of each sequence.
print("Input IDs shape:", len(batch["input_ids"]))
print("Attention mask shape:", len(batch["attention_mask"]))

Input IDs shape: 16
Attention mask shape: 16


In [60]:
# Number of distinct categories the label encoder was fitted on.
print(len(label_encoder.classes_))

22


In [None]:
# Step 5: Load the model
# One output unit per category known to the label encoder.
num_labels = len(label_encoder.classes_)

# Record the id <-> category-name mapping in the model config so it is written out
# together with the weights by save_pretrained(). Without it, a model reloaded from disk
# only yields anonymous class indices, because the fitted label encoder itself is not
# saved anywhere in this pipeline.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Step 6: Set up training argument
training_args = TrainingArguments(
    output_dir="./results",           # where checkpoints are written
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",      # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,               # a single pass over the training split
    weight_decay=0.01,
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    logging_dir="./logs",
    logging_steps=10,                 # log every 10 steps
    load_best_model_at_end=True,      # reload the best checkpoint once training finishes
)

# Step 7: Define metrics function
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class of
    each row is the arg-max over axis 1 of the predictions.
    Returns ``{"accuracy": <score>}``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_ids)
    return {"accuracy": accuracy}

# Step 8: Initialize the Trainer
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hold out 20% of the examples for evaluation (seeded so the split is repeatable).
# NOTE: this rebinds `tokenized_datasets` from a single dataset to a train/test mapping,
# so re-running this cell requires re-creating the unsplit dataset first.
tokenized_datasets = tokenized_datasets.train_test_split(train_size=0.8, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Step 9: Train the model
trainer.train()

# Step 10: Evaluate the model
# Score the evaluation split and print the resulting metrics.
results = trainer.evaluate()
print(results)

# Step 11: Save the model
# Model and tokenizer go into the same directory so they can be reloaded together.
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0497,1.223777,0.662


KeyboardInterrupt: 

In [62]:
# Step 10: Evaluate the model
# Re-run evaluation on its own; this relies on `trainer` still being alive in the kernel.
results = trainer.evaluate()
print(results)

{'eval_loss': 1.223777413368225, 'eval_accuracy': 0.662}


In [None]:
from datasets import load_dataset, Dataset
import re
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    AutoModelForSequenceClassification,
    Trainer,
)
import torch
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import joblib  # For saving the label encoder

# Step 1: Load the dataset
# Read the local JSON Lines file as a single split, then keep a seeded random
# subset of 10,000 rows to work with.
all_product_dataset = load_dataset(
    "json",
    data_files="amz_products_small.jsonl",
    split="train",
)
product_dataset = (
    all_product_dataset
    .shuffle(seed=42)
    .select(range(10000))
)

# Step 2: Remove unnecessary columns
# Metadata fields that the later steps never read.
columns_to_remove = [
    'title', 'feature', 'brand',
    'also_buy', 'also_view', 'asin',
    'category', 'image', 'price',
]
product_dataset = product_dataset.remove_columns(columns_to_remove)

# Step 3: Flatten lists in the dataset
def flatten_lists(example):
    """Turn each list-valued field into a single string by joining its items with spaces.

    Works in place on ``example`` and returns that same mapping; fields that are
    not lists are not touched.
    """
    list_fields = [name for name, content in example.items() if isinstance(content, list)]
    for name in list_fields:
        example[name] = ' '.join(example[name])
    return example

# Join list-valued fields into plain strings for every example.
product_dataset = product_dataset.map(flatten_lists)

# Step 4: Lowercase text columns
def lowercase_text(example):
    """Replace each text column (only ``description`` for now) with its lower-cased form.

    Mutates and returns ``example``.
    """
    lowered = {column: example[column].lower() for column in ('description',)}
    example.update(lowered)
    return example

# Lower-case the description of every example.
product_dataset = product_dataset.map(lowercase_text)

# Step 5: Clean HTML tags from description
def clean_html(example):
    """Delete HTML-like tags (``<`` ... ``>``) from the ``description`` field.

    Each match is removed outright, with nothing inserted in its place.
    ``example`` is modified in place and returned.
    """
    tag_matcher = re.compile(r'<[^>]+>')
    example['description'] = tag_matcher.sub('', example['description'])
    return example

# Strip HTML tags from every description.
product_dataset = product_dataset.map(clean_html)

# Step 6: Rename main_cat into label
product_dataset = product_dataset.rename_column("main_cat", "label")

# Step 7: Split the dataset into train and test sets
product_dataset = product_dataset.train_test_split(train_size=0.8, seed=42)

# Step 8: Encode labels
# Fit on the labels of BOTH splits. Category names are targets, not model inputs, so
# this leaks nothing. It also guarantees that a rare category which happens to land only
# in the test split still has an id; LabelEncoder.transform raises ValueError for any
# label it was not fitted on, which would abort the label-encoding step.
label_encoder = LabelEncoder()
label_encoder.fit(
    list(product_dataset["train"]["label"]) + list(product_dataset["test"]["label"])
)

def encode_labels(example):
    """Map this example's label to the integer id assigned by ``label_encoder``.

    Returns a one-key dict ``{"label": <id>}``.
    """
    # transform() works on sequences: pass a one-item list and take the only result.
    label_ids = label_encoder.transform([example["label"]])
    return {"label": label_ids[0]}

# Replace the string label of every example with its integer id.
product_dataset = product_dataset.map(encode_labels)

# Step 9: Save the label encoder for inference
# joblib pickles the fitted encoder; only load this file back from a trusted location.
joblib.dump(label_encoder, "label_encoder.pkl")

# Step 10: Load the tokenizer
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Step 11: Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``description`` field of a batch, truncating over-long sequences.

    No padding is applied here; padding is left to the data collator at batch time.
    The tokenizer's output is returned unchanged.
    """
    descriptions = examples["description"]
    return tokenizer(descriptions, truncation=True)

# Tokenize all examples; batched=True passes a batch of examples to each tokenize_function call.
tokenized_datasets = product_dataset.map(tokenize_function, batched=True)

# Step 12: Load the model
# Size the classification head to the number of categories the label encoder knows.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
)

# Step 13: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",           # where checkpoints are written
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",      # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Single epoch; raise this value for more thorough fine-tuning
    weight_decay=0.01,
    save_strategy="epoch",            # checkpoint on the same schedule as evaluation
    logging_dir="./logs",
    logging_steps=10,                 # log every 10 steps
    load_best_model_at_end=True,      # reload the best checkpoint once training finishes
)

# Step 14: Define metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class of
    each row is the arg-max over axis 1 of the predictions.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted_ids)
    # Weighted average: per-class F1 weighted by how many true examples each class has.
    metrics["f1"] = f1_score(references, predicted_ids, average="weighted")
    return metrics

# Step 15: Initialize the data collator
# Pads the examples of each batch when the batch is assembled (the tokenization step
# above applies no padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 16: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Step 17: Train the model
trainer.train()

# Step 18: Evaluate the model
# Score the evaluation split and print the resulting metrics.
results = trainer.evaluate()
print(results)

# Step 19: Save the model and tokenizer
# Both go into the same directory so they can be reloaded together.
model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

Map: 100%|██████████| 2000/2000 [00:00<00:00, 2364.05 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
