### Create a fine-tuned machine learning model that classifies customer reviews of products

In [None]:
import os
import json
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, ClassLabel,DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load the review CSV and display its first rows.
df = pd.read_csv("amazon_review.csv")
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


# Config


In [None]:
# --- Model and output locations ---
MODEL_NAME = "distilbert-base-uncased"               # base checkpoint to fine-tune
OUTPUT_DIR = "./review-finetuned-model"              # local checkpoints and logs
HUB_MODEL_ID = "anantacoder/distilbert-base-uncased" # target repo on the Hub

# --- Task ---
NUM_LABELS = 3          # Discontinue / Keep / Increase
MAX_SEQ_LENGTH = 128    # token limit applied when tokenizing reviews

# --- Optimisation ---
EPOCHS = 4
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
SEED = 42               # shared by the data split and the Trainer



In [None]:

from huggingface_hub import login

# Authenticate with the Hugging Face Hub when a token is supplied through the
# environment. `os.environ` is a mapping, not a callable, so the lookup goes
# through `.get()`, which returns None instead of raising when the variable is
# unset and lets the warning branch below run.
hf_token = os.environ.get("HF_HUB_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("Warning: HF_HUB_TOKEN not set; pushing to hub may fail. Please run 'huggingface-cli login' or set HF_HUB_TOKEN.")

In [6]:
# Collapse the 1-5 star rating into a three-way decision label.
rating_to_label = {
    **dict.fromkeys((1.0, 2.0), "Discontinue"),
    3.0: "Keep",
    **dict.fromkeys((4.0, 5.0), "Increase"),
}

# Load the reviews, drop rows missing the text or the rating, and attach the
# decision label derived from the rating.
raw_df = (
    pd.read_csv("amazon_review.csv")
    .dropna(subset=["reviewText", "overall"])
    .assign(decisionLabel=lambda frame: frame["overall"].map(rating_to_label))
)

In [7]:
# Show the decision label assigned to each review.
print(raw_df.loc[:, "decisionLabel"])

0          Increase
1          Increase
2          Increase
3          Increase
4          Increase
           ...     
4910    Discontinue
4911       Increase
4912       Increase
4913       Increase
4914       Increase
Name: decisionLabel, Length: 4914, dtype: object


In [8]:
# Encode the decision names as integer class ids in the order listed here
# (Discontinue=0, Keep=1, Increase=2).
decision_labels = ClassLabel(names=["Discontinue", "Keep", "Increase"])
raw_df["labels"] = raw_df["decisionLabel"].map(decision_labels.str2int)
print(raw_df["labels"])

0       2
1       2
2       2
3       2
4       2
       ..
4910    0
4911    2
4912    2
4913    2
4914    2
Name: labels, Length: 4914, dtype: int64


# Train/validation split

In [9]:
# Hold out 20% of the reviews for validation, stratified on the class id so
# both splits keep the same label proportions.
train_df, valid_df = train_test_split(
    raw_df.loc[:, ["reviewText", "labels"]],
    stratify=raw_df["labels"],
    test_size=0.2,
    random_state=SEED,
)

In [10]:
# Convert both pandas splits into Hugging Face datasets, each with a fresh
# 0..n-1 index, and bundle them under the usual split names.
train_ds, valid_ds = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, valid_df)
)
datasets = DatasetDict({"train": train_ds, "validation": valid_ds})

print(train_df.head())

                                             reviewText  labels
1847  I bought this for my GoPro and so far it's wor...       2
386   I bought this to use in my Chromebook with a s...       2
2111  Works great in my compatible device BUT if you...       2
668   Used for two months in Samsung Galaxy.  Fizzle...       0
2460  As stated above, and not a bad price.  Bought ...       2


# Tokenizer 

In [None]:
# The tokenizer turns raw text strings into numeric data that a model can understand.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the ``reviewText`` entries of a batch.

    Sequences are truncated to ``MAX_SEQ_LENGTH`` but deliberately not padded
    here. Padding is left to the ``DataCollatorWithPadding`` that is handed to
    the Trainer, so each mini-batch is padded only to its own longest sequence
    instead of always to ``MAX_SEQ_LENGTH``.

    Args:
        batch: Mapping that holds the review texts under ``"reviewText"``
            (this function is applied through ``datasets.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        batch["reviewText"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )

# Tokenize both splits; every original column except the labels is dropped.
tokenized = datasets.map(
    tokenize_batch,
    batched=True,
    remove_columns=[name for name in datasets["train"].column_names if name != "labels"],
)

# Keep only the fields the Trainer cares about.
keep = ["input_ids", "attention_mask", "labels"]
tokenized = tokenized.remove_columns(
    [name for name in tokenized["train"].column_names if name not in keep]
)

# Return PyTorch tensors for exactly those fields.
tokenized.set_format(type="torch", columns=keep)

collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 3931/3931 [00:00<00:00, 6776.91 examples/s]
Map: 100%|██████████| 983/983 [00:00<00:00, 9421.59 examples/s]


# Model training 

In [None]:
# Human-readable class names for the model config. The ids follow
# `decision_labels`, the same ClassLabel used to build the integer `labels`
# column, so predictions from the saved model report "Discontinue" / "Keep" /
# "Increase" instead of generic LABEL_<n> names.
id2label = dict(enumerate(decision_labels.names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained encoder with a freshly initialised classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

# Metrics used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_prediction):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_prediction: Pair of ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_prediction
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="weighted")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=OUTPUT_DIR,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=50,
    # Optimisation schedule.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    seed=SEED,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the highest F1 when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Hub settings; automatic pushing during training is switched off.
    push_to_hub=False,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="every_save",
)

# Wire the model, data, collator and metrics together.
# The tokenizer is passed as `processing_class`; the `tokenizer=` keyword of
# `Trainer` is deprecated in favour of it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


# Run training

In [14]:
if __name__ == "__main__":
    # Fine-tune first; the steps below operate on the trained weights.
    print("Training model...")
    trainer.train()

    # Announce evaluation only once it actually starts (previously this
    # message was printed before training began).
    print("Evaluating model on validation set...")
    metrics = trainer.evaluate()
    print("Evaluation Metrics:", metrics)

    # Save the model, then upload it to the Hub.
    trainer.save_model()
    trainer.push_to_hub()
    

Evaluating model on validation set...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2025,0.180386,0.944049,0.931639
2,0.141,0.17139,0.948118,0.93749
3,0.1192,0.182192,0.950153,0.942453
4,0.0646,0.203415,0.950153,0.943812




Evaluation Metrics: {'eval_loss': 0.20341487228870392, 'eval_accuracy': 0.9501525940996948, 'eval_f1': 0.9438121124599416, 'eval_runtime': 86.1517, 'eval_samples_per_second': 11.41, 'eval_steps_per_second': 0.36, 'epoch': 4.0}


model.safetensors: 100%|██████████| 268M/268M [04:38<00:00, 960kB/s]    


In [15]:
if __name__ == "__main__":
    # Repeats the evaluate / save / push steps on the existing `trainer`
    # without training again.
    metrics = trainer.evaluate()
    print("Evaluation Metrics:", metrics)
    # Save the model, then push it to the Hub.
    trainer.save_model()
    trainer.push_to_hub()



Evaluation Metrics: {'eval_loss': 0.20341487228870392, 'eval_accuracy': 0.9501525940996948, 'eval_f1': 0.9438121124599416, 'eval_runtime': 83.9177, 'eval_samples_per_second': 11.714, 'eval_steps_per_second': 0.369, 'epoch': 4.0}


No files have been modified since last commit. Skipping to prevent empty commit.
