In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling # Corrected import
)
from peft import LoraConfig, get_peft_model, TaskType
# from trl import SFTTrainer # SFTTrainer is not used in this code, consider removing it
import re
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Check GPU availability
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # Use mem_get_info() which returns total and free memory
    total_gpu_memory, free_gpu_memory = torch.cuda.mem_get_info()
    print(f"GPU memory: {free_gpu_memory / (1024**3):.2f} GB free out of {total_gpu_memory / (1024**3):.2f} GB total")

CUDA available: True
GPU: Tesla T4
GPU memory: 14.74 GB free out of 14.64 GB total


In [None]:
# Load and preprocess data
def load_and_preprocess_data():
    """Load and preprocess the Excel files"""
    train_df = pd.read_excel('bodywash-train.xlsx')
    test_df = pd.read_excel('bodywash-test.xlsx')

    print("Train data shape:", train_df.shape)
    print("Test data shape:", test_df.shape)
    print("\nTrain columns:", train_df.columns.tolist())

    # Clean text function
    def clean_text(text):
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # Clean the text data
    train_df['cleaned_text'] = train_df['Core Item'].apply(clean_text)
    test_df['cleaned_text'] = test_df['Core Item'].apply(clean_text)

    # Process tags
    if 'Level 1 Factors' in train_df.columns:
        train_df['tags'] = train_df['Level 1 Factors'].apply(
            lambda x: [tag.strip() for tag in str(x).split(',')] if pd.notna(x) else []
        )

    return train_df, test_df

# Load data
train_df, test_df = load_and_preprocess_data()

Train data shape: (7744, 2)
Test data shape: (127, 2)

Train columns: ['Core Item', 'Level 1 Factors']


In [None]:
# Create instruction prompts for the model
def create_instruction_prompts(df, is_training=True):
    """Create instruction-following prompts for the model"""
    prompts = []

    for idx, row in df.iterrows():
        if is_training and 'tags' in row:
            # Training format: Instruction + Input + Expected Output
            instruction = "Analyze the following body wash product and predict its Level I factors (tags)."
            input_text = f"Product: {row['cleaned_text']}"
            output_text = f"Tags: {', '.join(row['tags']) if row['tags'] else 'None'}"

            prompt = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output_text}"""

        else:
            # Inference format (for test data)
            instruction = "Analyze the following body wash product and predict its Level I factors (tags)."
            input_text = f"Product: {row['cleaned_text']}"

            prompt = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
Tags:"""

        prompts.append(prompt)

    return prompts

# Create training and validation datasets
train_prompts = create_instruction_prompts(train_df, is_training=True)

# Split into train/validation
train_texts, val_texts = train_test_split(
    train_prompts,
    test_size=0.1,
    random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")


Training samples: 6969
Validation samples: 775


In [None]:

# Create datasets
train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})

In [None]:
# Initialize tokenizer and model
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use half precision to save memory
    device_map="auto",  # Automatically handle GPU placement
    trust_remote_code=True
)

print(f"Model loaded on: {model.device}")



Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Model loaded on: cuda:0


In [None]:
# Configure QLoRA
lora_config = LoraConfig(
    r=16,  # Rank of adaptation
    lora_alpha=32,  # LoRA scaling parameter
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Target modules for attention and MLP
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply QLoRA to model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



trainable params: 18,087,936 || all params: 1,729,464,320 || trainable%: 1.0459


In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the text data"""
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Added padding
        max_length=512,  # Reduced for efficiency
        return_tensors=None,
    )

    # For causal LM, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized




In [None]:
# Tokenize datasets
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)


Tokenizing datasets...


Map:   0%|          | 0/6969 [00:00<?, ? examples/s]

Map:   0%|          | 0/775 [00:00<?, ? examples/s]

In [None]:
# Data collator
data_collator = DataCollatorForLanguageModeling( # Corrected DataCollator
    tokenizer=tokenizer,
    mlm=False, # Set to False for Causal LM
)

# Training arguments optimized for QLoRA
training_args = TrainingArguments(
    output_dir="./smolLM-bodywash-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Reduced for quick training
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8
    warmup_steps=50,
    learning_rate=2e-4,  # Slightly higher for QLoRA
    fp16=True,  # Use mixed precision
    logging_steps=10,
    eval_strategy="steps", # Corrected argument name
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=None,  # Disable wandb/tensorboard for simplicity
    ddp_find_unused_parameters=False,
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)



In [None]:
# Start training
print("Starting training...")
trainer.train()


Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdebasishpra314[0m ([33mdebasishpra314-indian-institute-of-technology-tirupati[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,2.2012,1.944088
100,1.7724,1.706197
150,1.6117,1.64315
200,1.565,1.602217
250,1.4966,1.570719
300,1.5672,1.547376
350,1.6318,1.521717
400,1.5165,1.501045
450,1.5846,1.482297
500,1.5036,1.461221


TrainOutput(global_step=2616, training_loss=1.1207627694176607, metrics={'train_runtime': 11279.3159, 'train_samples_per_second': 1.854, 'train_steps_per_second': 0.232, 'total_flos': 1.046118697229353e+17, 'train_loss': 1.1207627694176607, 'epoch': 3.0})

### 📊 Training vs Validation Loss Progress

The table below summarizes how the **training loss** and **validation loss** evolved over training steps.  
- Training loss steadily decreases → model is learning.  
- Validation loss also decreases smoothly → no overfitting (good generalization).  
- Around step **2500+**, losses stabilize near **0.87** (validation), which indicates good convergence.  

| Step | Training Loss | Validation Loss | 🔎 Comment |
|------|---------------|-----------------|------------|
| 50   | 2.2012 | 1.9441 | Model just starting, high losses. |
| 250  | 1.4966 | 1.5707 | Training < Validation → underfitting reducing. |
| 500  | 1.5036 | 1.4612 | Losses closer, validation improving. |
| 1000 | 1.1479 | 1.2833 | Strong improvement, good learning. |
| 1500 | 0.9418 | 1.0965 | Gap narrowing, generalization improving. |
| 1800 | 0.7459 | 0.9986 | Significant drop, model stabilizing. |
| 2000 | 0.6771 | 0.9617 | Good convergence trend. |
| 2250 | 0.6736 | 0.8978 | Validation improving, nearing optimum. |
| 2500 | 0.5379 | 0.8711 | Best region, very stable. |
| 2600 | 0.6387 | 0.8655 | Final stage, convergence achieved. |

---

✅ **Final Training Loss** = **1.1208**  
✅ **Final Validation Loss** ≈ **0.865**  
📌 **Conclusion:** Model converged well, both losses decreased consistently, no signs of severe overfitting.  



In [None]:
# Save the fine-tuned model
trainer.save_model()
tokenizer.save_pretrained("./smolLM-bodywash-finetuned")
print("Model saved!")

Model saved!


In [None]:
# Inference function
def predict_tags(model, tokenizer, product_descriptions):
    """Predict tags for given product descriptions"""
    predictions = []

    model.eval()  # Set model to evaluation mode

    for desc in product_descriptions:
        # Create prompt for inference
        prompt = f"""### Instruction:
Analyze the following body wash product and predict its Level I factors (tags).

### Input:
Product: {desc}

### Response:
Tags:"""

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

        # Generate prediction
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids.to(model.device),
                max_new_tokens=50,  # Limit output length
                temperature=0.3,  # Lower temperature for more deterministic outputs
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode prediction
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract only the generated part (after "Tags:")
        response = generated_text.split("Tags:")[-1].strip()

        # Clean up the response
        tags = [tag.strip() for tag in response.split(',') if tag.strip()]
        predictions.append(tags)

    return predictions

In [None]:
# Make predictions on test data
print("Making predictions on test data...")
test_descriptions = test_df['cleaned_text'].tolist()
predicted_tags = predict_tags(model, tokenizer, test_descriptions)



The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Making predictions on test data...


In [None]:
# Add predictions to test dataframe
test_df['predicted_Level_I_factors'] = [', '.join(tags) for tags in predicted_tags]


In [None]:
# Save results
test_df[['Core Item', 'predicted_Level_I_factors']].to_excel( # Corrected column name for output
    'bodywash-test-predictions-smolLM.xlsx',
    index=False
)
print("Predictions saved to 'bodywash-test-predictions-smolLM.xlsx'")

Predictions saved to 'bodywash-test-predictions-smolLM.xlsx'


In [None]:

# Show sample predictions
print("\nSample predictions:")
for i in range(min(5, len(test_df))):
    print(f"Item: {test_df['Core Item'].iloc[i]}") # Corrected column name for output
    print(f"Predicted tags: {test_df['predicted_Level_I_factors'].iloc[i]}")
    print("-" * 80)


Sample predictions:
Item: "All of the body washes are excellent and they layer very well with their cologne counterparts. However, my skin eats up fragrances unless I moisturize heavy, any chance you'll release scented body lotion to go along with these scents ? Thanks. Also I'm here from Ashton from Gents Scents 
Predicted tags: Brand Value Brand Accountability Brand Accountability Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value
--------------------------------------------------------------------------------
Item: "Cremo is by far the best!"
Predicted tags: Brand Value For Money Brand Value Brand Value For Money Brand Value Brand Value For Money Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Va

Sample predictions:
Item: "All of the body washes are excellent and they layer very well with their cologne counterparts. However, my skin eats up fragrances unless I moisturize heavy, any chance you'll release scented body lotion to go along with these scents ? Thanks. Also I'm here from Ashton from Gents Scents
Predicted tags: Brand Value Brand Accountability Brand Accountability Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value

Item: "Cremo is by far the best!"
Predicted tags: Brand Value For Money Brand Value Brand Value For Money Brand Value Brand Value For Money Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Brand Value Product Text

Item: "I use the Nivea's during the spring and summer and Old spice during the fall and winter months... So this begs the question: Why no mention of anything from Old Spice? Have you not tried any of their body wash products or you feel like they're not upto par....?"
Predicted tags: Brand Value For Money Brand Value Brand Value For Money Skin Care Brand Value Brand Value For Money Skin Care Skin Care Product Texture Brand Value Brand Value Brand Value Brand Value Brand Value Product Texture Skin Feel Brand Value Brand Value Skin Care Brand Value Brand Value

Item: "Nivea and Dove. Both are great on my skin. Not only does it clean good but it proper rehydrates my skin with no irratation plus its the fininsh you get looking in the mirror. Ultra smooth skin. Oh its good on hair as well. I dont use Axe, its consider for kids where im from. Axe is lynx. Good review."
Predicted tags: Skin Texture Improvement Skin Texture Improvement Cleansing Completeness Skin Texture Improvement Skin Texture Improvement Cleansing Completeness Brand Value For Money Skin Sensibility Irritation

 Input:
Product: great product my husband

Item: "OG, The one thing that would hold me back from this is the chemicals Iâ€™m assuming are in this wash.  (Parabens, etc.) These days Iâ€™m a big Cremo body wash guy."
Predicted tags: Brand Accountability Accountability Source Safety Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand Accountability Brand
