In [None]:
# !pip install --upgrade transformers==4.44.2

In [None]:
# import sys
# print(sys.executable)


In [None]:
# !pip install "huggingface_hub[hf_xet]"


In [None]:
# !pip install -U transformers accelerate huggingface_hub safetensors


In [None]:
# !pip install numpy pandas matplotlib scikit-learn torch torchvision torchaudio datasets tqdm jupyter ipykernel seaborn


In [None]:
# from huggingface_hub import login, HfApi
# login(token=os.environ["HF_TOKEN"])  # needs `import os`; never hardcode access tokens — revoke any token that was committed here

# api = HfApi()
# api.whoami()


In [None]:
# from huggingface_hub import HfApi
# api = HfApi()
# api.model_info("meta-llama/Llama-3.2-1B-Instruct")


In [None]:
# from tqdm.notebook import tqdm
# tqdm.pandas()


In [None]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


In [None]:
import torch
# Report whether CUDA is usable, then name device 0 (or say there is no GPU).
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU")


In [None]:
# =============================================================
# 🧠 Fine-tune LLaMA 3.2-1B on Arabic Ham/Spam Dataset (Local)
# =============================================================

import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import classification_report, accuracy_score
import re
from huggingface_hub import snapshot_download
# from tqdm.auto import tqdm  # instead of tqdm.notebook
from tqdm.notebook import tqdm
tqdm.pandas()



In [None]:
# -----------------------------
# Configuration
# -----------------------------
# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 
# Dataset locations. The defaults are the original machine-specific paths; set the
# SPAM_TRAIN_CSV / SPAM_TEST_CSV environment variables to run the notebook on
# another machine without editing this cell.
TRAIN_PATH = os.environ.get(
    "SPAM_TRAIN_CSV",
    r"C:\Users\stdFurqan\Downloads\arabic_ham_SPAM\70_arabic_HS.csv",
)
TEST_PATH = os.environ.get(
    "SPAM_TEST_CSV",
    r"C:\Users\stdFurqan\Downloads\arabic_ham_SPAM\test.csv",
)

# Column names expected in both CSV files.
TEXT_COL = "Cleaned Text"
LABEL_COL = "Label"

# Class names, in the order used for the classification report.
LABELS = ["Ham", "Spam"]

# Random seed and number of training epochs.
SEED = 20
EPOCHS = 5

# Device that inference inputs are moved to.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------
# Reproducibility
# -----------------------------
def set_seed(seed=20):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator (defaults to 20).

    Returns:
        None. The call only changes global RNG state.
    """
    # CPU-side generators all take the seed the same way.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to seed when no CUDA device is available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG once, up front, using the SEED value from the configuration above.
set_seed(SEED)

# -----------------------------
# Robust CSV loader
# -----------------------------
def safe_read_csv(path, encodings=None):
    """Read a CSV file, trying several text encodings in order.

    Malformed rows are skipped (``on_bad_lines='skip'``). The first encoding
    that parses the file wins.

    Args:
        path: Location of the CSV file.
        encodings: Optional iterable of encoding names to try, in order.
            Defaults to ``['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']``.

    Returns:
        pandas.DataFrame with the parsed contents.

    Raises:
        ValueError: If the file cannot be opened, or if no encoding parses it.
            The underlying exception is attached as ``__cause__``.
    """
    if encodings is None:
        encodings = ['utf-8', 'latin1', 'ISO-8859-1', 'cp1252']
    encodings = list(encodings)

    last_error = None
    for position, enc in enumerate(encodings):
        try:
            frame = pd.read_csv(path, encoding=enc, on_bad_lines='skip')
        except OSError as e:
            # A missing or unreadable file is not an encoding problem: retrying
            # with other encodings would only repeat the same failure.
            raise ValueError(f"❌ Could not read file: {path}") from e
        except Exception as e:
            print(f"⚠️ Failed with {enc}: {e}")
            last_error = e
            continue

        if position > 0:
            # Single-byte fallbacks such as latin1 accept any byte sequence, so
            # a successful read here does not prove the text decoded correctly.
            print(f"⚠️ {path} was read with fallback encoding {enc}; check that the text is not garbled.")
        return frame

    raise ValueError(f"❌ Could not read file: {path}") from last_error

# Load the two splits with the encoding-tolerant CSV reader.
train_df = safe_read_csv(TRAIN_PATH)
test_df = safe_read_csv(TEST_PATH)

# Confirm the load and show how many rows each split contains.
print(
    "✅ Data loaded successfully!",
    f"Train size: {len(train_df)} | Test size: {len(test_df)}",
    sep="\n",
)

In [None]:
# from huggingface_hub import snapshot_download
# HF_TOKEN = os.environ["HF_TOKEN"]  # read the token from the environment; never commit it (revoke any token that was stored here)

# local_dir = r"C:\Users\stdFurqan\Downloads\lama_models_download\LAMA_3.2(1b)"

# snapshot_download(
#     repo_id="meta-llama/Llama-3.2-1B-Instruct",
#     local_dir=local_dir,
#     token=HF_TOKEN,
#     local_dir_use_symlinks=False
# )

# print("✅ Model successfully downloaded.")


In [None]:
# # ✅ Local directory where you want to store the model
# local_dir = r"C:\Users\stdFurqan\Downloads\lama_models_download\LAMA_3.2(1b)"

# # ✅ Download the entire model snapshot
# snapshot_download(
#     repo_id="meta-llama/Llama-3.2-1B-Instruct",
#     local_dir=local_dir,
#     token=HF_TOKEN,
#     local_dir_use_symlinks=False  # safer for Windows (no symlink issues)
# )

# print(f"✅ Model successfully downloaded to: {local_dir}")


In [None]:
# ✅ Local path where model is stored
# Directory holding the locally stored checkpoint; the commented line is the 3B alternative.
MODEL_PATH = r"C:\Users\stdFurqan\Downloads\lama_models_download\LAMA_3.2(1b)"
# MODEL_PATH = r"C:\Users\stdFurqan\Downloads\lama_models_download\LAMA_3.2(3b)"

print(f"🧠 Loading model from {MODEL_PATH} ...")

# Tokenizer from the local folder: pad on the left and reuse the EOS token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Model from the same folder, loaded in bfloat16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Keep the model config's pad id in step with the tokenizer, then switch to eval mode.
model.config.pad_token_id = tokenizer.pad_token_id
model.eval()

print("✅ Model loaded successfully from local directory!")

In [None]:





# -----------------------------
# Prompt template
# -----------------------------
# Instruction prompt; `{text}` is filled with the message to classify and the
# prompt ends at "Label:" so the answer is the next thing to be written.
PROMPT_TEMPLATE = "".join([
    "Classify the following tweet as either Ham (normal/non-spam message) or Spam (unsolicited or promotional message).\n",
    "Reply with only one word: Ham, or Spam.\n\n",
    "Text: {text}\n\nLabel:",
])


def make_prompt(text, label=None):
    """Build the classification prompt for one message.

    Args:
        text: Message to insert into the template.
        label: Optional gold label. When given, it is appended after a single
            space so the result is a complete training example.

    Returns:
        The prompt string, ending in "Label:" (no label) or "Label: <label>".
    """
    prompt = PROMPT_TEMPLATE.format(text=text)
    if label is not None:
        prompt = prompt + " " + label
    return prompt

# -----------------------------
# Dataset class
# -----------------------------
class SentimentDataset(Dataset):
    """Prompt-formatted classification examples for causal-LM fine-tuning.

    Each row's text is wrapped with ``make_prompt``. In training mode the gold
    label is appended to the prompt and a ``labels`` tensor is produced; in
    inference mode only the label-free prompt is tokenized.

    Args:
        df: DataFrame containing ``TEXT_COL`` (and ``LABEL_COL`` when training).
        tokenizer: Callable tokenizer returning a mapping of batched tensors.
        is_train: Whether to append gold labels and emit ``labels``.
        max_length: Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, df, tokenizer, is_train=True, max_length=256):
        self.texts = df[TEXT_COL].astype(str).tolist()
        self.labels = df[LABEL_COL].astype(str).tolist() if is_train else None
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def _encode(self, prompt):
        """Tokenize one prompt to fixed length and drop the batch dimension."""
        # NOTE(review): if the tokenizer truncates on the right (the usual
        # default), a prompt longer than max_length would lose its trailing
        # label — confirm the inputs are short enough.
        tokenized = self.tokenizer(prompt, truncation=True, padding="max_length",
                                   max_length=self.max_length, return_tensors="pt")
        return {k: v.squeeze(0) for k, v in tokenized.items()}

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of 1-D tensors."""
        text = self.texts[idx]
        if not self.is_train:
            return self._encode(make_prompt(text))

        item = self._encode(make_prompt(text, self.labels[idx]))
        labels = item["input_ids"].clone()
        # Exclude padding from the loss. Masking uses the attention mask rather
        # than the pad token id, because the pad token may be the same id as a
        # real token (e.g. EOS reused as padding).
        attention_mask = item.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100  # -100 is the ignore index of the LM loss
        item["labels"] = labels
        return item

# Wrap both splits as prompt datasets: the training split appends the gold label
# to each prompt (is_train=True); the test split yields label-free prompts.
# NOTE(review): test_dataset is not referenced by the evaluation loop visible in
# this notebook, which tokenizes test_df directly — confirm it is still needed.
train_dataset = SentimentDataset(train_df, tokenizer, is_train=True)
test_dataset  = SentimentDataset(test_df, tokenizer, is_train=False)



In [None]:
# -----------------------------
# Training configuration
# -----------------------------
# Trainer settings, grouped by concern. No evaluation runs during training, so
# best-model tracking is off and a checkpoint is simply written after each epoch.
training_args = TrainingArguments(
    # --- optimisation ---
    num_train_epochs=EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=SEED,
    # --- numeric precision: bf16 on, fp16 off ---
    bf16=True,
    fp16=False,
    # --- evaluation / checkpointing ---
    eval_strategy="no",
    load_best_model_at_end=False,
    save_strategy="epoch",
    output_dir="./results",
    # --- logging: once per epoch, no external reporters ---
    logging_strategy="epoch",
    logging_dir="./logs",
    report_to=[],
    # disable_tqdm=True,               # optional: avoids frozen progress bars
    # --- data loading: load batches in the main process ---
    dataloader_num_workers=0,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
)

# -----------------------------
# Training & Evaluation (Windows-safe entry point)
# -----------------------------
# Whole-word, case-insensitive match for one of the two class names. The
# alternation must not contain an empty branch: an empty branch matches the empty
# string at the first word boundary and hides a label that appears later.
pattern = re.compile(r"\b(Ham|Spam)\b", re.IGNORECASE)


def extract_label(output):
    """Return the first class label found in generated text.

    Args:
        output: Text produced by the model for one prompt.

    Returns:
        "Ham" or "Spam" (capitalised regardless of the casing in ``output``),
        or "Unknown" when neither appears as a whole word.
    """
    m = pattern.search(output)
    return m.group(1).capitalize() if m else "Unknown"


# -----------------------------
# Training & Evaluation (Windows-safe entry point)
# -----------------------------
if __name__ == "__main__":
    print("🚀 Starting fine-tuning ...")
    trainer.train()

    print("\n✅ Training finished! Now evaluating ...")

    model.eval()
    pred_labels = []
    true_labels = test_df[LABEL_COL].astype(str).tolist()
    texts = test_df[TEXT_COL].astype(str).tolist()

    batch_size = 4
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        prompts = [make_prompt(t) for t in batch]
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=6,
                do_sample=False,
                top_p=1.0,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        # generate() returns prompt + continuation. The batch is left-padded, so
        # every prompt occupies the first `prompt_len` positions; slicing there
        # keeps only the new tokens. Stripping the prompt by string prefix is
        # fragile: if the decoded text does not start with the exact prompt, the
        # instruction text (which contains "Ham") would be parsed as the answer.
        prompt_len = inputs["input_ids"].shape[1]
        decoded = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)
        pred_labels.extend(extract_label(gen.strip()) for gen in decoded)

    # -----------------------------
    # Metrics
    # -----------------------------
    report = classification_report(true_labels, pred_labels, labels=LABELS, digits=4, zero_division=0)
    acc = accuracy_score(true_labels, pred_labels)
    print("\n📊 Classification Report:\n")
    print(report)
    print(f"Accuracy: {acc:.4f}")
