<a href="https://colab.research.google.com/github/Codewithprasenjit/AI-Humanizine-text-review-system-from-large-language-models/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets accelerate peft bitsandbytes einops

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
from datasets import Dataset
import pandas as pd
import gc
import os

# --- Configuration ---

MODEL_NAME = "tiiuae/falcon-7b"  # Falcon 7B (make sure you have access)
# Single source of truth for the training CSV. The loader below used to
# hard-code this Drive path while this constant pointed at an unused
# "/mnt/data/train.csv"; both now agree on the path that was actually read.
DATASET_PATH = "/content/drive/MyDrive/train.csv"
OUTPUT_DIR = "./falcon7b_finetuned_qlora"      # adapter + tokenizer are saved here
FINAL_OUTPUT_DIR = OUTPUT_DIR + "_quantized"   # merged model is saved here
MAX_LENGTH = 256                               # tokens per example after truncation/padding
EPOCHS = 3
BATCH_SIZE = 1

# --- Memory Clean-up ---

# Drop cached CUDA allocations and unreachable Python objects before loading a model.
torch.cuda.empty_cache()
gc.collect()

# --- Load Dataset ---

print("📂 Loading dataset...")
try:
    df = pd.read_csv(DATASET_PATH, encoding='latin1')
    # Only the "text" column is used; rows without text are dropped.
    df = df[["text"]].dropna().reset_index(drop=True)
    dataset = Dataset.from_pandas(df)
    print(f"✅ Loaded {len(dataset)} samples")
except Exception as e:
    print(f"❌ Failed to load dataset: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

# --- Tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# --- Quantization Config ---

# 4-bit NF4 storage with double quantization and a float16 compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# --- Load Model ---

print("🚀 Loading model with 4-bit quantization...")
# Load the quantized base model (KV cache disabled) and prepare it for
# k-bit training in a single expression.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
        use_cache=False,
    )
)

# --- Apply QLoRA ---

# Rank-8 adapters on the "query_key_value" modules only.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("✅ Model ready with QLoRA setup")

# --- Tokenize Dataset ---

def tokenize(examples):
    """Tokenize the ``text`` field of a batch, truncated and padded to MAX_LENGTH."""
    encode_options = {
        "truncation": True,
        "max_length": MAX_LENGTH,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **encode_options)


# batched=True hands ``tokenize`` many rows per call.
tokenized_dataset = dataset.map(tokenize, batched=True)
print("✅ Tokenization complete")

# --- Training Setup ---

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
    # The dataset still carries the raw "text" column; the collator below
    # picks out only the fields it needs.
    remove_unused_columns=False
)


def _collate_causal_lm(features):
    """Build a causal-LM batch from tokenized examples.

    Args:
        features: list of dataset rows, each holding equal-length
            ``input_ids`` and ``attention_mask`` lists.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        ``labels`` copies ``input_ids`` but is set to -100 wherever
        ``attention_mask`` is 0.
    """
    input_ids = torch.stack([torch.tensor(f['input_ids']) for f in features])
    attention_mask = torch.stack([torch.tensor(f['attention_mask']) for f in features])
    # Every example is padded to MAX_LENGTH with the EOS token. Copying
    # input_ids verbatim would make the loss count all of that padding, so
    # padded positions get -100 (the index the loss is expected to ignore).
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=_collate_causal_lm
)

# --- Train Model ---

print("\n🚀 Starting training...")
trainer.train()
print("✅ Training complete")
# Persist the LoRA adapter and tokenizer; the merge step below reloads the
# adapter from this directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# --- Merge & Save Final Quantized Model ---

print("\n🔄 Merging LoRA adapters...")
# Reload the base model without the 4-bit config so the adapter can be folded
# into real weights. float16 matches the dtype the inference pipeline requests
# and avoids holding/saving the weights in the loader's default precision.
# NOTE: despite the "_quantized" directory name, the merged checkpoint saved
# below is a regular (non-quantized) model.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
merged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
merged_model = merged_model.merge_and_unload()

print("💾 Saving final quantized model...")
merged_model.save_pretrained(FINAL_OUTPUT_DIR)
tokenizer.save_pretrained(FINAL_OUTPUT_DIR)

# --- Inference ---

print("\n🧪 Testing quantized model...")
try:
    # Load the merged checkpoint from disk onto the first GPU in float16.
    generator = pipeline(
        task="text-generation",
        model=FINAL_OUTPUT_DIR,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device="cuda:0",
    )

    # Nucleus sampling settings for the smoke test.
    sampling_options = {
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
    }
    prompt = "Rewrite this in a friendly and human way with emojis:."
    output = generator(prompt, **sampling_options)

    print("\n🎉 Quantized Model Output:")
    first_result = output[0]
    print(first_result['generated_text'])

except Exception as e:
    # Best-effort smoke test: report the failure without aborting the cell.
    print(f"❌ Inference failed: {e}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
# 🚀 Install dependencies
!pip install -q transformers datasets accelerate peft bitsandbytes einops
!pip install -U bitsandbytes

# 🚀 Imports
import torch, gc, pandas as pd, os, random
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)

# --- CONFIG ---
MODEL_NAME = "tiiuae/falcon-rw-1b"
DATASET_PATH = "/content/drive/MyDrive/train.csv"
OUTPUT_DIR = "./falconrw1b_finetuned_qlora"    # adapter + tokenizer are saved here
FINAL_OUTPUT_DIR = OUTPUT_DIR + "_quantized"   # merged model is saved here
MAX_LENGTH = 256                               # tokens per example after truncation/padding
EPOCHS = 3
BATCH_SIZE = 32
MAX_STEPS = 1000

# --- Memory Clean-up ---
torch.cuda.empty_cache()
gc.collect()

# --- Load + Prepare Dataset ---
print("📂 Loading and formatting dataset...")
try:
    # Read from the configured path instead of repeating the literal here.
    df = pd.read_csv(DATASET_PATH, encoding='latin1')
    # Keep rows that have both columns, capped at the first 5000.
    df = df[["text", "selected_text"]].dropna().reset_index(drop=True)[:5000]

    # Training string: "<text> => Selected Text: <selected_text> Score: <n>".
    # NOTE: <n> is a random integer in [70, 100], not a measured score, and
    # the random module is not seeded here, so it differs between runs.
    df["formatted"] = df.apply(
        lambda row: f"{row['text']} => Selected Text: {row['selected_text']} Score: {random.randint(70, 100)}",
        axis=1,
    )

    dataset = Dataset.from_pandas(df[["formatted"]].rename(columns={"formatted": "text"}))
    print(f"✅ Loaded and formatted {len(dataset)} samples using 'selected_text'")
except Exception as e:
    print(f"❌ Dataset load failed: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # EOS doubles as the padding token

# --- 4-bit Quantization Config ---
# NF4 4-bit weights, double quantization, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# --- Load Model & Apply QLoRA ---
print("🚀 Loading model with QLoRA...")
model_load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "trust_remote_code": True,
    "use_cache": False,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)
model = prepare_model_for_kbit_training(model)

# Rank-8 adapters attached to the "query_key_value" modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Tokenization ---
def tokenize(examples):
    """Encode ``examples["text"]`` with truncation and fixed-length padding (MAX_LENGTH)."""
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", max_length=MAX_LENGTH, truncation=True)


tokenized_dataset = dataset.map(tokenize, batched=True)
print("✅ Tokenization done")

# --- Training Args ---
# Training length is bounded by MAX_STEPS; no epoch count is passed here.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
    # The dataset still carries the raw "text" column; the collator below
    # picks out only the fields it needs.
    remove_unused_columns=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True}
)


# --- Trainer Setup ---
def _collate_causal_lm(features):
    """Build a causal-LM batch from tokenized examples.

    Args:
        features: list of dataset rows, each holding equal-length
            ``input_ids`` and ``attention_mask`` lists.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        ``labels`` copies ``input_ids`` but is set to -100 wherever
        ``attention_mask`` is 0.
    """
    input_ids = torch.stack([torch.tensor(f['input_ids']) for f in features])
    attention_mask = torch.stack([torch.tensor(f['attention_mask']) for f in features])
    # Every example is padded to MAX_LENGTH with the EOS token. Copying
    # input_ids verbatim would make the loss count all of that padding, so
    # padded positions get -100 (the index the loss is expected to ignore).
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=_collate_causal_lm
)

# --- Train Model ---
print("\n🚀 Training started...")
trainer.train()
print("✅ Training complete")

# --- Save QLoRA model ---
# The merge step below reloads the adapter from this directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# --- Merge & Save Final Quantized Model ---
print("\n🔄 Merging adapters...")
# Reload the base model without the 4-bit config so the adapter can be folded
# into real weights. float16 matches the dtype the inference pipeline requests
# and avoids holding/saving the weights in the loader's default precision.
# NOTE: despite the "_quantized" directory name, the merged checkpoint saved
# below is a regular (non-quantized) model.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
merged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
merged_model = merged_model.merge_and_unload()

print("💾 Saving final model...")
merged_model.save_pretrained(FINAL_OUTPUT_DIR)
tokenizer.save_pretrained(FINAL_OUTPUT_DIR)

# --- Inference ---
print("\n🧪 Testing model inference...")
try:
    # Load the merged checkpoint from disk onto the first GPU in float16.
    generator = pipeline(
        task="text-generation",
        model=FINAL_OUTPUT_DIR,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device="cuda:0",
    )

    # The prompt ends with the same "=>" separator used in the training strings.
    prompt = "Shanghai is exciting! =>"
    sampling_options = {
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
    }
    output = generator(prompt, **sampling_options)
    generated = output[0]["generated_text"]
    print("\n🎉 Output:\n", generated)

    # Parse score (optional): first integer following the word "Score".
    import re
    score_match = re.search(r"Score[:\s]+(\d+)", generated)
    if score_match is not None:
        print("✅ Extracted Humanization Score:", score_match.group(1))
except Exception as e:
    # Best-effort smoke test: report the failure without aborting the cell.
    print("❌ Inference failed:", e)



In [None]:
# Mount Google Drive at /content/drive; the training cells read their CSV
# from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 🚀 Install dependencies
!pip install -q transformers datasets accelerate peft bitsandbytes einops evaluate rouge_score
!pip install -U bitsandbytes

# 🚀 Imports
import torch, gc, os, random, pandas as pd, numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    set_seed
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel
)
import evaluate, re

# ✅ Set seed
SEED = 42
# Apply the same seed through each seeding entry point, in a fixed order.
for seed_fn in (set_seed, torch.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

# --- CONFIG ---
MODEL_NAME = "tiiuae/falcon-rw-1b"
DATASET_PATH = "/content/drive/MyDrive/train.csv"
OUTPUT_DIR = "./falconrw1b_finetuned_humanizer"
FINAL_OUTPUT_DIR = f"{OUTPUT_DIR}_merged"
MAX_LENGTH = 256
EPOCHS = 3
BATCH_SIZE = 32
MAX_STEPS = 1000

# Free cached CUDA memory and unreachable Python objects before loading.
torch.cuda.empty_cache()
gc.collect()

# --- Load Dataset ---
print("📂 Loading dataset...")


def _format_training_row(row):
    """Render one CSV row as prompt + source text + "=>" + rewrite + random score."""
    # Realistic Prompt Template
    return f"Rewrite this in a friendly and human way with emojis:\n{row['text']}\n=> {row['selected_text']} 😊 Score: {random.randint(70, 100)}"


try:
    df = pd.read_csv(DATASET_PATH, encoding='latin1')
    # Rows with both columns present, limited to the first 5000.
    complete_rows = df[["text", "selected_text"]].dropna().reset_index(drop=True)
    df = complete_rows.head(5000)

    df["formatted"] = df.apply(_format_training_row, axis=1)

    training_text = df[["formatted"]].rename(columns={"formatted": "text"})
    raw_dataset = Dataset.from_pandas(training_text)
    # 90/10 train/test split, seeded for repeatability.
    dataset = raw_dataset.train_test_split(test_size=0.1, seed=SEED)
    train_count = len(dataset['train'])
    test_count = len(dataset['test'])
    print(f"✅ Loaded {train_count} train + {test_count} test samples")
except Exception as e:
    print(f"❌ Dataset load failed: {e}")
    raise e

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# The end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# --- Quantization Config ---
# 4-bit NF4 with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# --- Load Model & Apply QLoRA ---
print("🚀 Loading model...")
quantized_base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_cache=False,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)
model = prepare_model_for_kbit_training(quantized_base)

# LoRA hyper-parameters: rank 8, alpha 32, dropout 0.05, no bias terms,
# applied to the "query_key_value" modules.
lora_settings = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": ["query_key_value"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**lora_settings)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# --- Tokenization ---
def tokenize(examples):
    """Return tokenizer output for ``examples["text"]``, cut and padded to MAX_LENGTH."""
    fixed_length_options = dict(
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    return tokenizer(examples["text"], **fixed_length_options)


# Applied to every split of the DatasetDict, many rows per call.
tokenized_dataset = dataset.map(tokenize, batched=True)

# --- Metrics
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE-L between the model's predicted tokens and the labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits (one more axis than ``labels``), a tuple whose first item
            is the logits, or token ids that are already integer-valued.

    Returns:
        dict with a single ``"rougeL"`` entry taken from the ROUGE result.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # batch_decode needs integer token ids. Raw logits carry an extra
    # vocabulary axis, which is collapsed to the most likely token here.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    if labels.ndim == 2 and predictions.shape == labels.shape and labels.shape[1] > 1:
        # For a causal LM the output at position t is the guess for token
        # t + 1, so drop the last prediction and the first label to align.
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]
        # Predictions at ignored/padded label positions are meaningless;
        # blank them so they do not leak into the decoded text.
        ignored = (labels == -100) | (labels == pad_id)
        predictions = np.where(ignored, pad_id, predictions)

    # Replace -100s in labels as we can't decode them
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {"rougeL": result["rougeL"]}

# --- Training Args ---
# Training length is bounded by MAX_STEPS; no epoch count is passed here.
# NOTE(review): no evaluation schedule is configured in these arguments, so
# eval_dataset / compute_metrics appear to be used only if trainer.evaluate()
# is called explicitly — confirm this is intended.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    max_steps=MAX_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
    max_grad_norm=1.0,
    # The dataset still carries the raw "text" column; the collator below
    # picks out only the fields it needs.
    remove_unused_columns=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    # Trainer warns that it cannot infer label names for a PEFT-wrapped
    # model and falls back to an empty list; name the label field explicitly.
    label_names=["labels"]
)


# --- Trainer ---
def _collate_causal_lm(features):
    """Build a causal-LM batch from tokenized examples.

    Args:
        features: list of dataset rows, each holding equal-length
            ``input_ids`` and ``attention_mask`` lists.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        ``labels`` copies ``input_ids`` but is set to -100 wherever
        ``attention_mask`` is 0.
    """
    input_ids = torch.stack([torch.tensor(f['input_ids']) for f in features])
    attention_mask = torch.stack([torch.tensor(f['attention_mask']) for f in features])
    # Every example is padded to MAX_LENGTH with the EOS token. Copying
    # input_ids verbatim would make the loss count all of that padding, so
    # padded positions get -100 (the index the loss is expected to ignore).
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=_collate_causal_lm
)

# --- Train Model ---
print("\n🚀 Training...")
trainer.train()
print("✅ Training complete")

# --- Save LoRA model ---
# The merge step below reloads the adapter from this directory.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# --- Merge & Save Final Model ---
print("\n🔄 Merging adapters...")
# Reload the base model without the 4-bit config so the adapter can be folded
# into real weights. float16 matches the dtype the inference pipeline requests
# and avoids holding/saving the weights in the loader's default precision.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
merged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
merged_model = merged_model.merge_and_unload()
print("💾 Saving final model...")
merged_model.save_pretrained(FINAL_OUTPUT_DIR)
tokenizer.save_pretrained(FINAL_OUTPUT_DIR)

# --- Inference ---
print("\n🧪 Testing humanized output + score...")
try:
    # Load the merged checkpoint from disk onto the first GPU in float16.
    generator = pipeline(
        "text-generation",
        model=FINAL_OUTPUT_DIR,
        tokenizer=tokenizer,
        device="cuda:0",
        torch_dtype=torch.float16
    )
    # The training strings are "<instruction>\n<text>\n=> <rewrite> ...".
    # The prompt must end with the same "\n=>" separator, otherwise the model
    # is asked to continue the source text rather than produce the rewrite.
    prompt = "Rewrite this in a friendly and human way with emojis:\nThe food delivery was late and cold.\n=>"
    output = generator(
        prompt,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    gen_text = output[0]["generated_text"]
    print("\n🎉 Humanized Output:\n", gen_text)

    # Pull out the first integer that follows the word "Score", if any.
    score_match = re.search(r"Score[:\s]+(\d+)", gen_text)
    if score_match:
        print("✅ Extracted Humanization Score:", score_match.group(1))
    else:
        print("ℹ️ No score found — consider training a separate scorer.")
except Exception as e:
    # Best-effort smoke test: report the failure without aborting the cell.
    print("❌ Inference failed:", e)

📂 Loading dataset...
✅ Loaded 4500 train + 500 test samples
🚀 Loading model...


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


trainable params: 1,572,864 || all params: 1,313,198,080 || trainable%: 0.1198


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.



🚀 Training...


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a