In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM,BitsAndBytesConfig,TrainingArguments

In [None]:
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

In [None]:
import os

In [None]:
# SECURITY: never paste an access token into a notebook cell -- it ends up in
# version control, shared copies and exported HTML. Any token that was
# previously hard-coded here must be treated as leaked and revoked in the
# Hugging Face account settings.
import getpass

# Prefer a token that is already exported in the environment (either the
# lower-case name this notebook uses or the upper-case `HF_TOKEN` name);
# otherwise ask for it interactively without echoing it into the output.
_hf_token = os.environ.get("hf_token") or os.environ.get("HF_TOKEN")
if not _hf_token:
    _hf_token = getpass.getpass("Hugging Face token: ")

# Keep the same variable name as before so any later cell that reads
# os.environ["hf_token"] keeps working.
os.environ["hf_token"] = _hf_token

# Do not leave the secret lying around as a notebook global.
del _hf_token

In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned.
model_name = "microsoft/phi-2"

# Tokenizer: loaded from the same checkpoint; the end-of-sequence token is
# reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Quantisation settings: 4-bit loading with the "nf4" quant type, float16 as
# the compute dtype and double quantisation switched on.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Base model, loaded with the quantisation config above; device placement is
# delegated to the loader via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)


In [None]:
# LoRA hyper-parameters, collected in one place so they are easy to tune.
_lora_settings = {
    "task_type": "CAUSAL_LM",
    "target_modules": ["q_proj", "v_proj"],  # only these projections get adapters
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
}
lora_config = LoraConfig(**_lora_settings)

# Wrap the loaded base model with the adapter configuration.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell wraps the already-wrapped model again -- re-run the loading cell first.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
import json

# Pre-filter the raw training file so that the cleaned copy contains only
# rows that are well-formed JSON *objects*, one per line.
#
# Outputs left in the notebook namespace:
#   cleaned  -- list of kept rows (raw text, each terminated by one "\n")
#   bad_rows -- number of rows that were dropped
cleaned = []
bad_rows = 0

with open("/content/train_sampled.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        # Blank lines carry no record; skip them without reporting them as
        # broken data.
        if not line.strip():
            continue

        try:
            record = json.loads(line)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested rows. Anything else is a real bug and
            # should surface instead of being swallowed.
            record = None

        # A row such as `123`, `null` or `[...]` parses fine but is not a
        # record with named fields, so it is rejected as well.
        if not isinstance(record, dict):
            bad_rows += 1
            if bad_rows <= 5:  # only show the first few offenders
                print(f"Dropped broken row {i}")
            continue

        # Normalise the terminator so a final line without a trailing newline
        # cannot run into whatever is written after it.
        cleaned.append(line.rstrip("\r\n") + "\n")

# Always report the totals; the per-row messages above are capped at five.
print(f"Kept {len(cleaned)} rows, dropped {bad_rows} broken rows")

with open("/content/train_sampled_clean.jsonl", "w", encoding="utf-8") as f:
    f.writelines(cleaned)

In [None]:
# One JSON-lines file per split.
# NOTE(review): the train path is absolute while the validation path is
# relative, and the validation file is used as-is (no cleaned copy of it is
# produced in this notebook) -- confirm both are intended.
_split_files = {
    "train": "/content/train_sampled_clean.jsonl",
    "validation": "val.jsonl",
}

# Load both splits through the generic "json" dataset builder.
dataset = load_dataset("json", data_files=_split_files)

In [None]:
def format_prompt(example):
    """Render one record as a three-section training prompt.

    Args:
        example: mapping that provides the keys ``instruction``, ``input``
            and ``output``.

    Returns:
        A string made of an "### Instruction:", "### Input:" and
        "### Response:" section, each header followed by the matching field
        on the next line, sections separated by a blank line, and a single
        trailing newline.

    Raises:
        KeyError: if one of the three keys is missing from ``example``.
    """
    sections = (
        ("### Instruction:", example['instruction']),
        ("### Input:", example['input']),
        ("### Response:", example['output']),
    )
    rendered = [f"{header}\n{body}" for header, body in sections]
    return "\n\n".join(rendered) + "\n"

def _to_text(x):
    """Build the single ``text`` field for one record from the prompt template."""
    return {"text": format_prompt(x)}


# Replace the original columns with the rendered prompt: the column names are
# taken from the train split and passed as the columns to remove.
# NOTE(review): `dataset` is overwritten, so re-running this cell operates on
# the already-mapped data -- re-run the loading cell first.
_source_columns = dataset["train"].column_names
dataset = dataset.map(_to_text, remove_columns=_source_columns)


In [None]:
# Training hyper-parameters, grouped by concern.
_training_settings = {
    # where checkpoints are written, and how many are kept
    "output_dir": "./indian_news_llm",
    "save_steps": 500,
    "save_total_limit": 2,
    # optimisation: batch of 1 with 16 accumulation steps, one epoch
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 2e-4,
    "num_train_epochs": 1,
    "fp16": True,
    # logging: every 50 steps, no external reporting integration
    "logging_steps": 50,
    "report_to": "none",
}
training_args = TrainingArguments(**_training_settings)


In [None]:
import transformers
import trl

# Print the installed library versions, one "<label> <version>" line each.
for _label, _module in (("trl_version", trl), ("transformers_version", transformers)):
    print(_label, _module.__version__)



In [None]:
# Supervised fine-tuning trainer: the adapter-wrapped model, the training
# arguments, the train/validation splits and the tokenizer (passed as the
# processing class).
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
)

# Actually run the fine-tuning loop. Without this call the notebook only
# constructs the trainer, and anything saved afterwards would be the freshly
# initialised adapter rather than a trained one.
trainer.train()

In [None]:
# Persist the adapter weights and the tokenizer side by side in one directory.
# Run this only once training has completed; saving earlier stores whatever
# weights the model currently holds.
_adapter_dir = "indian_news_lora_adapter"
trainer.model.save_pretrained(_adapter_dir)
tokenizer.save_pretrained(_adapter_dir)