In [15]:
!pip install transformers datasets peft bitsandbytes accelerate trl



# PHI-2

In [None]:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# Load and preprocess your data
def load_custom_dataset(path):
    """Read a nested QA JSON file and flatten it into prompt/answer pairs.

    The file is expected to hold a top-level ``"data"`` mapping. Each value has
    a ``"paragraphs"`` mapping, and each paragraph has an optional ``"context"``
    string plus a ``"qas"`` mapping. Each QA entry has a ``"question"`` string
    and an ``"answers"`` mapping whose ``"0"`` entry carries the answer ``"text"``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``Dataset`` with one row per answered question and two columns:
        ``"input"`` (``"<context>\\nQuestion: <question>\\nAnswer:"``) and
        ``"output"`` (the text of the first answer).

    Notes:
        QA entries without a first answer are skipped instead of raising
        ``KeyError``; only the answer stored under key ``"0"`` is used.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters in the contexts do not
    # depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    samples = []
    for item in data['data'].values():
        for para in item['paragraphs'].values():
            context = para.get('context', '')
            for qa in para['qas'].values():
                first_answer = (qa.get('answers') or {}).get('0')
                if not first_answer or 'text' not in first_answer:
                    # Nothing to supervise on for this question.
                    continue
                input_text = f"{context}\nQuestion: {qa['question']}\nAnswer:"
                samples.append({'input': input_text, 'output': first_answer['text']})
    return Dataset.from_list(samples)

# Flatten the local QA JSON file into prompt/answer rows.
qa_json_path = 'check_hea_qa_dataset.json'
dataset = load_custom_dataset(qa_json_path)

# Load tokenizer and quantized model
model_id = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the EOS token as the padding token so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 weights with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True
)

# Prepare the quantized model for k-bit training before adapters are attached.
base_model = prepare_model_for_kbit_training(base_model)

# Apply LoRA
# NOTE(review): check these names against model.named_modules(). Presumably the
# Phi-2 attention output projection is named "dense" rather than "o_proj" in the
# transformers implementation, in which case "o_proj" matches no module — TODO confirm.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Tokenize dataset
def tokenize(sample, max_prompt_length=512, max_answer_length=64):
    """Turn one prompt/answer row into a fixed-length causal-LM training example.

    The prompt and the answer are encoded separately, then joined into a single
    sequence ``prompt + " " + answer + EOS`` so the answer is part of what the
    model is trained to produce. Previously the answer only lived in a separate
    64-token ``labels`` list that did not line up with the 512-token
    ``input_ids``.

    Args:
        sample: Mapping with ``"input"`` (prompt text) and ``"output"`` (answer text).
        max_prompt_length: Maximum number of prompt tokens kept (right-truncated).
        max_answer_length: Maximum number of answer tokens, including the EOS.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of length ``max_prompt_length + max_answer_length``. ``labels`` is -100
        on prompt and padding positions and equals ``input_ids`` on the
        answer + EOS positions.

    Notes:
        A collator that rebuilds labels from ``input_ids`` (such as
        ``DataCollatorForLanguageModeling(mlm=False)``) overrides the ``labels``
        produced here; the answer tokens are still in ``input_ids`` and are
        therefore still learned.
    """
    prompt_ids = list(
        tokenizer(sample["input"], truncation=True, max_length=max_prompt_length)["input_ids"]
    )
    # Reserve one slot for EOS so the example always ends with a stop token.
    answer_ids = list(
        tokenizer(
            " " + sample["output"],
            truncation=True,
            max_length=max_answer_length - 1,
            add_special_tokens=False,
        )["input_ids"]
    )
    answer_ids.append(tokenizer.eos_token_id)

    total_length = max_prompt_length + max_answer_length
    n_real = len(prompt_ids) + len(answer_ids)
    n_pad = total_length - n_real

    return {
        "input_ids": prompt_ids + answer_ids + [tokenizer.pad_token_id] * n_pad,
        # The mask, not the token id, marks padding: the pad id may equal EOS.
        "attention_mask": [1] * n_real + [0] * n_pad,
        "labels": [-100] * len(prompt_ids) + answer_ids + [-100] * n_pad,
    }

# Tokenize every row and drop the raw text columns afterwards.
raw_text_columns = ["input", "output"]
tokenized_dataset = dataset.map(tokenize, remove_columns=raw_text_columns)

# Training arguments
training_args = TrainingArguments(
    output_dir="./phi2-lora-finetune",
    # schedule and batching
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # optimizer
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    # mixed precision: fp16 on, bf16 off
    fp16=True,
    bf16=False,
    # logging and checkpointing
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

# mlm=False selects causal-LM collation. NOTE(review): this collator derives the
# batch labels from input_ids, so a "labels" column produced during tokenization
# is not what the loss is computed against — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# 🚀 Train the model
trainer.train()

# ✅ Save final model
final_model_dir = "./phi2-lora-finetuned"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
# 🚀 Train the model
# The previous cell already calls trainer.train(); running it again
# unconditionally would train for a second epoch on Run All. Only train here
# when this trainer has not taken any optimizer step yet.
if trainer.state.global_step == 0:
    trainer.train()

# ✅ Save final model
model.save_pretrained("./phi2-lora-finetuned")
tokenizer.save_pretrained("./phi2-lora-finetuned")

## Inference

In [None]:
import time
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

In [5]:
# Reload the tokenizer by hub id for the inference section.
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)
# Padding reuses the EOS token, as in the training section.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
from transformers import pipeline

# The timer spans pipeline construction (model loading) as well as generation.
start_time = time.time()

# Load the fine-tuned output directory through a text-generation pipeline.
pipe = pipeline("text-generation", model="./phi2-lora-finetuned", tokenizer=tokenizer, device_map="auto")
context = "A new refractory alloy, Nb20Cr20Mo10Ta10Ti20Zr20, was produced by vacuum arc melting. To close shrinkage porosity, it was hot isostatically pressed (HIPd) at T=1723K and P=207MPa for 3h. In both as-solidified and HIPd conditions, the alloy contained three phases: two body centered cubic (BCC1 and BCC2) and one face centered cubic (FCC). The BCC1 phase was enriched with Nb, Mo and Ta and depleted with Zr and Cr, and its lattice parameter after HIP was a=324.76±0.16pm. The BCC2 phase was enriched with Zr and Ti and considerably depleted with Mo, Cr and Ta, and its lattice parameter after HIP was estimated to be a=341.0±1.0pm. The FCC phase was highly enriched with Cr and it was identified as a Laves C15 phase, (Zr,Ta)(Cr,Mo,Nb)2, with the lattice parameter a=733.38±0.18pm. The volume fractions of the BCC1, BCC2 and FCC phases were 67%, 16% and 17%, respectively. The alloy density and Vickers microhardness were ρ=8.23±0.01g/cm3 and Hv=5288±71MPa. The alloy had compression yield strength of 1595MPa at 296K, 983MPa at 1073K, 546MPa at 1273K and 171MPa at 1473K. During deformation at 296K and 1073K, the alloy showed a mixture of ductile and brittle fracture after plastic compression strain of ∼5-6%. No macroscopic fracture was observed after 50% compression strain at 1273K and 1473K. Phase transformations and particle coarsening considerably accelerated by the plastic deformation occurred in the temperature range of 1073-1473K. © 2011 Elsevier B.V."
# Alternative question to try:
# question = "What is the Vickers microhardness of the Nb20Cr20Mo10Ta10Ti20Zr20 alloy?"
question = "What is the density of the Nb20Cr20Mo10Ta10Ti20Zr20 alloy?"
# Use exactly the template load_custom_dataset builds for training
# ("<context>\nQuestion: <question>\nAnswer:"), with no extra "." after the context.
prompt = f"{context}\nQuestion: {question}\nAnswer:"
# Greedy decoding: with do_sample=False no sampling temperature is passed.
result = pipe(
    prompt,
    max_new_tokens=50,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False,
)
generated = result[0]['generated_text'].strip()
# Only the first generated line is shown as the answer.
print(generated.split("\n")[0])
print("--- %s seconds ---" % (time.time() - start_time))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


The density of the Nb20Cr20Mo10Ta10Ti20Zr20 alloy is 8.23±0.01g/cm3.
--- 97.04499578475952 seconds ---


In [None]:
# Print the dotted path of every submodule, one per line (handy for choosing
# LoRA target_modules).
print(*(module_path for module_path, _ in model.named_modules()), sep="\n")


base_model
base_model.model
base_model.model.model
base_model.model.model.embed_tokens
base_model.model.model.embed_dropout
base_model.model.model.layers
base_model.model.model.layers.0
base_model.model.model.layers.0.self_attn
base_model.model.model.layers.0.self_attn.q_proj
base_model.model.model.layers.0.self_attn.q_proj.base_layer
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout
base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default
base_model.model.model.layers.0.self_attn.q_proj.lora_A
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default
base_model.model.model.layers.0.self_attn.q_proj.lora_B
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_A
base_model.model.model.layers.0.self_attn.q_proj.lora_embedding_B
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector
base_model.model.model.layers.0.self_attn.k_proj
base_model.model.model.layers.0.self_a