In [7]:
import pandas as pd

# Location of the MASSW train parquet file, addressed through the hf:// protocol.
massw_train_url = "hf://datasets/jimmyzxj/massw/massw_data/train-00000-of-00001-30d85c6bc506170b.parquet"
df = pd.read_parquet(massw_train_url)

# Preview the first rows to check what was loaded.
preview = df.head()
print(preview)


                         id  \
0  5dd50ed43a55ac51376178d1   
1  5dd6604a3a55ac78684acf19   
2  5e539eca3a55ac4db70a52b7   
3  5e539eca3a55ac4db70a52d0   
4  5e5794b791e0115453751069   

                                             context  \
0  Recent advances in deep learning have focused ...   
1  Current geolocalisation approaches require ima...   
2  Formulating efficient SQL queries is a challen...   
3  Two-sample tests are utilized to determine if ...   
4  Bandit learning algorithms typically balance e...   

                                            key_idea  \
0  The authors propose a metric, based on the Fis...   
1  The authors propose a novel approach to geoloc...   
2  The authors propose a new approach for predict...   
3  The authors suggest a new kernel-based two-sam...   
4  This paper proposes simple greedy algorithms f...   

                                              method  \
0  The authors provide a theoretical analysis inc...   
1  The model uses a sequenc

In [8]:
# Combine fields into a single prompt-response format with projected impact
def format_data_with_impact(row):
    """Render one dataset row as a single prompt/response training example.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'context', 'key_idea', 'method', 'outcome' and 'projected_impact'.

    Returns:
        str: The filled-in template. It starts with a newline and every line
        keeps the template's four-space indent, exactly as written below.
        A missing or empty 'projected_impact' is rendered as 'Unknown'.
    """
    impact = row['projected_impact']
    # Missing values can arrive as None, NaN or pd.NA. NaN is truthy (it would
    # be rendered as "nan") and pd.NA raises on truth-testing, so missing
    # scalars are checked explicitly before the plain emptiness test.
    if (pd.api.types.is_scalar(impact) and pd.isna(impact)) or not impact:
        impact = 'Unknown'
    return f"""
    Context: {row['context']}
    Key Idea: {row['key_idea']}
    Method: {row['method']}
    Outcome: {row['outcome']}
    Question: Based on the above information, what is the projected impact?
    Projected Impact: {impact}
    """

# Build the formatted text for every row (row-wise apply) and store it on the frame.
formatted_examples = df.apply(format_data_with_impact, axis=1)
df["formatted"] = formatted_examples

# Write the formatted column out for training, without index or header.
df["formatted"].to_csv(
    "training_data_with_impact.txt",
    index=False,
    header=False,
)


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the paths
import os

# Path to your model folder. The original location stays the default; set the
# MASSW_MODEL_DIR environment variable to run on another machine without
# editing this cell.
model_dir = os.environ.get(
    "MASSW_MODEL_DIR",
    "/Users/main/Documents/Data for palantir/Massw/models/original",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Load the model and map it to the CPU
try:
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint file cannot run arbitrary code while loading.
    state_dict = torch.load(
        f"{model_dir}/consolidated.00.pth",
        map_location=torch.device("cpu"),
        weights_only=True,
    )
    model = AutoModelForCausalLM.from_pretrained(model_dir, state_dict=state_dict)
except Exception as e:
    # Best-effort load: report the problem and leave `model` as None so the
    # smoke test below is skipped instead of failing on a missing model.
    print(f"Error loading model: {e}")
    model = None

# Compare against None explicitly instead of relying on the truthiness of the
# model object.
if model is not None:
    # Test the model with a prompt
    prompt = "Explain advancements in AI."
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate a response
    try:
        outputs = model.generate(**inputs, max_length=100)
        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
    except Exception as e:
        print(f"Error generating response: {e}")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this w

: 

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, PeftModel, PeftModelForCausalLM

# Load the quantized model
# NOTE(review): bnb_config is built here but is not passed to any model load
# in this cell; it is kept in case another cell uses it.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_quant_type="nf4",
)

import torch
from llama_cpp import Llama
from peft import get_peft_model

model_path = "./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # Update with your model file path
# NOTE(review): `llm` is not used by the LoRA training below, which operates
# on `model`; it is kept in case another cell uses it.
llm = Llama(model_path=model_path)

# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # Fine-tune attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare the model with LoRA
# `model` is expected to come from the model-loading cell, which sets it to
# None when loading fails; stop here with a clear message in that case.
if model is None:
    raise RuntimeError(
        "Base model is not loaded; run the model-loading cell successfully before fine-tuning."
    )
# get_peft_model() attaches fresh adapters described by a config to a base
# model. PeftModel.from_pretrained() expects the location of an already-saved
# adapter as its second argument, not a LoraConfig.
model = get_peft_model(model, lora_config)

# `tokenized_datasets` is not defined in this cell and must be created before
# it runs. Assumes it is a mapping of split name -> tokenized dataset, as the
# ["train"] lookup suggests — TODO confirm.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None

# Define training arguments
training_args = TrainingArguments(
    output_dir="./llama_finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=1000,
    logging_dir="./logs",
    logging_steps=100,
    # Step-wise evaluation needs an eval dataset; disable it when no
    # validation split is available so training does not fail.
    # `eval_strategy` is the current name of `evaluation_strategy`.
    eval_strategy="steps" if eval_dataset is not None else "no",
    eval_steps=500,
    save_total_limit=2,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # when none is available (e.g. a CPU-only run).
    fp16=torch.cuda.is_available(),
)

# Fine-tune the model
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Save the LoRA fine-tuned model
model.save_pretrained("./llama_finetuned")
tokenizer.save_pretrained("./llama_finetuned")

# Load the fine-tuned model for inference
from peft import AutoPeftModelForCausalLM
from transformers import pipeline

# PeftModelForCausalLM.from_pretrained() requires an already-loaded base model
# as its first argument. AutoPeftModelForCausalLM takes only the adapter
# directory and loads the base model named in the saved adapter config.
fine_tuned_model = AutoPeftModelForCausalLM.from_pretrained("./llama_finetuned")
pipe = pipeline("text-generation", model=fine_tuned_model, tokenizer=tokenizer)

# Test inference
prompt = "Context: Advances in AI have focused on..."
result = pipe(prompt, max_length=100)
print(result[0]['generated_text'])
