In [None]:
from helper.grobid_utils import process_pdfs_in_directory, grobid_process_pdf

# Process a directory of PDFs.
# The first argument is the folder that is read ("data/pdfs") and the second is
# where results are written ("data/xml_output") — presumably GROBID TEI XML,
# judging by the module name; TODO confirm in helper.grobid_utils.
# NOTE(review): `grobid_process_pdf` is imported above but never used in this cell.
process_pdfs_in_directory("data/pdfs", "data/xml_output")


In [None]:
from helper.tei_utils import process_tei_xml_files, process_body_xml_to_plain_text

# Post-process the TEI XML, then convert the extracted body XML to plain text.
# Both helpers are called without arguments, so every input/output location is
# decided inside helper.tei_utils — TODO confirm they read from the folder the
# PDF-processing step wrote to ("data/xml_output").
# NOTE(review): the second call presumably consumes the output of the first, so
# keep this order unless helper.tei_utils says otherwise.
process_tei_xml_files()
process_body_xml_to_plain_text()

In [None]:
# Generate question/answer pairs from the chunk file via the given API endpoint.
# The path and the endpoint must be string literals: unquoted, `data/chuncks.json`
# parses as the expression `data / chuncks.json` and `localhost` as a bare name,
# and both raise NameError.
# NOTE(review): `generator` is not defined or imported in any visible cell —
# confirm where it comes from before running this cell.
# NOTE(review): "chuncks" looks like a typo for "chunks" and "localhost" may need
# a scheme/port — both are kept as written; verify against the real file name
# and server address.
generator.generateQA("data/chuncks.json", apiendpoint="localhost")

In [None]:
from helper.qag_utils import process_all
import pandas as pd

# Path to the folder containing the JSON input files.
data_directory = "data/json/"

# Run the full pipeline. The result is used below like a pandas DataFrame
# (.head(), .describe(), .to_csv()) — presumably that is what process_all returns.
df = process_all(data_directory)

# Preview results. Both summaries are printed explicitly: a notebook only
# displays the value of a cell's *last* expression, so a bare `df.describe()`
# in the middle of the cell would be computed and silently thrown away.
print(df.head())
print(df.describe())

# Save to CSV (without the index column) in the current working directory.
df.to_csv("generated_qa.csv", index=False)

In [None]:
from finetune_utils import (
    login_to_huggingface, load_model_and_tokenizer, add_lora_adapters,
    prepare_dataset, get_training_args, run_training, save_and_push_model
)

# Setup: authenticate with the Hugging Face Hub first; the final step pushes to
# `hf_repo`, and the base model may be gated.
# NOTE(review): how credentials are obtained is decided inside finetune_utils — confirm.
login_to_huggingface()

# Placeholder values — replace every one of these before running the cell.
model_name = "your-model-name"  # e.g., "meta-llama/Llama-3.2-1B-Instruct"
save_path = "./your/local/save/dir"  # passed to load_model_and_tokenizer; presumably a local cache/save dir — confirm
csv_path = "./your-dataset.csv"  # training data; presumably the QA CSV produced earlier — confirm
output_dir = "./your/output/dir"  # passed to get_training_args; also the prefix of the merged-model dir below
hf_repo = "your-org/your-model-repo"  # Hub repository the merged model is pushed to

# Load model/tokenizer (with quantization enabled), apply LoRA, prepare data.
# Order matters: the adapters are added to the loaded model, and the dataset is
# prepared with the tokenizer returned by the same load call.
model, tokenizer = load_model_and_tokenizer(model_name, save_path, use_quantization=True)
model = add_lora_adapters(model)
train_dataset, eval_dataset = prepare_dataset(csv_path, tokenizer)
training_args = get_training_args(output_dir)

# Train. The returned object exposes the trained model as `trainer.model`.
trainer = run_training(model, tokenizer, train_dataset, eval_dataset, training_args)

# Save merged model & push to hub. The merged weights go to a sibling directory
# named "<output_dir>-merged", so they do not overwrite `output_dir` itself.
save_and_push_model(trainer.model, tokenizer, merged_dir=output_dir + "-merged", hf_repo=hf_repo)

In [None]:
from finetune_utils import convert_to_gguf

# After training and merging: convert the merged model directory to GGUF.
# NOTE(review): these paths are placeholders and do not match the
# `output_dir + "-merged"` location used by the fine-tuning cell — point
# `merged_model_path` at the directory that cell actually wrote.
merged_model_path = "./models/YourProject/Llama-1B-Merged"
gguf_output_path = "./models/YourProject/gguf-out"

# requires llama-cpp (presumably the llama.cpp conversion tooling — confirm
# which package/binary finetune_utils.convert_to_gguf invokes)
convert_to_gguf(
    input_dir=merged_model_path,
    output_dir=gguf_output_path,
    dtype="q8_0"  # presumably 8-bit quantized GGUF output — confirm accepted values
)


In [None]:
from openai import OpenAI
from helper_functions import (
    load_and_prepare_data,
    generate_model_responses,
    evaluate_scores,
    visualize_scores,
    calculate_improvements,
    generate_wordclouds,
)
from IPython.display import display

# Paths and setup: `test_data_path` is the input CSV, and `output_file` is handed
# to both the loading and the generation step (presumably where the sample with
# generated answers is written — confirm in helper_functions).
test_data_path = "questions_answers_comparisons_updated.csv"
output_file = "test_data_sample_with_generated_answers.csv"

# Initialize OpenAI clients. Both point at servers on the local machine; the
# api_key "lm-studio" is a placeholder value, not a real secret.
# NOTE(review): the direct client's base_url ends in "/v1" while the RAG
# client's does not — confirm the RAG server really serves its endpoints at the root.
direct_client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
rag_client = OpenAI(base_url="http://127.0.0.1:5000", api_key="lm-studio")

# Model configurations: each entry appears to be (display label, model
# identifier sent to the server, client to send it through).
# NOTE(review): "guff" in the med-llama identifiers looks like a typo for
# "gguf", but these strings must match the names the servers expose — verify
# before changing them.
models = [
    ("Direct: meta-llama", "meta-llama-3.1-8b-instruct", direct_client),
    ("Direct: med-llama", "med-llama-3.1-8b-instruct-guff", direct_client),
    ("RAG: meta-llama", "meta-llama-3.1-8b-instruct@q8_0", rag_client),
    ("RAG: med-llama", "med-llama-3.1-8b-instruct-guff@q8_0", rag_client),
]

# Step 1: Load and prepare data
test_data_sample = load_and_prepare_data(test_data_path, output_file)

# Step 2: Generate responses for every configured model. The name
# `test_data_sample` is rebound to the result, so re-running only this step
# feeds it its own previous output.
test_data_sample = generate_model_responses(test_data_sample, models, output_file)

# Step 3: Evaluate scores
results = evaluate_scores(test_data_sample, models)

# Step 4: Visualize scores
visualize_scores(results)

# Step 5: Calculate and display improvements.
# baseline_index=0 presumably selects the first `models` entry
# ("Direct: meta-llama") as the baseline — confirm in helper_functions.
improvement_table = calculate_improvements(results, baseline_index=0)
display(improvement_table)

# Step 6: Word cloud visualization
generate_wordclouds(test_data_sample, models)