In [1]:
pip install pandas datasets unsloth torch transformers requests cupy-cuda12x

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
import requests
import json

# DeepSeek API setup
DEEPSEEK_API_KEY = "sk-732ea226899242339e5d25944abbafd7"
DEEPSEEK_URL = "https://api.deepseek.com/v1/chat/completions"

# Load dataset from Hugging Face
dataset = load_dataset("medalpaca/medical_meadow_medqa")
df = dataset["train"].to_pandas()

# Verify column names
print(df.columns)  # Debugging step

# Correct column names
HEALTH_TOPICS = ["neonatal", "pregnancy", "obgyn", "baby", "birth", "fetal", "labor", "ectopic", "preeclampsia", "gestation"]
filtered_df = df[df["input"].str.lower().apply(lambda x: any(topic in x for topic in HEALTH_TOPICS))]

# Generate synthetic data using DeepSeek API
def generate_synthetic_qa(prompt):
    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200
    }
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}
    response = requests.post(DEEPSEEK_URL, json=payload, headers=headers).json()

    print(response)  # Debug API response

    if "choices" in response:
        return response["choices"][0]["message"]["content"]
    else:
        return "Error: Invalid API Response"

synthetic_data = []
prompts = [
    "Generate a question and answer about neonatal jaundice treatment.",
    "Generate a Q&A pair about managing preeclampsia in pregnancy.",
    "Generate a question and answer about postpartum hemorrhage."
]

for prompt in prompts:
    response = generate_synthetic_qa(prompt)
    lines = response.split("\n")
    if len(lines) >= 2:
        q, a = lines[0], lines[1]
        synthetic_data.append({"input": q.strip(), "output": a.strip()})

# Combine datasets
synthetic_df = pd.DataFrame(synthetic_data)
combined_df = pd.concat([filtered_df[["input", "output"]], synthetic_df], ignore_index=True)

# Save to JSON
combined_df.to_json("health_data.json", orient="records")
dataset = Dataset.from_pandas(combined_df)
print(f"Total entries: {len(combined_df)}")


Index(['input', 'instruction', 'output'], dtype='object')
{'id': 'de60f922-87f2-4c0c-94f6-ee0a7a2bc4b5', 'object': 'chat.completion', 'created': 1743182523, 'model': 'deepseek-chat', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "**Question:**  \nWhat are the common treatment options for neonatal jaundice, and when is phototherapy typically recommended?  \n\n**Answer:**  \nThe most common treatment for neonatal jaundice is **phototherapy**, which uses blue-spectrum light to break down bilirubin in the baby's skin, making it easier for the body to eliminate. Phototherapy is usually recommended when bilirubin levels approach or exceed established thresholds based on the infant’s age (in hours), gestational age, and risk factors.  \n\nIn severe cases, **exchange transfusion** may be required if phototherapy fails to lower bilirubin levels sufficiently. Other supportive measures include ensuring adequate feeding (breastmilk or formula) to promote bilirubin excretion.

In [2]:
pip install cupy-cuda12x

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
from unsloth import FastLanguageModel
from datasets import load_dataset


# Load the model
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B"  # Or another DeepSeek model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,  # Auto-detect
    load_in_4bit=True  # For efficiency
)

# Load the combined dataset
dataset = load_dataset("json", data_files="health_data.json")["train"]

# Define a prompt template
prompt_template = """### Instruction: You are Sharma, a medical expert specializing in neonatal, pregnancy, and OB-GYN cases. Respond only to health queries in these areas. For non-relevant queries, say: "I’m specialized in neonatal, pregnancy, and OB-GYN cases only. Please ask a related question."
Question: {question}
Answer: {answer}"""

# Format dataset
def format_prompt(example):
    return {"text": prompt_template.format(question=example["question"], answer=example["answer"])}

dataset = dataset.map(format_prompt)

# Fine-tune with LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0
)

trainer = FastLanguageModel.Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args={
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "max_steps": 500,  # Adjust based on dataset size
        "learning_rate": 2e-4,
        "fp16": True,  # Mixed precision for speed
        "logging_steps": 10
    }
)
trainer.train()

# Save the fine-tuned model
model.save_pretrained("sharma-health-model")
tokenizer.save_pretrained("sharma-health-model")

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

In [3]:
import pandas as pd
import cupy as cp  # CuPy for CUDA acceleration
from datasets import load_dataset, Dataset
import requests
import json

# DeepSeek API setup
DEEPSEEK_API_KEY = "sk-732ea226899242339e5d25944abbafd7"
DEEPSEEK_URL = "https://api.deepseek.com/v1/chat/completions"

# Load dataset from Hugging Face
dataset = load_dataset("medalpaca/medical_meadow_medqa")
df = dataset["train"].to_pandas()

# Verify column names
print(df.columns)  # Debugging step

# Move text data to GPU for faster processing
df_gpu = {col: cp.array(df[col].astype(str)) for col in df.columns}

# Define relevant health topics
HEALTH_TOPICS = ["neonatal", "pregnancy", "obgyn", "baby", "birth", "fetal", "labor", "ectopic", "preeclampsia", "gestation"]

# GPU-accelerated filtering
input_texts_gpu = df_gpu["input"]
mask_gpu = cp.array([any(topic in text.lower() for topic in HEALTH_TOPICS) for text in input_texts_gpu.get()])
filtered_indices = cp.where(mask_gpu)[0]

# Convert filtered data back to Pandas
filtered_df = df.iloc[filtered_indices.get()]
print(f"Filtered entries: {len(filtered_df)}")

# Generate synthetic data using DeepSeek API
def generate_synthetic_qa(prompt):
    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200
    }
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}
    response = requests.post(DEEPSEEK_URL, json=payload, headers=headers).json()

    print(response)  # Debug API response

    if "choices" in response and response["choices"]:
        return response["choices"][0]["message"]["content"]
    else:
        return "Error: Invalid API Response"

synthetic_data = []
prompts = [
    "Generate a question and answer about neonatal jaundice treatment.",
    "Generate a Q&A pair about managing preeclampsia in pregnancy.",
    "Generate a question and answer about postpartum hemorrhage."
]

for prompt in prompts:
    response = generate_synthetic_qa(prompt)
    lines = response.split("\n")
    if len(lines) >= 2:
        q, a = lines[0], lines[1]
        synthetic_data.append({"input": q.strip(), "output": a.strip()})

# Combine datasets
synthetic_df = pd.DataFrame(synthetic_data)
combined_df = pd.concat([filtered_df[["input", "output"]], synthetic_df], ignore_index=True)

# Save to JSON
combined_df.to_json("health_data.json", orient="records")
dataset = Dataset.from_pandas(combined_df)
print(f"Total entries: {len(combined_df)}")




Index(['input', 'instruction', 'output'], dtype='object')


ValueError: Unsupported dtype object

In [6]:
import pandas as pd
from datasets import load_dataset, Dataset
import requests
import json
from unsloth import FastLanguageModel

# DeepSeek API setup
DEEPSEEK_API_KEY = "sk-732ea226899242339e5d25944abbafd7"
DEEPSEEK_URL = "https://api.deepseek.com/v1/chat/completions"

# Load dataset from Hugging Face
dataset = load_dataset("medalpaca/medical_meadow_medqa")
df = dataset["train"].to_pandas()

# Verify column names
print("Columns in dataset:", df.columns)

# Define relevant health topics
HEALTH_TOPICS = ["neonatal", "pregnancy", "obgyn", "baby", "birth", "fetal", "labor", "ectopic", "preeclampsia", "gestation"]

# Filter for relevant topics (CPU-based)
filtered_df = df[df["input"].str.lower().apply(lambda x: any(topic in x for topic in HEALTH_TOPICS))]
print(f"Filtered entries: {len(filtered_df)}")

# Generate synthetic data using DeepSeek API
def generate_synthetic_qa(prompt):
    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200
    }
    headers = {"Authorization": f"Bearer {DEEPSEEK_API_KEY}", "Content-Type": "application/json"}
    response = requests.post(DEEPSEEK_URL, json=payload, headers=headers).json()

    print("API Response:", response)

    if "choices" in response and response["choices"]:
        content = response["choices"][0]["message"]["content"]
        lines = [line.strip() for line in content.split("\n") if line.strip()]
        if len(lines) >= 2:
            return lines[0], lines[1]
        else:
            return "Error: Unexpected format", content
    else:
        return "Error: Invalid API Response", "No content"

synthetic_data = []
prompts = [
    "Generate a question and answer about neonatal jaundice treatment.",
    "Generate a Q&A pair about managing preeclampsia in pregnancy.",
    "Generate a question and answer about postpartum hemorrhage."
]

for prompt in prompts:
    question, answer = generate_synthetic_qa(prompt)
    synthetic_data.append({"input": question, "output": answer})

# Combine datasets
synthetic_df = pd.DataFrame(synthetic_data)
combined_df = pd.concat([filtered_df[["input", "output"]], synthetic_df], ignore_index=True)

# Save to JSON
combined_df.to_json("health_data.json", orient="records")
dataset = Dataset.from_pandas(combined_df)
print(f"Total entries: {len(combined_df)}")

# Fine-tuning with unsloth
print("Starting fine-tuning...")
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B"  # Base model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,  # Auto-detect
    load_in_4bit=True  # For efficiency
)

# Load the dataset
dataset = load_dataset("json", data_files="health_data.json")["train"]

# Define prompt template
prompt_template = """### Instruction: You are Sharma, a medical expert specializing in neonatal, pregnancy, and OB-GYN cases. Respond only to health queries in these areas. For non-relevant queries, say: "I’m specialized in neonatal, pregnancy, and OB-GYN cases only. Please ask a related question."
Question: {input}
Answer: {output}"""

def format_prompt(example):
    return {"text": prompt_template.format(input=example["input"], output=example["output"])}

dataset = dataset.map(format_prompt)

# Fine-tune with LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0
)

trainer = FastLanguageModel.Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args={
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "max_steps": 500,
        "learning_rate": 2e-4,
        "fp16": True,
        "logging_steps": 10
    }
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("sharma-health-model")
tokenizer.save_pretrained("sharma-health-model")
print("Model saved to 'sharma-health-model'")

ImportError: DLL load failed while importing libtriton: A dynamic link library (DLL) initialization routine failed.