# Deploy Gemma2 9B

## Installing Necessary Packages:

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Loading the Model and Tokenizer:

In [None]:
pip install --no-deps --upgrade "flash-attn>=2.6.3"

Collecting flash-attn>=2.6.3
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m169.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826
Successfully built flash-attn
Installing collected packages: flash-attn
Successfully installed flash-attn-2.

In [None]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import re
from google.colab import drive
from transformers import AutoTokenizer


In [None]:
max_seq_length = 1024  # Maximum sequence length for input data
dtype = None  # Automatically detect the appropriate data type (e.g., float16, bfloat16)
load_in_4bit = False  # Use 4-bit quantization to reduce memory usage

# Use the gemma-2-9b model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2-9b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.9.post1: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

## Prepare the Model for Inference:

In [None]:
# Prepare the model for inference
FastLanguageModel.for_inference(model)

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
 

## Connecting to github, and verify

In [None]:
import os
# Clone the repository from GitHub.
# Make sure to replace the placeholder with the actual GitHub repository UR
!git clone
!git config --global user.email "....."
!git config --global user.name "...."
%cd Research/dataset

os.environ['GITHUB_TOKEN'] = ''

## Define the query function

In [None]:
def query_model(prompts, model, tokenizer, max_new_tokens=500):
    responses = []
    for prompt in prompts:
        # Tokenize the input
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        # Generate the response from the model
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

        # Decode the response and keep it concise
        response = tokenizer.batch_decode(outputs)[0]
        # Remove any extra newlines or whitespace
        response = response.replace("\n", "").strip()
        print(response)
        responses.append(response)

    return responses

# Evaluating Gemma2_9b

## Basic Knowledge

In [None]:

# Extract the prompts
prompts = pd.read_csv('basic_knowledge_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)
# Save the responses to a new CSV file
responses_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': responses
})
def clean_response(response):
    # Convert the response to lowercase and strip any extra whitespace
    response = response.lower().strip()
    # Check if the response contains only 'true'
    if "true" in response:
        return "True"
    # Check if the response contains only 'false'
    elif "false" in response:
        return "False"
    else:
        # If the response doesn't contain 'true' or 'false', return None
        return None
responses_df['Response'] = responses_df['Response'].apply(clean_response)

In [None]:
ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')
ground_truth_df
ground_truth_df['Result'] = ground_truth_df['Result'].map({True: 'True', False: 'False'})

# Merge the cleaned responses with the ground truth prompts on the 'Prompt' column
merged_df = pd.merge(ground_truth_df,
                     responses_df[['Prompt', 'Response']],
                     on="Prompt",
                     how="inner")


# Calculate precision, recall, and f1-score
precision = precision_score(merged_df['Result'], merged_df['Response'], pos_label="True")
recall = recall_score(merged_df['Result'], merged_df['Response'], pos_label="True")
f1 = f1_score(merged_df['Result'], merged_df['Response'], pos_label="True")

# Display the results
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 0.59
Recall: 0.98
F1-Score: 0.74


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'basic_knowledge_response.csv')

# Save the DataFrame to the specified path
merged_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Mediator graph

In [None]:
prompts = pd.read_csv('mediator_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)


In [None]:
# Define a function to clean and normalize responses
def clean_response(response):
    # Remove the 'collider: ' prefix and '<|end_of_text|>' token, and strip any extra whitespace
    return response.replace('mediator: ', '').replace('<eos>', '').strip()
cleaned_responses = [clean_response(response) for response in responses]
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']


comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 46.37%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/...../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Confunder graph

In [None]:
prompts = pd.read_csv('confunder_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)


In [None]:

def clean_response(response):
    # Remove the 'collider: ' prefix and '<|end_of_text|>' token, and strip any extra whitespace
    response = response.replace('confounder: ', '').replace('<eos>', '').strip()

    # Use regex to remove any instructional text that appears after the variable name
    # This regex captures everything before the instructional text, ensuring only the variable name remains
    clean_response = re.split(r'### Instruction:|### Input:', response)[0].strip()

    return clean_response
ground_truth_df = pd.read_csv('confunder_prompt.csv')['Correct Answer']
cleaned_responses = [clean_response(response) for response in responses]

# Combine the prompts, cleaned responses, and correct answers into a DataFrame
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")



Accuracy: 36.40%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Confunder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Collider graph

In [None]:
prompts = pd.read_csv('collider_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)

In [None]:
def clean_response(response):
    # Remove the 'collider: ' prefix and '<|end_of_text|>' token, and strip any extra whitespace
    response = response.replace('collider: ', '').replace('<eos>', '').strip()

    # Use regex to remove any instructional text that appears after the variable name
    # This regex captures everything before the instructional text, ensuring only the variable name remains
    clean_response = re.split(r'### Instruction:|### Input:', response)[0].strip()

    return clean_response


ground_truth_df = pd.read_csv('collider_prompt.csv')['Correct Answer']
cleaned_responses = [clean_response(response) for response in responses]

# Combine the prompts, cleaned responses, and correct answers into a DataFrame
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 21.40%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

#Fine-tuning with direct pair

In [None]:
from datasets import Dataset
# Load the CSV file

ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')

# Define the alpaca_prompt template
fine_tuning_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token  # Get the EOS token from the tokenizer

# Prepare the data for fine-tuning
def format_for_finetuning(row):
    instruction = "Is the following statement true or false? Only provide true or false, do not include any additional explanation."
    input_text = f"In the context of diagnosing neuropathic pain, consider the causal relationships where {row['Source']} causes discomfort related to {row['Target']}."

    # Convert the boolean response to a string before adding the EOS token
    response = str(row['Result']) + EOS_TOKEN

    # Combine instruction, input, and response into a single 'text' field
    text = fine_tuning_prompt.format(instruction, input_text, response)
    return {"text": text}

formatted_data = ground_truth_df.apply(format_for_finetuning, axis=1).tolist()
dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

In [None]:
formatted_df = pd.DataFrame(formatted_data)

# Save the formatted DataFrame to a CSV file with the desired name
file_name = 'Gemma_direct_pair_fine_tuning_data.csv'
formatted_df.to_csv(file_name, index=False)

print(f"Formatted data saved as '{file_name}'.")

# Git commands to add, commit, and push the file to GitHub
# Note: Replace <YOUR_GITHUB_TOKEN> with your actual GitHub token for secure access
!git remote set-url origin https://$GITHUB_TOKEN@github.com/...../Research.git
!git add {file_name}
!git commit -m "Added Gemma direct pair fine-tuning data to the dataset folder"
!git push origin main

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # The rank of the LoRA matrices (common values are 8, 16, 32, etc.)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # The modules to which LoRA is applied
    lora_alpha=16,  # Scaling factor for LoRA layers
    lora_dropout=0,  # Dropout for LoRA layers
    bias="none",  # Use "none" to avoid adding additional biases
    use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing to save memory
    random_state=42,  # Set random state for reproducibility
    use_rslora=False,  # Disable rank-stabilized LoRA
    loftq_config=None,  # No special quantization configuration for LoRA
)


Unsloth 2024.8 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Disable packing for simplicity
    args=TrainingArguments(
        per_device_train_batch_size=30,  # Small batch size to fit in memory
        gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch size
        warmup_steps=5,  # Warmup steps for learning rate
        max_steps=100,  # Increase to 400 steps for a more comprehensive fine-tuning
        learning_rate=2e-4,  # Learning rate
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported by hardware
        logging_steps=10,  # Log progress every 10 steps
        optim="adamw_8bit",  # Use memory-efficient optimizer
        weight_decay=0.01,  # Regularization to prevent overfitting
        lr_scheduler_type="linear",  # Linear learning rate schedule
        seed=42,  # Set seed for reproducibility
        output_dir="outputs",  # Directory to save model outputs
    ),
)

trainer.train()


Map (num_proc=2):   0%|          | 0/658 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 658 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 30 | Gradient Accumulation steps = 2
\        /    Total batch size = 60 | Total steps = 100
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss
10,2.0177
20,0.1918
30,0.1234
40,0.1085
50,0.102
60,0.0985
70,0.0966
80,0.0931
90,0.0914
100,0.0905


TrainOutput(global_step=100, training_loss=0.3013527101278305, metrics={'train_runtime': 1188.8943, 'train_samples_per_second': 5.047, 'train_steps_per_second': 0.084, 'total_flos': 3.3599134233391104e+16, 'train_loss': 0.3013527101278305, 'epoch': 9.090909090909092})

In [None]:
import shutil
# Save the LoRA-adapted model and tokenizer to the initial directory
model.save_pretrained("Gemma2_9B_Tuned_with_Pair_Knowledge_model")
tokenizer.save_pretrained("Gemma2_9B_Tuned_with_Pair_Knowledge_model")
shutil.move("Gemma2_9B_Tuned_with_Pair_Knowledge_model", "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_Pair_Knowledge_model")

'/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_Pair_Knowledge_model'

# Evaluating Gemma2_9b after fine tuning

In [None]:
model_path = "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_Pair_Knowledge_model"
# Load the fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unsloth 2024.8 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [None]:
# Prepare the model for inference
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
  

## Basic Knowledge

In [None]:
prompts = pd.read_csv('basic_knowledge_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)



In [None]:
# Save the responses to a new CSV file
responses_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': responses
})
def clean_response(response):
    # Convert the response to lowercase and strip any extra whitespace
    response = response.lower().strip()
    # Check if the response contains only 'true'
    if "true" in response:
        return "True"
    # Check if the response contains only 'false'
    elif "false" in response:
        return "False"
    else:
        # If the response doesn't contain 'true' or 'false', return None
        return None
responses_df['Response'] = responses_df['Response'].apply(clean_response)

In [None]:
ground_truth_df = pd.read_csv('basic_knowledge_prompt.csv')
ground_truth_df
ground_truth_df['Result'] = ground_truth_df['Result'].map({True: 'True', False: 'False'})

# Merge the cleaned responses with the ground truth prompts on the 'Prompt' column
merged_df = pd.merge(ground_truth_df,
                     responses_df[['Prompt', 'Response']],
                     on="Prompt",
                     how="inner")


# Calculate precision, recall, and f1-score
precision = precision_score(merged_df['Result'], merged_df['Response'], pos_label="True")
recall = recall_score(merged_df['Result'], merged_df['Response'], pos_label="True")
f1 = f1_score(merged_df['Result'], merged_df['Response'], pos_label="True")

# Display the results
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 1.00
Recall: 1.00
F1-Score: 1.00


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_Tuned_with_Pair_Knowledge'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'basic_knowledge_response.csv')

# Save the DataFrame to the specified path
merged_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Mediator graph

In [None]:
prompts = pd.read_csv('mediator_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)


In [None]:
def clean_response(response):
    # Remove the 'collider: ' prefix and '<|end_of_text|>' token, and strip any extra whitespace
    response = response.replace('confounder: ', '').replace('<eos>', '').strip()

    # Use regex to remove any instructional text that appears after the variable name
    # This regex captures everything before the instructional text, ensuring only the variable name remains
    clean_response = re.split(r'### Instruction:|### Input:', response)[0].strip()

    return clean_response
cleaned_responses = [clean_response(response) for response in responses]
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']


comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 68.44%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_Tuned_with_Pair_Knowledge'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Confunder graph

In [None]:
prompts = pd.read_csv('confunder_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)


In [None]:
def clean_response(response):
    # Remove the 'confounder: ' prefix and '<eos>' token, and strip any extra whitespace
    response = response.replace('confounder: ', '').replace('<eos>', '').strip()

    # Use regex to split the response based on "where" and "causes"
    result = re.split(r'where|causes', response)

    # Extract the content between "where" and "causes"
    extracted_content = result[1].strip() if len(result) > 1 else ''

    return extracted_content
ground_truth_df = pd.read_csv('confunder_prompt.csv')['Correct Answer']
cleaned_responses = [clean_response(response) for response in responses]

# Combine the prompts, cleaned responses, and correct answers into a DataFrame
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")



Accuracy: 71.20%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_Tuned_with_Pair_Knowledge'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Confunder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Collider graph

In [None]:
prompts = pd.read_csv('collider_prompt.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)

L L5 Radikulopati -> L S1 Radikulopati<eos>
L L1 Radikulopati -> L Nedre bukbesvär### Instruction:Is the following statement true or false? Only provide true
R T9 Radikulopati -> L T12 Radikulopati<eos>
R T3 Radikulopati -> R T9 Radikulopati<eos>
L L4 Radikulopati -> L L3 Radikulopati### Instruction:Is the following statement true or false? Only provide
R T10 Radikulopati -> R T12 Radikulopati<eos>
R C8 Radikulopati -> R Skulderbesvär### Instruction:Is the following statement true or false? Only provide true
R T10 Radikulopati -> L L2 Radikulopati<eos>
L C3 Radikulopati -> L C2 Radikulopati<eos>
R C6 Radikulopati -> R C7 Radikulopati### Instruction:Is the following statement true or false? Only provide
R L5 Radikulopati -> R Benbesvär### Instruction:Is the following statement true or false? Only provide true or false
R T10 Radikulopati -> R T1 Radikulopati### Instruction:Is the following statement true or false? Only
L T8 Radikulopati -> R T7 Radikulopati<eos>
R L1 Radikulopati -> L L2

In [None]:
def clean_response(response):
  pattern = r'->\s*(.*?)(?:\s*<eos>|\s*###)'
  match = re.search(pattern, response)
  if match:
    return match.group(1)
  else:
    return ""
ground_truth_df = pd.read_csv('collider_prompt.csv')['Correct Answer']
cleaned_responses = [clean_response(response) for response in responses]
# Combine the prompts, cleaned responses, and correct answers into a DataFrame
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 32.60%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_Tuned_with_Pair_Knowledge'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

In [None]:
import pandas as pd
from datasets import Dataset
# Load the CSV files
mediator_df = pd.read_csv('mediator_prompt.csv')
confounder_df = pd.read_csv('confunder_prompt.csv')  # Corrected the file name from confunder to confounder
collider_df = pd.read_csv('collider_prompt.csv')

# Randomly sample 5 rows from each DataFrame
sampled_mediator_df = mediator_df.sample(n=5, random_state=4011)  # Use a fixed seed for reproducibility
sampled_confounder_df = confounder_df.sample(n=5, random_state=4011)
sampled_collider_df = collider_df.sample(n=5, random_state=4011)

# Define the EOS token
EOS_TOKEN = "<eos>"  # Adjust this based on your tokenizer's settings

def format_for_finetuning(row, label_type):
    """
    Formats the row data by appending the correct answer and EOS token to form the response.
    """
    response = f"{label_type}: {row['Correct Answer']}{EOS_TOKEN}"
    text = row['Prompt'] + response
    return {"text": text}

# Apply the formatting function to each DataFrame
formatted_mediators = sampled_mediator_df.apply(lambda row: format_for_finetuning(row, 'mediator'), axis=1).tolist()
formatted_confounders = sampled_confounder_df.apply(lambda row: format_for_finetuning(row, 'confounder'), axis=1).tolist()
formatted_colliders = sampled_collider_df.apply(lambda row: format_for_finetuning(row, 'collider'), axis=1).tolist()

# Combine all formatted data into a single list
all_formatted_data = formatted_mediators + formatted_confounders + formatted_colliders

# Optional: Convert the combined list back to a DataFrame for easier manipulation or saving
combined_df = pd.DataFrame(all_formatted_data)
dataset = Dataset.from_pandas(combined_df)


In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # The rank of the LoRA matrices (common values are 8, 16, 32, etc.)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],  # The modules to which LoRA is applied
    lora_alpha=16,  # Scaling factor for LoRA layers
    lora_dropout=0,  # Dropout for LoRA layers
    bias="none",  # Use "none" to avoid adding additional biases
    use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing to save memory
    random_state=42,  # Set random state for reproducibility
    use_rslora=False,  # Disable rank-stabilized LoRA
    loftq_config=None,  # No special quantization configuration for LoRA
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    dataset_num_proc=2,
    packing=False,  # Disable packing for simplicity
    args=TrainingArguments(
        per_device_train_batch_size=5,  # Small batch size to fit in memory
        gradient_accumulation_steps=1,  # Accumulate gradients to simulate larger batch size
        warmup_steps=5,  # Warmup steps for learning rate
        max_steps=50,  # Increase to 400 steps for a more comprehensive fine-tuning
        learning_rate=2e-4,  # Learning rate
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported by hardware
        logging_steps=10,  # Log progress every 10 steps
        optim="adamw_8bit",  # Use memory-efficient optimizer
        weight_decay=0.01,  # Regularization to prevent overfitting
        lr_scheduler_type="linear",  # Linear learning rate schedule
        seed=42,  # Set seed for reproducibility
        output_dir="outputs",  # Directory to save model outputs
    ),
)

trainer.train()


Map (num_proc=2):   0%|          | 0/15 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15 | Num Epochs = 17
O^O/ \_/ \    Batch size per device = 5 | Gradient Accumulation steps = 1
\        /    Total batch size = 5 | Total steps = 50
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss
10,1.4284
20,0.0705
30,0.0329
40,0.0271
50,0.0252


TrainOutput(global_step=50, training_loss=0.3168316614627838, metrics={'train_runtime': 79.5808, 'train_samples_per_second': 3.141, 'train_steps_per_second': 0.628, 'total_flos': 2280055190568960.0, 'train_loss': 0.3168316614627838, 'epoch': 16.666666666666668})

In [None]:
import shutil
# Save the LoRA-adapted model and tokenizer to the initial directory
model.save_pretrained("Gemma2_9B_Tuned_with_causal_rule_model")
tokenizer.save_pretrained("Gemma2_9B_Tuned_with_causal_rule_model")
shutil.move("Gemma2_9B_Tuned_with_causal_rule_model", "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_causal_rule_model")

'/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_causal_rule_model'

# Reflection fine tuning with one shot example from the causal question

## Create instructional fine-tunning dataset manually

In [None]:
import pandas as pd
from datasets import Dataset
# Creating the dataset with the six examples formatted as requested
all_formatted_data = [
    {
        "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables DLS C6-C7, R C7 Radikulopati, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: DLS C6-C7 causes R C7 Radikulopati discomfort is true.
R C7 Radikulopati causes R Under armsbesvär is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
      "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, DLS C6-C7, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: DLS C6-C7 causes R C7 Radikulopati discomfort is true.
R C7 Radikulopati causes R Under armsbesvär is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, DLS C6-C7, and R Under armsbesvär. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes DLS C6-C7 discomfort is true.
DLS C6-C7 causes R Under armsbesvär is true.
The causal pathway is R C7 Radikulopati  -> DLS C6-C7 -> R Under armsbesvär,
so the mediator is DLS C6-C7.
</thinking>
<reflection>The correct causal chain is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär, where DLS C6-C7 causes discomfort related to R C7 Radikulopati, and R C7 Radikulopati causes discomfort related to R Under armsbesvär. My mistake was misidentifying the causal relationships within the chain.</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R C7 Radikulopati, R Under armsbesvär and  DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes R Under armsbesvär is true.
DLS C6-C7 causes R C7 Radikulopati discomfort is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation.
In diagnosing neuropathic pain, consider the variables R Under armsbesvär, R C7 Radikulopati and DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: DLS C6-C7 causes R C7 Radikulopati discomfort is true.
R C7 Radikulopati causes R Under armsbesvär is true.
The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R C7 Radikulopati.
</thinking>
<reflection>No corrections needed</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text": """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.

If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.
<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Under armsbesvär, R C7 Radikulopati and DLS C6-C7. Identify the mediator variable.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R C7 Radikulopati causes R Under armsbesvär is true. DLS C6-C7 causes R C7 Radikulopati discomfort is true. The causal pathway is DLS C6-C7 -> R C7 Radikulopati -> R Under armsbesvär,
so the mediator is R Under armsbesvär.
</thinking>
<reflection>The mediator should be the variable that connects the causal chain, which is R C7 Radikulopati, not R Under armsbesvär. My mistake was in identifying the effect as the mediator instead of the connecting variable.</reflection>
<output>R C7 Radikulopati</output><eos>"""
    },
    {
        "text":""" <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Vadbesvär, R Lårbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Vadbesvär is true, and R L5 Radikulopati causes R Lårbesvär is true. R Vadbesvär and R Lårbesvär have no causal relationship. Therefore, R L5 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Lårbesvär, R L5 Radikulopati, and R Vadbesvär. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Lårbesvär is true, and R L5 Radikulopati causes R Vadbesvär is true. R Lårbesvär and R Vadbesvär have no causal relationship. Therefore, R L5 Radikulopati is the confounder.
</thinking>
<reflection>No corrections needed</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Lårbesvär, R Vadbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R Lårbesvär causes R L5 Radikulopati is true, and R L5 Radikulopati causes R Vadbesvär is true. Therefore, R Lårbesvär is the confounder.
</thinking>
<reflection>Correction: The correct chain should recognize R L5 Radikulopati as the common cause of R Lårbesvär and R Vadbesvär with no direct relationship between the effects. Thus, R L5 Radikulopati is the confounder, not R Lårbesvär.</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the confounder variable in the causal pathway A -> B and A -> C. This means if A causes B is true, and A causes C is true, and B and C have no causal relationship, then A is the confounder. Only provide the confounder variable (A) without any additional explanation. In diagnosing neuropathic pain, consider the variables R Vadbesvär, R Lårbesvär, and R L5 Radikulopati. Identify the confounder variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R L5 Radikulopati causes R Lårbesvär discomfort is true, and R L5 Radikulopati causes R Lårbesvär is true. Therefore, R Vadbesvär is the confounder.
</thinking>
<reflection>Correction: The error was in misidentifying R Vadbesvär as the confounder. The correct confounder is R L5 Radikulopati, as it influences both R Vadbesvär and R Lårbesvär without them influencing each other.</reflection>
<output>R L5 Radikulopati</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables R T12 Radikulopati, Nedre bukbesvär, and R T10 Radikulopati. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T12 Radikulopati causes Nedre bukbesvär is true, and R T10 Radikulopati causes Nedre bukbesvär is true. R T12 Radikulopati and R T10 Radikulopati have no causal relationship. Therefore, Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>Nedre bukbesvär</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables Nedre bukbesvär, R T10 Radikulopati, and R T12 Radikulopati. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T10 Radikulopati causes Nedre bukbesvär is true, and R T12 Radikulopati causes Nedre bukbesvär is true. R T10 Radikulopati and R T12 Radikulopati have no causal relationship. Therefore, Nedre bukbesvär is the collider.
</thinking>
<reflection>No corrections needed</reflection>
<output>Nedre bukbesvär</output><eos>"""
    },
    {
        "text":"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags. If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags. <|eot_id|><|start_header_id|>user<|end_header_id|> Identify the collider variable in the causal pathway A -> C and B -> C. This means if A causes C is true, and B causes C is true, and A and B have no causal relationship, then C is the collider. Only provide the collider variable (C) without any additional explanation. In diagnosing neuropathic pain, consider the variables R T12 Radikulopati, R T10 Radikulopati and Nedre bukbesvär. Identify the collider variable.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<thinking>
The causal chain here is: R T12 Radikulopati causes Nedre bukbesvär is true, and R T10 Radikulopati causes Nedre bukbesvär is true. Therefore, Nedre bukbesvär is the collider .
</thinking>
<reflection>No corrections needed.</reflection>
<output>Nedre bukbesvär</output><eos>"""
    }

]
# Create a DataFrame from the formatted data
combined_df = pd.DataFrame(all_formatted_data)

dataset = Dataset.from_pandas(combined_df)

In [None]:
# Define the target directory and save path
target_directory = '../model/Gemma2_9B_reflection_finetuning'

# Check if the directory exists, and create it if it doesn't
if not os.path.exists(target_directory):
    os.makedirs(target_directory)

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'fine_tuning_dataset.csv')  # Specify the CSV filename

# Save the DataFrame to the specified path
combined_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added fine tuned dataset to the model folder"
!git push origin main

## Reflection fine tuning

In [None]:
model_path = "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_Tuned_with_Pair_Knowledge_model1"
# Load the fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.9: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    dataset_num_proc=2,
    packing=False,  # Disable packing for simplicity
    args=TrainingArguments(
        per_device_train_batch_size=1,  # Small batch size to fit in memory
        gradient_accumulation_steps=1,  # Accumulate gradients to simulate larger batch size
        warmup_steps=5,  # Warmup steps for learning rate
        max_steps=100,  # Increase to 400 steps for a more comprehensive fine-tuning
        learning_rate=2e-4,  # Learning rate
        fp16=not is_bfloat16_supported(),  # Use FP16 if BF16 is not supported
        bf16=is_bfloat16_supported(),  # Use BF16 if supported by hardware
        logging_steps=10,  # Log progress every 10 steps
        optim="adamw_8bit",  # Use memory-efficient optimizer
        weight_decay=0.01,  # Regularization to prevent overfitting
        lr_scheduler_type="linear",  # Linear learning rate schedule
        seed=42,  # Set seed for reproducibility
        output_dir="outputs",  # Directory to save model outputs
    ),
)

trainer.train()

Map (num_proc=2):   0%|          | 0/13 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 13 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 100
 "-____-"     Number of trainable parameters = 54,018,048


Step,Training Loss
10,1.8257
20,0.1534
30,0.1066
40,0.0521
50,0.0424
60,0.0377
70,0.0219
80,0.0229
90,0.0195
100,0.0185


config.json:   0%|          | 0.00/924 [00:00<?, ?B/s]

TrainOutput(global_step=100, training_loss=0.2300783021748066, metrics={'train_runtime': 142.534, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.702, 'total_flos': 1997613911580672.0, 'train_loss': 0.2300783021748066, 'epoch': 7.6923076923076925})

In [None]:
import shutil
# Save the LoRA-adapted model and tokenizer to the initial directory
model.save_pretrained("Gemma2_9B_reflection_finetuning_model")
tokenizer.save_pretrained("Gemma2_9B_reflection_finetuning_model")
shutil.move("Gemma2_9B_reflection_finetuning_model", "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_reflection_finetuning_model")

'/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_reflection_finetuning_model'

# Evaluating the reflection finetuning model


In [None]:
model_path = "/content/drive/MyDrive/CITS4011/Gemma2_9b/Gemma2_9B_reflection_finetuning_model"
# Load the fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2024.9: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unsloth 2024.9 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [None]:
# Prepare the model for inference
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
  

## Mediator graph

In [None]:
prompts = pd.read_csv('mediator_prompt1.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)

<bos><|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a world-class AI system, capable of complex causal reasoning and reflection. Use your knowledge to deeply infer the relationships among variables. Reason through the query inside <thinking> tags and provide your final response inside <output> tags.If you detect any mistakes in your reasoning at any point, correct yourself inside <reflection> tags.<|eot_id|><|start_header_id|>user<|end_header_id|>Identify the mediator variable in the causal pathway A -> B -> C. This means if A causes B is true and B causes C is true, then B is the mediator. Only provide the mediator variable (B) without any additional explanation. In diagnosing neuropathic pain, consider the variables DLS L4-L5, R Stortårbesvär, and R L5 Radikulopati. Identify the mediator variable.<|eot_id|><|start_header_id|>assistant<|end_header_id|><thinking>The causal chain here is: DLS L4-L5 causes R Stortårbesvär is true.R Stortårbesvär causes R L5 Radikulopa

In [None]:
import re

def clean_response(response):
    # Check if the response is a non-empty string
    if not response or not isinstance(response, str):
        return None  # Return None if the response is empty or not a string

    # Use regex to find the last occurrence of the <output> and </output> tags
    matches = re.findall(r'</reflection><output>(.*?)</output>', response, re.DOTALL)

    # Return the last match if found, else None
    if matches:
        return matches[-1].strip()  # Return the last content inside <output> tags, stripped of whitespace
    else:
        return None  # Return None if the tags are not found



# Apply the clean_response function to the single response
cleaned_responses = [clean_response(response) for response in responses]

In [None]:
ground_truth_df = pd.read_csv('mediator_prompt.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_reflection_tunning'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'Mediator_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added basic_knowledge_response to the model folder"
!git push origin main

## Confounder graph

In [None]:
prompts = pd.read_csv('confunder_prompt1.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)

In [None]:
import re

def clean_response(response):
    # Check if the response is a non-empty string
    if not response or not isinstance(response, str):
        return None  # Return None if the response is empty or not a string

    # Use regex to find the last occurrence of the <output> and </output> tags
    matches = re.findall(r'</reflection><output>(.*?)</output>', response, re.DOTALL)

    # Return the last match if found, else None
    if matches:
        return matches[-1].strip()  # Return the last content inside <output> tags, stripped of whitespace
    else:
        return None  # Return None if the tags are not found



# Apply the clean_response function to the single response
cleaned_responses = [clean_response(response) for response in responses]

In [None]:
ground_truth_df = pd.read_csv('confunder_prompt1.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 94.20%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_reflection_tunning'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'confounder_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added confounder_response to the model folder"
!git push origin main

## Collider graph

In [None]:
prompts = pd.read_csv('collider_prompt1.csv')['Prompt'].tolist()
responses = query_model(prompts, model, tokenizer)

In [None]:
import re

def clean_response(response):
    # Check if the response is a non-empty string
    if not response or not isinstance(response, str):
        return None  # Return None if the response is empty or not a string

    # Use regex to find the last occurrence of the <output> and </output> tags
    matches = re.findall(r'</reflection><output>(.*?)</output>', response, re.DOTALL)

    # Return the last match if found, else None
    if matches:
        return matches[-1].strip()  # Return the last content inside <output> tags, stripped of whitespace
    else:
        return None  # Return None if the tags are not found



# Apply the clean_response function to the single response
cleaned_responses = [clean_response(response) for response in responses]

In [None]:
ground_truth_df = pd.read_csv('collider_prompt1.csv')['Correct Answer']
comparison_df = pd.DataFrame({
    'Prompt': prompts,
    'Response': cleaned_responses,
    'Correct Answer': ground_truth_df # Corresponding correct answers
})

# Normalize both the Response and Correct Answer for comparison
comparison_df['Is Correct'] = comparison_df.apply(
    lambda row:row['Response'] == row['Correct Answer'],
    axis=1
)

# Calculate the accuracy
accuracy = comparison_df['Is Correct'].mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.40%


In [None]:
# Define the target directory relative to the repository root
target_directory = '../model/Gemma2_9B_reflection_tunning'

# Check if the directory exists, and create it if not
if not os.path.exists(target_directory):
    os.makedirs(target_directory)  # Create the directory and all intermediate-level directories

# Define the full path where you want to save the CSV file
save_path = os.path.join(target_directory, 'collider_response.csv')

# Save the DataFrame to the specified path
comparison_df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
!git remote set-url origin https://$GITHUB_TOKEN@github.com/..../Research.git
!git add {save_path}
!git commit -m "Added collider_response to the model folder"
!git push origin main