# Mount the drive

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

# Create and Validate a BN
Ensure that every species' dichotomous key set actually maps to the species we intend it to.

In [None]:
# CONFIGS

# network dependencies
!pip install pgmpy
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination


# loading dichotomous key data
import json
import os
import pandas as pd
import ast
import numpy as np

In [None]:
# DIRS
# NOTE: despite the *_DIR suffix, KEYS_DIR and DICHT_DIR are paths to files
# (a CSV file and a JSON file), not directories.
ROOT = '/content/drive/MyDrive/Semester Project'
KEYS_DIR = os.path.join(ROOT, 'data', 'keys.csv')  # read with pd.read_csv below
DICHT_DIR = os.path.join(ROOT, 'data', 'feature_names.json')  # read with json.load below

In [None]:
# DATA
df = pd.read_csv(KEYS_DIR, encoding="latin1")         # Ana's data on dichotomous keys

# Use a context manager so the file handle is closed as soon as the JSON has
# been read, instead of being left open until garbage collection.
with open(DICHT_DIR, "r") as f:
    dicht = json.load(f)                              # the feature names (key names)

In [None]:
# HELPERS
def parse_and_fix(x):
    """Parse a stored key vector and normalise its entries.

    ``x`` is normally the text of a list, possibly wrapped in one or more
    extra layers of string quoting (``"[1, 0, nan]"`` as well as
    ``"'[1, 0, nan]'"`` are accepted).  An already-parsed iterable is
    accepted too.  Each entry is converted as follows:

        1 -> 1 (present)
        0 -> 0 (absent)
        anything else (nan, None, '', etc.) -> 2 (not evaluated)

    Returns:
        A 1-D numpy integer array containing only 0, 1 and 2.

    Raises:
        ValueError, SyntaxError: the text is not a Python literal even after
            ``nan``/``NaN`` have been rewritten to ``None``.
        TypeError: the parsed value is not iterable.
    """
    parsed = x
    # Peel off string layers one at a time until the real container is
    # reached.  This handles singly- and doubly-encoded vectors alike; every
    # successful evaluation removes one layer of quoting, so the loop ends.
    while isinstance(parsed, str):
        try:
            parsed = ast.literal_eval(parsed)
        except (ValueError, SyntaxError, TypeError):
            # A bare nan/NaN is a name, not a literal; rewrite it to None so
            # it falls into the "not evaluated" bucket below.  A second
            # failure is a genuine parse error and is allowed to propagate.
            cleaned = parsed.replace("nan", "None").replace("NaN", "None")
            parsed = ast.literal_eval(cleaned)

    vec = []
    for v in parsed:
        if v in [1, "1", True]:
            vec.append(1)
        elif v in [0, "0", False]:
            vec.append(0)
        else:
            vec.append(2)  # NOT EVALUATED
    return np.array(vec, dtype=int)


def vector_is_valid(vec):
    """Return True when ``vec`` converts to a float array that holds no NaN.

    ``None`` and a scalar float NaN are rejected up front.
    """
    is_missing_scalar = vec is None or (isinstance(vec, float) and np.isnan(vec))
    if is_missing_scalar:
        return False
    contains_nan = np.isnan(np.array(vec, dtype=float)).any()
    return not contains_nan

def cpd_to_df(cpd):
    """Render a CPD's value table as a DataFrame.

    Rows are the states of ``cpd.variable`` and columns are the states of
    "Species".  A two-row table gets its rows relabelled "No"/"Yes".
    """
    state_names = cpd.state_names
    table = pd.DataFrame(
        cpd.values,
        index=state_names[cpd.variable],
        columns=list(state_names["Species"]),
    )
    if table.shape[0] == 2:
        table.index = ["No", "Yes"]
    return table

# validate these mappings
def predict_species(BN_model, evidence_dict, verbose=True):
    """
    Query the Bayesian network for "Species" given a feature dictionary.

    A fresh VariableElimination engine is built from ``BN_model`` on every
    call and queried with ``evidence_dict`` as evidence.

    Returns (predicted_species, probability, full_distribution), where
    full_distribution is the pair (species_names, probs).
    """
    posterior = VariableElimination(BN_model).query(
        variables=["Species"],
        evidence=evidence_dict,
    )

    species_names = posterior.state_names["Species"]
    probs = posterior.values

    # state with the largest posterior probability
    best = probs.argmax()
    predicted_species, predicted_prob = species_names[best], probs[best]

    if not verbose:
        return predicted_species, predicted_prob, (species_names, probs)

    print("\n=== Bayesian Network Prediction ===")
    print("Predicted species:", predicted_species)
    print("Probability:", predicted_prob)
    print("\nFull distribution:")
    for name, prob in zip(species_names, probs):
        print(f"  {name}: {prob:.4f}")

    return predicted_species, predicted_prob, (species_names, probs)


import matplotlib.pyplot as plt

def plot_posterior(species, probs, top_k=15):
    """
    Draw a bar chart of the ``top_k`` most probable species.

    ``probs`` must support ``argsort`` and index-array lookup (e.g. a numpy
    array); ``species`` holds the matching names in the same order.
    """
    # indices of the entries to show, most probable first
    shown = probs.argsort()[::-1][:top_k]
    labels = [species[i] for i in shown]
    heights = probs[shown]

    plt.figure(figsize=(10, 5))
    plt.bar(labels, heights)
    plt.ylim(0, 1)
    plt.ylabel("Posterior Probability")
    plt.title("Bayesian Network Species Posterior (Top {} Species)".format(top_k))
    plt.tight_layout()
    plt.show()



In [None]:
# clean the key vector
# Replaces each stored string with the integer array (values 0/1/2) returned by parse_and_fix.
df["Full vector"] = df["Full vector"].apply(parse_and_fix)

In [None]:
# clean names and drop dupes
df["Species"] = df["Species"].astype(str).str.strip()
df = df.drop_duplicates(subset="Species", keep="first")
df = df[df["Full vector"].apply(vector_is_valid)]

# fill out key value pairs : (species, json of feature names)
keys = {}

# Walk the two columns that are actually needed; the DataFrame index carries
# no species information, so it is not iterated.
for name, vec in zip(df["Species"], df["Full vector"]):
    # Skip (and report) any vector that does not line up with the feature list
    if len(vec) != len(dicht):
        print(f"ERROR: Species '{name}' has vector length {len(vec)} (expected {len(dicht)})")
        continue

    # feature name -> 0/1/2 for this species
    keys[name] = {feat: int(val) for feat, val in zip(dicht, vec)}

In [None]:
from collections import defaultdict

# CHECK FOR DUPES
def build_equivalence_dict(keys):
    """
    Group species that share an identical feature vector.

    Input:  keys = { species_name : {feature_name: value, ...}, ... }
    Output: (equivalence, reverse) where
            equivalence = { species_name : [species_with_same_vector], ... }
            reverse     = { tuple_of_feature_values : [species, ...], ... }

    All members of a group share the same list object in ``equivalence``.
    """
    # vector (as a hashable tuple of values) -> species carrying that vector
    groups_by_vector = defaultdict(list)
    for name, features in keys.items():
        groups_by_vector[tuple(features.values())].append(name)

    # every species points at the full membership list of its group
    members_of = {
        name: group
        for group in groups_by_vector.values()
        for name in group
    }

    return members_of, groups_by_vector

# === RUN IT ===
equivalence, reverse_groups = build_equivalence_dict(keys)

# Report every group of two or more species that share one feature vector
print("\n=== Duplicate Feature Groups ===")
for group in reverse_groups.values():
    if len(group) > 1:
        print(group)

# Spot-check two species by name
print("\nmarshallii group:", equivalence.get("marshallii"))
print("gambiae group:", equivalence.get("gambiae", ["gambiae"]))


In [None]:
print(equivalence.get("ardensis"))

In [None]:
# build a network
# One record per species: its feature values plus a "Species" label column.
rows = [{**featdict, "Species": species} for species, featdict in keys.items()]

df_bn = pd.DataFrame(rows)

def expand_species(df, n=100):
    """Return a DataFrame in which each row of ``df`` is repeated ``n`` times consecutively."""
    repeated = [
        record.copy()
        for _label, record in df.iterrows()
        for _copy in range(n)
    ]
    return pd.DataFrame(repeated)

# Training table: every species row from df_bn repeated 100 times.
df_train = expand_species(df_bn, n=100)

# build model
from pgmpy.models import DiscreteBayesianNetwork
features = [c for c in df_train.columns if c != "Species"]
# One directed edge Species -> feature for every feature column.
edges = [("Species", feat) for feat in features]

BN_model = DiscreteBayesianNetwork(edges)


# fit to cpd
# CPDs are estimated from df_train with a BDeu prior (equivalent_sample_size=10).
from pgmpy.estimators import BayesianEstimator
BN_model.fit(df_train, estimator=BayesianEstimator, prior_type="BDeu", equivalent_sample_size=10)

In [None]:
# validate the model with equivalence groups
# Each species' own feature dict is fed back as evidence; the prediction must
# be that species or another member of its identical-vector group.
validated = True
for species, featdict in keys.items():
    pred = predict_species(BN_model, featdict.copy(), verbose=False)[0]
    if pred in equivalence.get(species, [species]):
        continue
    print(f"ERROR: Species '{species}' predicted as '{pred}'")
    validated = False

if validated:
    print("\n=== VALIDATION SUCCESSFUL ===")


# Load Dataset

In [None]:
# load existing dataset from /processed
from datasets import load_from_disk

# NOTE(review): the drive is already mounted in the first cell of the notebook;
# this repeat call presumably lets the cell run on its own — confirm it is needed.
drive.mount('/content/drive')
DATASET_PATH = "/content/drive/MyDrive/Semester Project/data-augmentation/processed"
dataset = load_from_disk(DATASET_PATH)

In [None]:
# %%capture
# import re
# if "COLAB_" not in "".join(os.environ.keys()):
#     !pip install unsloth
# else:
#     # Do this only in Colab notebooks! Otherwise use pip install unsloth
#     import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
#     xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
#     !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
#     !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
#     !pip install --no-deps unsloth
# !pip install transformers==4.56.2
# !pip install --no-deps trl==0.22.2

%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastVisionModel
from peft import PeftModel
import torch

# 1. Load base model
# Loaded 4-bit quantised (load_in_4bit=True) with float16 as the dtype.
base_model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit=True,
    dtype=torch.float16,
    use_gradient_checkpointing="unsloth",
)

# 2. Attach your LoRA adapters
# The adapters are read from the lora_model directory on the mounted drive.
lora_path = "/content/drive/MyDrive/Semester Project/data-augmentation/lora_model"
model = PeftModel.from_pretrained(base_model, lora_path)

In [None]:
# load the basic key map (to map machine keys -> dichotomous keys)
KEYS_PATH = "/content/drive/MyDrive/Semester Project/data-augmentation/basic_key_map.json"

# basic_key_map is later indexed by the keys of the model's JSON output
# (see decode_feature_json) and its keys serve as the allowed key list.
with open(KEYS_PATH, "r") as f:
    basic_key_map = json.load(f)

In [None]:
# set up GPT client
!pip install openai
from openai import OpenAI

# set up the api key
os.environ["OPENAI_API_KEY"] = "insert your api key here"
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# post-processing GPT HELPER

# Keys of basic_key_map: the exact set of keys the cleaned-up JSON must contain.
allowed_keys = basic_key_map.keys()

# instruction:
def enforce_json_structure(raw_text, allowed_keys):
    """Ask the GPT client to coerce ``raw_text`` into a JSON object with ``allowed_keys``.

    Args:
        raw_text: the text to clean up.  A non-string value (e.g. an
            already-parsed dict) is serialised to JSON before being placed in
            the prompt.
        allowed_keys: iterable of the keys the result must contain.

    Returns:
        The model's reply parsed with ``json.loads``.  The reply is not
        validated here; callers must check keys and values themselves.

    Uses the module-level ``client``.
    """
    allowed_keys = list(allowed_keys)

    # Interpolating a dict directly would embed its Python repr (single
    # quotes, True/None), which is not JSON.  Serialise it properly, and fall
    # back to str() only for objects json cannot encode.
    if isinstance(raw_text, str):
        payload = raw_text
    else:
        try:
            payload = json.dumps(raw_text, indent=2)
        except TypeError:
            payload = str(raw_text)

    prompt = f"""
You are a strict JSON validator.

Your ONLY tasks are:
1. Ensure the output is valid JSON.
2. Ensure the JSON contains exactly this set of keys:
{json.dumps(allowed_keys, indent=2)}
3. For any key already in the input with value "present", "absent", or "occluded",
   DO NOT MODIFY ITS VALUE.
4. If a key is missing, add it with value "occluded".
5. If a key has an invalid value, replace it with "occluded".
6. Return ONLY a JSON object. No comments, no explanations.

Input:
{payload}
    """

    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        response_format={"type": "json_object"},
        messages=[{"role": "user", "content": prompt}],
    )

    return json.loads(response.choices[0].message.content)



# decode the json output by mapping the basic keys back to the dichotomous keys
# also map present  -> 1
#          absent   -> 0
#          occluded -> 2
#          anything else -> ValueError
# NOTE(review): occluded entries are kept as 2, not dropped — confirm that 2 is
# the intended evidence value before passing the result to the BN.

def decode_feature_json(model_json, key_map):
    """Translate a {basic_key: "present"/"absent"/"occluded"} dict into coded features.

    Args:
        model_json: mapping from basic key to a status string.  Surrounding
            whitespace and letter case of the status are ignored.
        key_map: mapping from basic key to the output key.

    Returns:
        ``{key_map[basic_key]: code}`` with present=1, absent=0, occluded=2.

    Raises:
        ValueError: the key sets of ``model_json`` and ``key_map`` differ, or
            a value is not one of the three accepted status strings
            (including non-string values such as ``None``).
    """
    expected_keys = set(key_map)
    received_keys = set(model_json)

    # 1. Ensure no missing/extra keys
    if received_keys != expected_keys:
        missing = expected_keys - received_keys
        extra = received_keys - expected_keys
        raise ValueError(f"Key mismatch. Missing={missing}, Extra={extra}")

    codes = {"present": 1, "absent": 0, "occluded": 2}
    decoded = {}

    for basic_key, value in model_json.items():
        # 2. Strict validity: a non-string (null, bool, number, ...) is an
        #    invalid value, not an attribute error on .strip().
        status = value.strip().lower() if isinstance(value, str) else None
        if status not in codes:
            raise ValueError(f"Invalid value '{value}' for key '{basic_key}'")

        # 3. Map with NO silent fallback
        decoded[key_map[basic_key]] = codes[status]

    return decoded


# Perform inference with Model

In [None]:
# enable inference mode!
FastVisionModel.for_inference(model)

# select an image
# index into `dataset` of the sample used throughout the rest of the notebook
im = 250

# select an image to infer from
image = dataset[im]["image"]

# instruction to the VLM
instruction = (
    "You are an expert entomologist. Describe accurately what you see in this "
    "image based on mosquito descriptions present in dichotomous keys."
)

# Single user turn: an image placeholder followed by the text instruction.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]

# Render the chat as prompt text with the generation prompt appended.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Encode image and prompt together and move the resulting tensors to the GPU.
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

In [None]:
# perform inference
# NOTE(review): temperature/min_p normally only matter when sampling is
# enabled; do_sample is not set here — confirm the effective generation config.
outputs = model.generate(
    **inputs,
    max_new_tokens=1000,
    use_cache=True,
    temperature=0.1,
    min_p=0.1
)

# decode into string
# Only the first sequence in the batch is decoded.
vlm_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
def clean_vlm_output(raw_output: str) -> str:
    """Return the stripped text after the first "assistant" marker.

    When the marker does not occur, the whole input is returned stripped.
    """
    _, marker, reply = raw_output.partition("assistant")
    return reply.strip() if marker else raw_output.strip()

cleaned_vlm_output = clean_vlm_output(vlm_output)

In [None]:
# cleaned_vlm_output is a str, so json.dumps emits it as one quoted, escaped
# JSON string literal; indent has no effect on a plain string.
print(json.dumps(cleaned_vlm_output, indent=2))

In [None]:
# post process the output
model_json = enforce_json_structure(cleaned_vlm_output, list(allowed_keys))

In [None]:
import ast

# enforce_json_structure returns the result of json.loads, so this branch only
# runs if that result is itself a string; it is then parsed as a Python literal.
if isinstance(model_json, str):
    model_json = ast.literal_eval(model_json)

In [None]:
# post process the output
# NOTE: `input` shadows the builtin of the same name for the rest of the
# notebook; later cells read this variable, so it must be renamed everywhere at once.
input = decode_feature_json(model_json, basic_key_map)

In [None]:
print(json.dumps(input, indent=2))

In [None]:
# process through the BN
pred, _, _ = predict_species(BN_model, input, verbose=True)
print(pred)

In [None]:
# what did the original predict?
# Run the dataset caption through the same GPT clean-up and decoding steps as
# the VLM output (the dict basic_key_map is passed as the allowed-key iterable).
test = enforce_json_structure(dataset[im]["caption"], basic_key_map)
test = decode_feature_json(test, basic_key_map)

def convert_for_bn(decoded, reverse_map):
    """Keep only 0/1 entries whose key is in ``reverse_map`` and rename them through it.

    Entries with any other value, or with a key missing from ``reverse_map``,
    are dropped.
    """
    return {
        reverse_map[full_text]: val
        for full_text, val in decoded.items()
        if val in [0, 1] and full_text in reverse_map
    }

# NOTE(review): decode_feature_json has already re-keyed `test` by the VALUES of
# basic_key_map, while convert_for_bn keeps only entries whose key is a KEY of
# its second argument. Confirm basic_key_map is the right mapping to pass here
# and that `test` does not end up empty (which would give the BN no evidence).
test = convert_for_bn(test, basic_key_map)

In [None]:
# process through the BN
pred2, _, _ = predict_species(BN_model, test, verbose=True)


In [None]:
# plot the distribution
# Re-runs the query on `test` silently to obtain the full posterior, then
# plots the 8 most probable species.
pred2, _, posterior = predict_species(BN_model, test, verbose=False)
species, probs = posterior
plot_posterior(species, probs, 8)

In [None]:
# renormalization for fun

def restrict_and_renormalize(query_result, allowed_subset):
    """
    Given a query result for variable 'Species', filter to a subset of
    species and renormalize probabilities.

    ``query_result`` must expose ``state_names["Species"]`` (the species
    names) and ``values`` (the matching probabilities).

    Returns:
        best_name        – the MAP species inside allowed_subset
        best_prob        – its renormalized probability
        filtered_names   – list of species kept
        filtered_probs   – renormalized probabilities (float numpy array)

    If the kept species carry zero total probability, a uniform distribution
    over them is returned.

    Raises:
        ValueError: none of the species in ``allowed_subset`` occur in the
            query result, so there is nothing to renormalize.
    """
    species_names = query_result.state_names["Species"]
    probs = query_result.values

    kept = [
        (name, prob)
        for name, prob in zip(species_names, probs)
        if name in allowed_subset
    ]

    # An empty selection cannot be turned into a distribution; fail with a
    # clear message instead of dividing by zero further down.
    if not kept:
        raise ValueError(
            "None of the species in allowed_subset occur in the query result"
        )

    filtered_names = [name for name, _ in kept]
    # Force float so the division below works for integer-valued input too.
    filtered_probs = np.array([prob for _, prob in kept], dtype=float)

    total = filtered_probs.sum()
    if total == 0:
        # No probability mass on the subset: fall back to uniform over it.
        filtered_probs = np.full(len(filtered_names), 1.0 / len(filtered_names))
    else:
        filtered_probs = filtered_probs / total

    # Pick best
    max_idx = filtered_probs.argmax()
    best_name = filtered_names[max_idx]
    best_prob = filtered_probs[max_idx]

    return best_name, best_prob, filtered_names, filtered_probs


In [None]:
# Query the Species posterior given the decoded VLM features (`input`), then
# restrict it to a hand-picked candidate list and renormalize.
inference = VariableElimination(BN_model)
allowed_subset = ["funestus", "gambiae", "tenebrosus", "pharoensis", "coustani"]

query_result = inference.query(
    variables=["Species"],
    evidence=input
)

# NOTE: `probs` is rebound here to the renormalized subset probabilities.
best_name, best_prob, names, probs = restrict_and_renormalize(query_result, allowed_subset)

print("Most likely species:", best_name)
print("Probability:", best_prob)
print("Distribution:")
for n, p in zip(names, probs):
    print(f"  {n}: {p:.3f}")


In [None]:
plot_posterior(names, probs, 5)

# BLEU and ROUGE Scores
Note that these scores are not really appropriate here, because they measure textual (n-gram) similarity between free-text outputs rather than the correctness of structured output. This is just a demonstration. The actual metric for structured output should be a simple accuracy score over the correct key outputs.

In [None]:
%%capture
!pip install evaluate
!pip install nltk
!pip install rouge_score
import evaluate

In [None]:
print(json.dumps(cleaned_vlm_output, indent=2))
print(json.dumps(dataset[im]["caption"]))

In [None]:
#Compute Metrics
# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# set up the refs
ref1 = dataset[im]["caption"]

# set up the predictions
# Example data (can fill it with the dichotomous key stuff later...)
# NOTE(review): the prediction is the decoded feature dict (`input`) while the
# reference is the dataset caption — confirm these are the two texts that are
# meant to be compared.
predictions = [
    json.dumps(input, indent=2),
]

references = [
    json.dumps(ref1, indent=2),
]

# bleu
bleu_result = bleu.compute(
    predictions=predictions,
    references=references
)

# rouge
# Each reference is a complete string, so the list is passed as-is; indexing a
# reference with [0] would keep only its first character.
rouge_result = rouge.compute(
    predictions=predictions,
    references=references
)

print("BLEU:", json.dumps(bleu_result, indent=4))
print("ROUGE:", json.dumps(rouge_result, indent=4))

# Accuracy of Inference Output Structure

In [None]:
print(json.dumps(cleaned_vlm_output))
print(json.dumps(dataset[im]["caption"]))

In [None]:
# compare model_json vs dataset[im]["caption"] keys

# NOTE(review): the caption is assumed to be a dict; if it is stored as JSON
# text it is parsed first so that .keys() is available — confirm the schema.
caption = dataset[im]["caption"]
if isinstance(caption, str):
    caption = json.loads(caption)

pred_keys = set(model_json.keys())
actual_keys = set(caption.keys())

# Jaccard = |intersection| / |union|; two empty key sets are identical, so the
# overlap is defined as 1.0 instead of dividing by zero.
all_keys = actual_keys | pred_keys
jaccard = len(actual_keys & pred_keys) / len(all_keys) if all_keys else 1.0
print('Jaccard overlap: ', jaccard)
