In [None]:
from transformers import CLIPImageProcessor
from dataclasses import dataclass
import json
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

#### Download CLIP to a folder before progressive training

In [None]:
@dataclass
class DataArguments:
    """Container for data-related arguments used by this notebook.

    Attributes:
        image_processor: The image processor object to use, or ``None`` until
            one has been assigned.
    """

    # The annotation is what makes this a real dataclass field; without it the
    # name is only a plain class attribute and is ignored by the generated
    # __init__, __repr__ and __eq__.
    image_processor: object = None

# Create the argument holder first, then attach the CLIP image processor that
# `from_pretrained` returns for the checkpoint id below.
data_args = DataArguments()
setattr(
    data_args,
    "image_processor",
    CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336"),
)

print("CLIP processor loaded successfully!")

In [None]:


# Preprocessing settings that get serialised to preprocessor_config.json.
# The keys are listed in the order they are written to the file.
preprocessor_config = {
    "crop_size": 224,
    "do_center_crop": True,
    "do_normalize": True,
    "do_resize": True,
    "feature_extractor_type": "CLIPFeatureExtractor",
    "image_mean": [0.48145466, 0.4578275, 0.40821073],
    "image_std": [0.26862954, 0.26130258, 0.27577711],
    "resample": 3,
    "size": 224,
}

output_path = "/media/volume/Slava/Dual-View-Slava/Reasoning/preprocessor_config.json"

# Make sure the destination folder exists before the file is opened for writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w") as config_file:
    config_file.write(json.dumps(preprocessor_config, indent=2))

print("Saved preprocessor_config.json to {}".format(output_path))


In [None]:

# Convert the JSON file into JSON Lines: every element of the loaded
# top-level value is written as one JSON document per line.
with open("slava_llava_split_test.json", "r") as json_in:
    data = json.load(json_in)  # expected to be a list of dicts

with open("slava_llava_split_test.jsonl", "w") as jsonl_out:
    jsonl_out.writelines(json.dumps(item) + "\n" for item in data)


### Recognition Training

In [2]:
# Recognition stage: runs llava_phi/train/train.py starting from the
# LLaVA-Med-Phi-finetune checkpoint and writes to (and resumes from) the
# Recognition output directory.
# Every continuation backslash is preceded by a space so the command is also
# valid when pasted into a plain shell, where `8\` + newline would be glued
# onto the following flag.
!PYTHONPATH=. NCCL_P2P_DISABLE=1 NCCL_IB_DISABLE=1 \
CUDA_LAUNCH_BLOCKING=1 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python llava_phi/train/train.py \
--model_name_or_path /media/volume/Slava/checkpoint/LLaVA-Med-Phi-finetune \
--resume_from_checkpoint /media/volume/Slava/Dual-View-Slava/Recognition \
--image_folder /media/volume/Slava/MIMIC_Dataset224 \
--version plain \
--data_path ./slava_llava_recognition.json \
--is_multimodal True \
--tune_mm_mlp_adapter True \
--freeze_vision_tower True \
--freeze_backbone True \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length False \
--bf16 True \
--output_dir /media/volume/Slava/Dual-View-Slava/Recognition \
--num_train_epochs 8 \
--per_device_train_batch_size 4 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--max_grad_norm 1.0 \
--save_strategy "steps" \
--save_steps 300 \
--save_total_limit 1 \
--learning_rate 1e-5 \
--weight_decay 0.0 \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--optim "adamw_bnb_8bit" \
--report_to wandb



The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
You are using a model of type phi to instantiate a model of type llava_phi. This is not supported for all configurations of models and can yield errors.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.99it/s]
Some weights of LlavaPhiForCausalLM were not initialized from the model checkpoint at /media/volume/Slava/checkpoint/LLaVA-Med-Phi-finetune and are newly initialized: ['model.cross_attention.in_proj_bias', 'model.cross_attention.in_proj_weight', 'model.cross_attention.out_proj.bias', 'model.cross_attention.out_proj.weight', 'model.fuse_gate.0.bias', 'model.fuse_gate.0.weight', 'model.fuse_gate.2.bias', 'model.fuse_gate.2.weight', 'model.fuse_gate.3.bias', 'model.fuse_gate.3.weight', 'model.fusion_weight', 'model.med_feature_adapter.0.bias', 'model.med_feature_adapter.0.weight', 'model.med_feature_adapter.1.bias', 'model.med_feature_adapter.1.weight', 'mo

### Reasoning

In [1]:
# Reasoning stage: runs llava_phi/train/train.py starting from the Recognition
# output and writes to (and resumes from) the Reasoning output directory.
# --eval_steps is set explicitly: with --evaluation_strategy "steps" the
# Hugging Face default falls back to --logging_steps (1 here), which would
# trigger an evaluation after every optimizer step. 500 matches --save_steps
# and the Reporting stage; adjust if a different cadence is wanted.
# Every continuation backslash is preceded by a space so the command is also
# valid when pasted into a plain shell.
!PYTHONPATH=. NCCL_P2P_DISABLE=1 NCCL_IB_DISABLE=1 \
CUDA_LAUNCH_BLOCKING=1 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python llava_phi/train/train.py \
--model_name_or_path /media/volume/Slava/Dual-View-Slava/Recognition \
--resume_from_checkpoint /media/volume/Slava/Dual-View-Slava/Reasoning \
--image_folder /media/volume/Slava/MIMIC_Dataset224 \
--version v0 \
--data_path ./slava_llava_reasoning.json \
--is_multimodal True \
--tune_mm_mlp_adapter True \
--freeze_vision_tower False \
--freeze_backbone True \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length False \
--bf16 True \
--output_dir /media/volume/Slava/Dual-View-Slava/Reasoning \
--num_train_epochs 2 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--max_grad_norm 1.0 \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 1 \
--eval_steps 500 \
--evaluation_strategy "steps" \
--learning_rate 5e-5 \
--weight_decay 0.0 \
--warmup_ratio 0.1 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--optim "adamw_bnb_8bit" \
--report_to wandb

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00, 15.20it/s]
Some weights of the model checkpoint at /media/volume/Slava/Dual-View-Slava/Recognition were not used when initializing LlavaPhiForCausalLM: ['model.medical_vision_tower.head.proj.weight', 'model.medical_vision_tower.trunk.blocks.0.attn.proj.bias', 'model.medical_vision_tower.trunk.blocks.0.attn.proj.weight', 'model.medical_vision_tower.trunk.blocks.0.attn.qkv.bias', 'model.medical_vision_tower.trunk.blocks.0.attn.qkv.weight', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc1.bias', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc1.weight', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc2.bias', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc2.weight', 'model.medical_vision_tower.trunk.blocks.0.norm1.bias', 'model.medical_vision_tower.trunk.blocks.0.norm1.weight', 'model.medical_vision_tower.trunk.

### Reporting

In [1]:
# Reporting stage: runs llava_phi/train/train.py starting from the Reasoning
# output and writes to (and resumes from) the Reporting output directory.
# Flags are grouped by concern: paths/data, multimodal input handling,
# trainable parts, precision/throughput, optimisation, checkpoint/eval, logging.
!PYTHONPATH=. NCCL_P2P_DISABLE=1 NCCL_IB_DISABLE=1 \
CUDA_LAUNCH_BLOCKING=1 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
python llava_phi/train/train.py \
--model_name_or_path /media/volume/Slava/Dual-View-Slava/Reasoning \
--resume_from_checkpoint /media/volume/Slava/Dual-View-Slava/Reporting \
--output_dir /media/volume/Slava/Dual-View-Slava/Reporting \
--image_folder /media/volume/Slava/MIMIC_Dataset224 \
--data_path ./slava_llava_report.json \
--version v0 \
--is_multimodal true \
--image_aspect_ratio pad \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--group_by_modality_length False \
--lazy_preprocess True \
--remove_unused_columns False \
--model_max_length 2048 \
--tune_mm_mlp_adapter True \
--freeze_vision_tower False \
--freeze_backbone False \
--bf16 True \
--tf32 True \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--optim "adamw_bnb_8bit" \
--learning_rate 2e-5 \
--lr_scheduler_type "cosine" \
--warmup_ratio 0.03 \
--weight_decay 0.0 \
--max_grad_norm 1.0 \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--save_strategy "steps" \
--save_steps 500 \
--save_total_limit 2 \
--evaluation_strategy "steps" \
--eval_steps 500 \
--logging_steps 10 \
--report_to wandb

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|██████████████████| 3/3 [00:00<00:00, 12.19it/s]
Some weights of the model checkpoint at /media/volume/Slava/Dual-View-Slava/Reasoning were not used when initializing LlavaPhiForCausalLM: ['model.medical_vision_tower.head.proj.weight', 'model.medical_vision_tower.trunk.blocks.0.attn.proj.bias', 'model.medical_vision_tower.trunk.blocks.0.attn.proj.weight', 'model.medical_vision_tower.trunk.blocks.0.attn.qkv.bias', 'model.medical_vision_tower.trunk.blocks.0.attn.qkv.weight', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc1.bias', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc1.weight', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc2.bias', 'model.medical_vision_tower.trunk.blocks.0.mlp.fc2.weight', 'model.medical_vision_tower.trunk.blocks.0.norm1.bias', 'model.medical_vision_tower.trunk.blocks.0.norm1.weight', 'model.medical_vision_tower.trunk.bl