In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input mount,
# then echo each path on its own line.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the Python dependencies used by the fine-tuning and inference cells below.
# NOTE(review): this comment originally said "miniCPM-V", but the training cell in this
# notebook uses --model_type qwen2-vl-2b-instruct.
# transformers is installed from the GitHub default branch (no tag or commit is pinned).
!pip install git+https://github.com/huggingface/transformers
# The remaining packages are upgraded to whatever is latest (no version pins).
!pip install --upgrade modelscope sentencepiece accelerate bitsandbytes datamodel_code_generator

In [None]:
# The requirement is quoted so the shell hands ">=0.9.3" to pip as a version specifier.
# Unquoted, ">" is an output redirection: pip would install an unconstrained deepspeed
# and its output would be written to a file named "=0.9.3".
!pip install "deepspeed>=0.9.3"
# Exact nightly build of liger-kernel (pinned).
!pip install liger-kernel-nightly==0.2.1.dev20240911164559

In [None]:
# Clone the ModelScope SWIFT repository (default branch; no tag or commit is pinned).
!git clone https://github.com/modelscope/swift.git
# %cd changes the kernel's working directory, so subsequent cells run inside swift/
# until a later `%cd ..`.
%cd swift
# Editable install of the checkout together with its "llm" extra.
!pip install -e '.[llm]'

In [None]:
# Install the `pyav` package from PyPI (unpinned).
# NOTE(review): presumably needed for media decoding in the Qwen2-VL pipeline — confirm it is required.
!pip install pyav

In [None]:
# Install `qwen_vl_utils` (unpinned).
# NOTE(review): presumably the helper package for Qwen2-VL vision inputs — confirm against the SWIFT docs.
!pip install qwen_vl_utils

## QWEN2-VL-2B DATASET CREATION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fraction of the training rows kept for fine-tuning.
SAMPLE_FRACTION = 0.01

# Read the full training CSV.
df = pd.read_csv('/kaggle/input/amazon-ml-train/train.csv')

# Absolute number of rows to sample (SAMPLE_FRACTION of the total, rounded down).
sample_size = int(len(df) * SAMPLE_FRACTION)

# Stratified sampling on entity_name. The row count is passed as an integer
# train_size so the split yields exactly sample_size rows; an integer count avoids
# the floating-point rounding a fractional size goes through when scikit-learn
# converts it back to a number of rows.
stratified_sample, _ = train_test_split(
    df,
    train_size=sample_size,
    stratify=df['entity_name'],
    random_state=42  # for reproducibility
)

# Verify the sample size
assert len(stratified_sample) == sample_size, "stratified sample has an unexpected size"
print(f"Original dataset size: {len(df)}")
print(f"Sampled dataset size: {len(stratified_sample)}")

# Verify that the per-entity proportions of the sample track the full dataset.
print("\nOriginal distribution:")
print(df['entity_name'].value_counts(normalize=True))
print("\nSampled distribution:")
print(stratified_sample['entity_name'].value_counts(normalize=True))

# Save the sampled dataset. index=False drops the original row labels, so readers of
# this file see a fresh 0..n-1 index.
stratified_sample.to_csv('/kaggle/working/sample_train.csv', index=False)
print("\nSampled dataset saved as 'sample_train.csv'")

In [None]:
# Step back out of the cloned swift/ directory entered by the earlier `%cd swift`.
%cd ..

In [None]:
# download images
import os
from time import time
import multiprocessing
import logging
import requests
import pandas as pd
import time as t
import random 

# Configure root logging: INFO level, with timestamp, level name and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Destination folder for the downloaded training images. This is a relative path, so it
# is resolved against the kernel's current working directory.
# NOTE(review): the dataset-building cell reads from '/kaggle/working/image_dir/'; the two
# only match when the working directory is /kaggle/working.
img_dir = 'image_dir'

def get_sbu_urls(csv_path='/kaggle/working/sample_train.csv'):
    """Return ``(row_index, image_url)`` pairs read from a CSV file.

    Args:
        csv_path: Path of a CSV containing an ``image_link`` column. Defaults to
            the sampled training file this notebook writes to
            ``/kaggle/working/sample_train.csv``.

    Returns:
        list[tuple]: One ``(index, url)`` tuple per row, where ``index`` is the
        row's label in the DataFrame returned by ``pd.read_csv``.
    """
    df = pd.read_csv(csv_path)
    return list(zip(df.index, df['image_link'].tolist()))

def scrape_and_save(args):
    """Download one image and write it to disk, retrying on HTTP/network errors.

    Args:
        args: ``(url, savepath)`` tuple, packed into a single argument so the
            function can be dispatched through ``multiprocessing.Pool``.

    Returns:
        bool: True once the response body has been written to ``savepath``;
        False if every attempt failed or the file could not be written.
    """
    url, savepath = args
    retries = 3
    # One session is shared by all attempts; the context manager closes it on exit.
    with requests.Session() as session:
        for attempt in range(retries):
            try:
                response = session.get(url, timeout=1)
                response.raise_for_status()
                content = response.content
            except requests.RequestException as e:
                # Network/HTTP failure: record it at DEBUG level, then retry.
                logging.debug(f"Error downloading {url}: {e} (Attempt {attempt+1}/{retries})")
                if attempt < retries - 1:
                    t.sleep(2)  # small delay before retrying
                continue

            try:
                with open(savepath, 'wb') as f:
                    f.write(content)
            except IOError as e:
                logging.debug(f"Error saving file {savepath}: {e}")
                # Drop a partially written file so that a later "does the image
                # exist" check cannot pick up a truncated download.
                try:
                    os.remove(savepath)
                except OSError:
                    pass
                return False
            return True
    return False

if __name__ == '__main__':
    # Download every image listed by get_sbu_urls() into img_dir, in chunks, using a
    # pool of 16 worker processes per chunk.
    startidx = 0
    chunk_size = 5000  # Break downloads into chunks
    urls = get_sbu_urls()
    total_urls = len(urls)

    # exist_ok makes this a no-op when the folder already exists (e.g. on a re-run).
    os.makedirs(img_dir, exist_ok=True)

    starttime = time()
    piccounter = 0

    # Download in chunks
    for chunk_start in range(startidx, total_urls, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_urls)
        url_chunk = urls[chunk_start:chunk_end]

        # One (url, savepath) argument tuple per image; the file name carries the
        # row index so the image can be matched back to its CSV row.
        tasks = [
            (url, os.path.join(img_dir, f'train_{index}.jpg'))
            for index, url in url_chunk
        ]

        # A fresh pool per chunk. map() blocks until every task has returned, and the
        # context manager tears the workers down even if a task raises.
        with multiprocessing.Pool(16) as pool:
            results = pool.map(scrape_and_save, tasks)

        # Count successful downloads in the chunk (each result is a bool).
        successful_downloads = sum(results)
        piccounter += successful_downloads

        # Report against the real chunk length: the last chunk may be shorter than chunk_size.
        logging.info(f"Downloaded {successful_downloads}/{len(url_chunk)} in chunk {chunk_start//chunk_size + 1}. Total downloaded: {piccounter}/{total_urls}")

        # Throttle requests to prevent server throttling; no pause is needed after the final chunk.
        if chunk_end < total_urls:
            t.sleep(10)  # 10-second break between chunks

    print(f"Downloaded {piccounter}/{total_urls} images.")
    print(f"Total time taken: {time() - starttime:.2f} seconds")

In [None]:
import pandas as pd
import json
import os

def prepare_dataset(csv_path, image_dir, output_json_path, question, answer_column):
    """Build the training JSON (system/query/response/images records) from a CSV.

    Rows are skipped, with a printed warning, when the matching image
    ``train_<row index>.jpg`` is absent from ``image_dir`` or when the answer
    cell is missing or blank.

    Args:
        csv_path: CSV with an ``entity_name`` column and the ``answer_column``.
        image_dir: Directory holding the ``train_<row index>.jpg`` images.
        output_json_path: Where the JSON list of records is written.
        question: Accepted for interface compatibility but not read; the query
            text is built from the fixed template below.
        answer_column: Column holding the ground-truth value. Only its first
            whitespace-separated token is stored as the response.

    Returns:
        None. The records are written to ``output_json_path``.
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)

    # Prepare the data in the required format
    data = []
    for index, row in df.iterrows():
        image_path = os.path.join(image_dir, f"train_{index}.jpg")

        # Check if the image file exists
        if not os.path.exists(image_path):
            print(f"Warning: Image file not found for index {index}")
            continue

        # A missing cell is read as NaN (a float, which has no .split()), and a blank
        # one splits to an empty list; neither can provide a response token.
        answer = row[answer_column]
        answer_tokens = [] if pd.isna(answer) else str(answer).split()
        if not answer_tokens:
            print(f"Warning: No {answer_column} found for index {index}")
            continue

        entry = {
            "system": "You are an entity extractor OCR model. Given the entity, you can extract the text from the image denoting the entity value. Entity value is always a number.",
            "query": f"<image>What is the value of the {row['entity_name']} of the item shown? Give me numerical value as seen in the image. Adhere to instructions.",
            "response": f"{answer_tokens[0]}",
            "images": [f"{image_path}"]
        }
        data.append(entry)

    # Write the data to a JSON file
    with open(output_json_path, 'w') as f:
        json.dump(data, f, indent=2)

    print(f"Dataset prepared and saved to {output_json_path}")

# Usage
csv_path = '/kaggle/working/sample_train.csv'
image_dir = '/kaggle/working/image_dir/'
output_json_path = '/kaggle/working/train.json'
question = "What is the value of the {row['entity_name']} of the item shown? Give me numerical value as seen in the image. Adhere to instructions."
answer_column = 'entity_value'

prepare_dataset(csv_path, image_dir, output_json_path, question, answer_column)

In [None]:
# Fine-tune qwen2-vl-2b-instruct with LoRA (rank 32) on /kaggle/working/train.json for one epoch.
# Original author note: "Single-card A10/3090 can run", "GPU Memory: 20GB".
# NOTE(review): CUDA_VISIBLE_DEVICES=0,1 exposes two GPUs, which does not match the
# single-card note above — confirm which setup is intended.
!SIZE_FACTOR=8 MAX_PIXELS=602112 CUDA_VISIBLE_DEVICES=0,1 swift sft \
    --model_type qwen2-vl-2b-instruct \
    --model_id_or_path qwen/Qwen2-VL-2B-Instruct \
    --sft_type lora \
    --dataset /kaggle/working/train.json \
    --freeze_vit \
    --seed 42 \
    --max_length 128 \
    --lora_rank 32 \
    --use_liger \
    --num_train_epochs 1

In [None]:
# Archive one training checkpoint into checkpoint163.zip.
# NOTE(review): the run directory (v3-20240914-130231) and step (checkpoint-163) are
# hard-coded from one specific run; a fresh training run will presumably write to a
# different path — update this before re-running.
!zip -r checkpoint163.zip /kaggle/working/output/qwen2-vl-2b-instruct/v3-20240914-130231/checkpoint-163/

In [None]:
# List the current directory (long format, human-readable sizes, hidden files included).
!ls -lah

## INFERENCE

In [None]:
# download images
import os
from time import time
import multiprocessing
import logging
import requests
import pandas as pd
import time as t
import random 

# Configure root logging: INFO level, with timestamp, level name and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Destination folder for the downloaded sample-test images (relative to the current
# working directory).
# NOTE(review): the cell that builds test.json looks for images in
# '/kaggle/working/test_image_dir/', not in this folder — confirm which one is intended.
img_dir = 'sample_test_image_dir'

def get_sbu_urls(csv_path='/kaggle/input/sample-test-amc/sample_test.csv'):
    """Return ``(row_index, image_url)`` pairs read from a CSV file.

    Args:
        csv_path: Path of a CSV containing an ``image_link`` column. Defaults to
            the sample test file under ``/kaggle/input/sample-test-amc``.

    Returns:
        list[tuple]: One ``(index, url)`` tuple per row, where ``index`` is the
        row's label in the DataFrame returned by ``pd.read_csv``.
    """
    df = pd.read_csv(csv_path)
    return list(zip(df.index, df['image_link'].tolist()))

def scrape_and_save(args):
    """Download one image and write it to disk, retrying on HTTP/network errors.

    Args:
        args: ``(url, savepath)`` tuple, packed into a single argument so the
            function can be dispatched through ``multiprocessing.Pool``.

    Returns:
        bool: True once the response body has been written to ``savepath``;
        False if every attempt failed or the file could not be written.
    """
    url, savepath = args
    retries = 3
    # One session is shared by all attempts; the context manager closes it on exit.
    with requests.Session() as session:
        for attempt in range(retries):
            try:
                response = session.get(url, timeout=1)
                response.raise_for_status()
                content = response.content
            except requests.RequestException as e:
                # Network/HTTP failure: record it at DEBUG level, then retry.
                logging.debug(f"Error downloading {url}: {e} (Attempt {attempt+1}/{retries})")
                if attempt < retries - 1:
                    t.sleep(2)  # small delay before retrying
                continue

            try:
                with open(savepath, 'wb') as f:
                    f.write(content)
            except IOError as e:
                logging.debug(f"Error saving file {savepath}: {e}")
                # Drop a partially written file so that a later "does the image
                # exist" check cannot pick up a truncated download.
                try:
                    os.remove(savepath)
                except OSError:
                    pass
                return False
            return True
    return False

if __name__ == '__main__':
    # Download every image listed by get_sbu_urls() into img_dir, in chunks, using a
    # pool of 16 worker processes per chunk.
    startidx = 0
    chunk_size = 5000  # Break downloads into chunks
    urls = get_sbu_urls()
    total_urls = len(urls)

    # exist_ok makes this a no-op when the folder already exists (e.g. on a re-run).
    os.makedirs(img_dir, exist_ok=True)

    starttime = time()
    piccounter = 0

    # Download in chunks
    for chunk_start in range(startidx, total_urls, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_urls)
        url_chunk = urls[chunk_start:chunk_end]

        # One (url, savepath) argument tuple per image; the file name carries the
        # row index so the image can be matched back to its CSV row.
        tasks = [
            (url, os.path.join(img_dir, f'test_{index}.jpg'))
            for index, url in url_chunk
        ]

        # A fresh pool per chunk. map() blocks until every task has returned, and the
        # context manager tears the workers down afterwards, so no worker processes
        # are left behind between chunks.
        with multiprocessing.Pool(16) as pool:
            results = pool.map(scrape_and_save, tasks)

        # Count successful downloads in the chunk (each result is a bool).
        successful_downloads = sum(results)
        piccounter += successful_downloads

        # Report against the real chunk length: the last chunk may be shorter than chunk_size.
        logging.info(f"Downloaded {successful_downloads}/{len(url_chunk)} in chunk {chunk_start//chunk_size + 1}. Total downloaded: {piccounter}/{total_urls}")

        # Throttle requests to prevent server throttling; no pause is needed after the final chunk.
        if chunk_end < total_urls:
            t.sleep(10)  # 10-second break between chunks

    print(f"Downloaded {piccounter}/{total_urls} images.")
    print(f"Total time taken: {time() - starttime:.2f} seconds")

In [None]:
import pandas as pd
import json
import os

def prepare_dataset(csv_path, image_dir, output_json_path, question, answer_column, max_rows=10000):
    """Build the inference JSON (system/query/empty response/images) from a CSV.

    Only rows whose image ``test_<row index>.jpg`` exists in ``image_dir`` are
    written; the number of rows skipped for a missing image is reported once at
    the end so an empty or short dataset does not go unnoticed.

    Args:
        csv_path: CSV with an ``entity_name`` column.
        image_dir: Directory holding the ``test_<row index>.jpg`` images.
        output_json_path: Where the JSON list of records is written.
        question: Accepted for interface compatibility but not read; the query
            text is built from the fixed template below.
        answer_column: Accepted for interface compatibility but not read; every
            record gets an empty response.
        max_rows: Only the first ``max_rows`` rows of the CSV are considered
            (``None`` keeps all rows). Defaults to 10000.

    Returns:
        None. The records are written to ``output_json_path``.
    """
    # Read the CSV file and keep only the leading max_rows rows.
    df = pd.read_csv(csv_path)
    df = df[:max_rows]
    # Prepare the data in the required format
    data = []
    missing_images = 0
    for index, row in df.iterrows():
        image_path = os.path.join(image_dir, f"test_{index}.jpg")

        # Rows without a downloaded image are counted rather than warned about
        # one by one, to keep the output short.
        if not os.path.exists(image_path):
            missing_images += 1
            continue

        entry = {
            "system": "You are an entity extractor OCR model. Given the entity, you can extract the text from the image denoting the entity value. Entity value is always a number.",
            "query": f"<image>What is the value of the {row['entity_name']} of the item shown? Give me numerical value as seen in the image. Adhere to instructions.",
            "response": "",
            "images": [f"{image_path}"]
        }
        data.append(entry)

    # Write the data to a JSON file
    with open(output_json_path, 'w') as f:
        json.dump(data, f, indent=2)

    if missing_images:
        print(f"Warning: skipped {missing_images} of {len(df)} rows with no image file in {image_dir}")
    print(f"Dataset prepared and saved to {output_json_path}")

# Usage
csv_path = '/kaggle/input/testfile/test.csv'
image_dir = '/kaggle/working/test_image_dir/'
output_json_path = '/kaggle/working/test.json'
question = "What is the value of the {row['entity_name']} of the item shown? Give me numerical value as seen in the image. Adhere to instructions."
answer_column = 'entity_value'

prepare_dataset(csv_path, image_dir, output_json_path, question, answer_column)

In [None]:
# Run the inference script from the `amccode` input dataset on the sample test CSV,
# reading images from sample_test_image_dir and writing predictions to sample_test_preds.csv.
# NOTE(review): inference.py is not part of this notebook; --start/--end/--chunk_size
# presumably select a row range and a batch size — confirm against the script.
!python /kaggle/input/amccode/inference.py \
    --csv_file /kaggle/input/sample-test-amc/sample_test.csv \
    --start 0 \
    --end 88 \
    --chunk_size 5000 \
    --img_dir /kaggle/working/sample_test_image_dir \
    --output_csv /kaggle/working/sample_test_preds.csv

In [None]:
# Run SWIFT inference with the fine-tuned checkpoint on /kaggle/working/test.json.
# NOTE(review): --ckpt_dir is hard-coded to one specific training run
# (v3-20240914-130231/checkpoint-163); update it after re-training.
# The last argument carries no trailing backslash, so the command ends on its own line.
!CUDA_VISIBLE_DEVICES=0,1 swift infer \
    --ckpt_dir /kaggle/working/output/qwen2-vl-2b-instruct/v3-20240914-130231/checkpoint-163 \
    --load_dataset_config true --merge_lora true \
    --dataset /kaggle/working/test.json \
    --dataset_test_ratio 1

# !CUDA_VISIBLE_DEVICES=0,1 swift infer \
#     --ckpt_dir /kaggle/working/output/qwen2-vl-2b-instruct/v3-20240914-130231/checkpoint-163 \
#     --load_args_from_ckpt_dir \
#     --dataset /kaggle/working/test.json \

In [None]:
# Reload the first 10,000 rows of the test CSV — the same file and row limit used when
# building test.json.
df = pd.read_csv("/kaggle/input/testfile/test.csv")[:10000]

In [1]:
# Display the test DataFrame.
# NOTE(review): the recorded NameError shows this cell was executed (In [1]) on a fresh
# kernel before the cell that defines `df`; run the cells in order.
df

NameError: name 'df' is not defined