Dataset provided is of the following structure (only showing important parts):

66e31d6ee96cd_student_resource_3/ <br/>
├─ student_resource 3/ <br/>
│  ├─ dataset/ <br/>
│  │  ├─ sample_test <br/>
│  │  ├─ sample_test_out <br/>
│  │  ├─ sample_test_out_fail<br/>
│  │  ├─ test <br/>
│  │  ├─ train<br/>
│  ├─ README<br/>
│  ├─ src/<br/>
│  │  ├─ utils.py<br/>
│  │  ├─ sanity.py<br/>
│  │  ├─ constants.py<br/>
│  │  ├─ .DS_Store<br/>
│  │  ├─ test.ipynb<br/>
│  ├─ sample_code<br/>
│  ├─ .DS_Store<br/>
├─ __MACOSX/<br/>


Extract Url Images for Test to Google Drive

Steps:

    1:  Upload the test.csv on the Colab Workspace (Not Google Drive)
    2:  Make sure the required modules are installed: requests, tqdm

you may then run the code below

In [None]:
!pip install transformers
!pip install timm

In [None]:
from google.colab import drive
# Mount Google Drive so that the download folder below lives under /content/drive.
drive.mount('/content/drive')

import os
import csv
import requests
from tqdm import tqdm


# Destination on the mounted Drive; created up front so the downloader can write into it.
save_folder = '/content/drive/MyDrive/Amazon_Hackathon_Images'
os.makedirs(save_folder, exist_ok=True)


# Relative path: the CSV is expected in the current working directory.
csv_file_path = 'test.csv'

def download_images(csv_file, folder, timeout=30):
    """Download every image listed in a CSV file into ``folder``.

    Each row's ``image_link`` value is fetched over HTTP and written to a file
    named after the last path segment of the URL. Failures are reported with
    ``print`` and never interrupt the remaining downloads.

    Args:
        csv_file: Path to a CSV file that has an ``image_link`` column.
        folder: Existing directory that receives the downloaded files.
        timeout: Seconds to wait on the server before giving up on one image.
            Without a timeout a single stalled connection blocks the whole run.
    """
    # newline='' is what the csv module expects for files it reads itself.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.DictReader(file)
        for row in tqdm(reader):
            image_url = row['image_link']
            try:
                # The context manager releases the streamed connection even
                # when the body is never consumed (non-200) or a write fails.
                with requests.get(image_url, stream=True, timeout=timeout) as response:
                    if response.status_code != 200:
                        print(f"Failed to download {image_url}")
                        continue

                    image_name = image_url.split("/")[-1]
                    image_path = os.path.join(folder, image_name)

                    with open(image_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
            except Exception as e:
                # Best effort: report and move on to the next image.
                print(f"Error downloading {image_url}: {str(e)}")

# Fetch every image referenced by the CSV into the Drive folder created above.
download_images(csv_file_path, save_folder)

------------------------- Optional Code Begins -------------------------------

NOTE : Optional part below

Due to insufficient resources, we had to run inference on multiple Google Colab accounts. To share the image folder among them, we zipped it.

In [None]:
!zip -r Images.zip /content/drive/MyDrive/Amazon_Hackathon_Images

Then follow these steps:

    1: Share the zip file with the other account
    2: in the other account make a shortcut to this file in MyDrive

the location to this zip file must be '/content/drive/MyDrive/Images.zip'


In [None]:
!unzip '/content/drive/MyDrive/Images.zip' -d 'Images'

this will successfully transfer the images to the Colab Workspace in the folder Images/Images/

-----------------------------Optional Part Ends ---------------------------------

Note: In the optional part we transferred all images to the Colab workspace folder Images/Images/. From now on we treat this as the folder to work with.

If you skip the optional code: create this folder yourself, mount Google Drive, and copy all images into it.

Image cropping and enhancement

    We cropped a fixed margin from every side of each image after carefully inspecting the given data.

    After that, we upscaled each image to 2x its size.

simply following the code below will work

In [None]:
# Image cropper



import os
from PIL import Image

def crop_image(image_path, left, top, right, bottom):
    """Return the image at ``image_path`` with a margin trimmed off each side.

    Args:
        image_path: Path of the image file to open.
        left: Pixels to remove from the left edge.
        top: Pixels to remove from the top edge.
        right: Pixels to remove from the right edge.
        bottom: Pixels to remove from the bottom edge.

    Returns:
        The cropped image object produced by ``img.crop``.

    Raises:
        ValueError: If the margins leave no pixels, i.e. the image is not
            larger than ``left + right`` by ``top + bottom``.
    """
    with Image.open(image_path) as img:
        width, height = img.size
        box = (left, top, width - right, height - bottom)
        # Reject an empty or inverted box explicitly: callers save the result
        # over the source file, so a bad crop must never be returned.
        if box[2] <= box[0] or box[3] <= box[1]:
            raise ValueError(
                f"image of size {width}x{height} is too small for margins "
                f"left={left}, top={top}, right={right}, bottom={bottom}"
            )
        return img.crop(box)

def process_images_in_folder(folder_path, left, top, right, bottom):
    """Crop every image file found directly inside ``folder_path`` in place.

    Files are selected by extension (case-insensitive). Each one is cropped
    with ``crop_image`` and saved back over the original path. A failure on
    one file is printed and the loop carries on with the next file.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        left, top, right, bottom: Margins forwarded to ``crop_image``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(image_suffixes):
            continue
        target = os.path.join(folder_path, entry)
        try:
            crop_image(target, left, top, right, bottom).save(target)
            print(f"Processed and saved: {entry}")
        except Exception as err:
            print(f"Error processing {entry}: {str(err)}")

# Usage
images_folder = "Images/Images"
# The same 20-pixel margin is trimmed from every side.
left_crop = top_crop = right_crop = bottom_crop = 20

process_images_in_folder(
    images_folder,
    left=left_crop,
    top=top_crop,
    right=right_crop,
    bottom=bottom_crop,
)
print("All images have been processed.")

In [None]:
# image upscaling


import cv2
import os

def upscale_images_in_folder(input_folder, output_folder, scale_factor):
    """Resize every image in ``input_folder`` and write it to ``output_folder``.

    Files are selected by extension. Each one is read with OpenCV, resized by
    ``scale_factor`` in both dimensions using bilinear interpolation, and
    written under the same file name in ``output_folder``. Unreadable or
    unwritable files are reported with ``print`` and skipped.

    Args:
        input_folder: Directory whose entries are scanned (not recursive).
        output_folder: Directory for the resized images; created if missing.
        scale_factor: Multiplier applied to width and height (2.0 doubles both).
    """
    os.makedirs(output_folder, exist_ok=True)

    extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    for filename in os.listdir(input_folder):
        # Case-insensitive, like the cropping step, so that files such as
        # "photo.JPG" are not silently left out of the upscaled set.
        if not filename.lower().endswith(extensions):
            continue

        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)
        if img is None:
            print(f"Failed to read image: {img_path}")
            continue

        # shape is (rows, columns, ...), hence index 1 is the width.
        new_width = int(img.shape[1] * scale_factor)
        new_height = int(img.shape[0] * scale_factor)
        upscaled_img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_LINEAR)

        output_path = os.path.join(output_folder, filename)
        # imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_path, upscaled_img):
            print(f"Failed to write image: {output_path}")
            continue

        print(f"Upscaled and saved: {output_path}")

# Example usage
# Read the images from Images/Images and write the resized copies to a separate folder.
input_folder = "Images/Images"  # Specify your input folder
output_folder = "Images2"  # Specify where to save the upscaled images
scale_factor = 2.0  # Scale factor (2.0 means double the size)

upscale_images_in_folder(input_folder, output_folder, scale_factor)

Setting up InternVL for inference

In [None]:
import numpy as np
import torch
import torchvision.transforms as T
# from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Per-channel mean and standard deviation passed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The pipeline converts the image to RGB when needed, resizes it to a square
    of ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor and normalizes it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        A ``T.Compose`` object applying the steps in that order.
    """
    def to_rgb(img):
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(to_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Candidate ``(columns, rows)`` pairs, tried in order.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile in pixels.

    Returns:
        The candidate with the smallest absolute ratio difference, or
        ``(1, 1)`` when there are no candidates. On an exact tie a later
        candidate replaces the current one only if the source image area
        exceeds half the area that candidate's grid would cover.
    """
    image_area = width * height
    closest = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)
        if gap < smallest_gap:
            smallest_gap, closest = gap, candidate
            continue
        # Tie: prefer the later grid only for images big enough to fill it.
        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
            closest = candidate
    return closest

def dynamic_preprocess(image, min_num=1, max_num=4, image_size=896, use_thumbnail=False):
    """Split an image into a grid of square tiles.

    The grid shape is the ``(columns, rows)`` pair, with between ``min_num``
    and ``max_num`` tiles, that ``find_closest_aspect_ratio`` selects for the
    image. The image is resized to exactly fill that grid and cut into tiles
    row by row.

    Args:
        image: Source image exposing ``size``, ``resize`` and ``crop``.
        min_num: Minimum number of tiles in the grid.
        max_num: Maximum number of tiles in the grid.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: When true and more than one tile was produced, append
            the whole image resized to a single tile.

    Returns:
        List of tile images, followed by the optional thumbnail.
    """
    source_width, source_height = image.size
    source_ratio = source_width / source_height

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    candidate_grids = set(
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if cols * rows <= max_num and cols * rows >= min_num)
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    chosen_grid = find_closest_aspect_ratio(
        source_ratio, candidate_grids, source_width, source_height, image_size)

    target_width = image_size * chosen_grid[0]
    target_height = image_size * chosen_grid[1]
    blocks = chosen_grid[0] * chosen_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size
    processed_images = []
    for index in range(blocks):
        # Tiles are numbered left to right, then top to bottom.
        row, col = divmod(index, tiles_per_row)
        box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        processed_images.append(resized_img.crop(box))
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images

def load_image(image_file, input_size=448, max_num=4):
    """Load an image file and turn it into a stacked tensor of tiles.

    The image is opened, converted to RGB, tiled with ``dynamic_preprocess``
    (thumbnail enabled) and each tile is passed through ``build_transform``.

    Args:
        image_file: Path (or file object) accepted by ``Image.open``.
        input_size: Side length of each tile in pixels.
        max_num: Maximum number of tiles, forwarded to ``dynamic_preprocess``.

    Returns:
        The result of ``torch.stack`` over the transformed tiles.
    """
    rgb_image = Image.open(image_file).convert('RGB')
    tile_transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([tile_transform(tile) for tile in tiles])

# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
# Identifier of the checkpoint passed to from_pretrained for both model and tokenizer.
path = 'OpenGVLab/InternVL2-1B'
# Load the model with float16 weights, switch it to eval mode and move it to the GPU.
# NOTE(review): trust_remote_code=True presumably runs modelling code shipped with the
# checkpoint repository — confirm the source is trusted (consider pinning a revision).
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
# Tokenizer from the same checkpoint; it is handed to model.chat during inference.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=True)


The code below defines, for each entity, the allowed unit names together with their short forms

In [None]:
# Full unit name -> short form. Defined once per family so that the entities
# sharing a family cannot drift apart (e.g. a stray trailing space in 'm').
_LENGTH_UNITS = {
    'centimetre': 'cm',
    'foot': 'ft',
    'inch': '"',
    'millimetre': 'mm',
    'metre': 'm',
    'yard': 'yard',
}
_WEIGHT_UNITS = {
    'gram': 'g',
    'kilogram': 'kg',
    'microgram': 'µg',
    'milligram': 'mg',
    'ounce': 'oz',
    'pound': 'lb',
    'ton': 't',
}

# For each entity name: the allowed units, keyed by full name, with their short form.
entity_unit_map = {
    'width': dict(_LENGTH_UNITS),
    'depth': dict(_LENGTH_UNITS),
    'height': dict(_LENGTH_UNITS),
    'item_weight': dict(_WEIGHT_UNITS),
    'maximum_weight_recommendation': dict(_WEIGHT_UNITS),
    'voltage': {'kilovolt': 'kV', 'millivolt': 'mV', 'volt': 'V'},
    'wattage': {'kilowatt': 'kW', 'watt': 'W'},
    'item_volume': {
        'centilitre': 'cL',
        'cubic foot': 'ft³',
        'cubic inch': 'in³',
        'cup': 'cup',
        'decilitre': 'dL',
        'fluid ounce': 'fl oz',
        'gallon': 'US gal',
        'imperial gallon': 'UK gal',
        'litre': 'L',
        'microlitre': 'µL',
        'millilitre': 'mL',
        'pint': 'pint',
        'quart': 'quart',
    },
}

This code builds, for each entity, a map from every unit spelling (full name or short form) to the actual unit name, and saves it as a pickle file

In [None]:
import pickle

# Entity names, in the order they appear in entity_unit_map.
entities = list(entity_unit_map)

# For each entity: every accepted spelling (full unit name or short form)
# mapped to the full unit name.
Dict = {}
for entity_name, units in entity_unit_map.items():
    spellings = {}
    for unit_name, short_form in units.items():
        spellings[unit_name] = unit_name
        spellings[short_form] = unit_name
    Dict[entity_name] = spellings
print(Dict)

with open('Map.pkl', 'wb') as file:
    pickle.dump(Dict, file)

Load the pickle file

In [None]:
import pickle

# Read back the lookup written to Map.pkl; getval receives it as its loaded_data argument.
# NOTE(review): pickle.load must only be used on files you created yourself.
with open('Map.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

def getval(entity,text,loaded_data):
    """Extract the first ``"<number> <unit>"`` pair for ``entity`` from ``text``.

    The text is lower-cased and split on whitespace. A word that starts with a
    number is split into the number and the remainder (``"12.5cm"`` becomes
    ``"12.5"`` and ``"cm"``). Numbers are then scanned left to right, and the
    first one directly followed by a unit spelling known for ``entity`` wins.

    A spelling matches when all of its words but the last equal the following
    tokens and its last word is a *prefix* of the corresponding token, so
    ``"inch"`` also matches ``"inches"``.
    NOTE(review): the prefix rule also lets short forms match unrelated words
    (``"t"`` matches ``"times"``, ``"m"`` matches ``"minutes"``); tightening it
    would change existing predictions, so it is kept as is.

    Args:
        entity: Key into ``loaded_data`` selecting the unit spellings to use.
        text: Free text to search.
        loaded_data: Mapping ``entity -> {spelling: unit name}``; spellings
            are tried in the mapping's iteration order.

    Returns:
        ``"<number> <unit name>"`` with the number exactly as written in the
        text, or ``None`` when no number is followed by a known unit.

    Raises:
        KeyError: If ``text`` contains a number and ``entity`` is not a key of
            ``loaded_data``.
    """
    import math  # local import: nothing imports math before this cell runs

    def numeric_prefix(word):
        # Longest leading part of `word` for which every prefix parses as a float.
        end = 0
        while end < len(word):
            try:
                float(word[:end + 1])
            except ValueError:
                break
            end += 1
        return word[:end]

    def is_number(token):
        # float() also accepts "nan", "inf" and "infinity"; those are ordinary
        # words here, never measurements, so only finite values count.
        try:
            return math.isfinite(float(token))
        except ValueError:
            return False

    tokens = []
    for word in text.lower().split():
        number = numeric_prefix(word)
        if number:
            tokens.append(number)
        if len(number) < len(word):
            tokens.append(word[len(number):])

    number_positions = [pos for pos, token in enumerate(tokens) if is_number(token)]
    if not number_positions:
        return None

    # Spellings are compared word by word ("fl oz" -> ["fl", "oz"]).
    spellings = [(key.lower().split(), unit_name)
                 for key, unit_name in loaded_data[entity].items()]

    for pos in number_positions:
        # A unit has to start right after the number and spans at most 3 tokens.
        following = tokens[pos + 1:pos + 4]
        for words, unit_name in spellings:
            if not words or len(words) > len(following):
                continue
            *leading, last = words
            if following[:len(leading)] == leading and following[len(leading)].startswith(last):
                return tokens[pos] + " " + unit_name
    return None

This is the pipeline for inference:

In [None]:
import os
import math
import re
import pandas as pd
# Wording used in the prompt for each entity; other entities are used verbatim.
_PROMPT_LABELS = {
    "height": "vertical height",
    "depth": "vertical depth",
    "width": "horizontal width",
    "item_weight": "net item weight",
    "maximum_weight_recommendation": "maximum weight recommendation",
    "voltage": "mentioned voltage",
}

# Unit spellings that may follow a numeric range, mapped to the full unit name.
# Spellings not listed here are written out unchanged.
_RANGE_UNIT_NAMES = {
    'V': 'volt',
    'v': 'volt',
    'volts': 'volt',
    'mm': 'millimetre',
    'MM': 'millimetre',
    'm': 'metre',
    'metres': 'metre',
    'meter': 'metre',
    'M': 'metre',
    'cm': 'centimetre',
    'CM': 'centimetre',
    'centimeters': 'centimetre',
    'kg': 'kilogram',
    'KG': 'kilogram',
    'kilograms': 'kilogram',
    'G': 'gram',
    'g': 'gram',
    'grams': 'gram',
    'yards': 'yard',
    '\"': 'inch',
    'A': 'ampere',
    'a': 'ampere',
    'amperes': 'ampere',
    'W': 'watt',
    'watts': 'watt',
    'w': 'watt',
    'Hz': 'hertz',
    'hz': 'hertz',
}

# "<low>-<high> <unit>", e.g. "100-240V" or "100-240 V".
_RANGE_PATTERN = re.compile(r"(\d+)-(\d+)\s*([a-zA-Z]+)")

# Mixed fractions in inches, e.g. '5 1/2 inches' or '5 1/2"'. The whitespace
# between whole part and numerator is optional so that run-together input such
# as "11/2 inches" is still read as 1 1/2.
_MIXED_FRACTION_PATTERNS = (
    re.compile(r"(\d+)\s*(\d+)/(\d+)\s*inches"),
    re.compile(r"(\d+)\s*(\d+)/(\d+)\s*\""),
)


def _format_range(response):
    """Return ``"[low, high] unit"`` for the first numeric range in ``response``, else ``None``."""
    match = _RANGE_PATTERN.search(response)
    if match is None:
        return None
    low, high, unit = match.groups()
    return f"[{low}, {high}] {_RANGE_UNIT_NAMES.get(unit, unit)}"


def _mixed_fraction_to_decimal(match):
    """Regex callback turning a matched mixed fraction into ``"<value> inches"`` (2 decimals)."""
    whole, numerator, denominator = (int(group) for group in match.groups())
    if denominator == 0:
        # Not a real fraction; leave the text untouched instead of crashing.
        return match.group(0)
    return f"{whole + numerator / denominator:.2f} inches"


def dryrun(csvpath,i,j,imagesfolder,hop):
    """Run the model over rows ``i`` to ``j - 1`` of a CSV and store the predictions.

    The CSV is read from ``csvpath``, a ``prediction`` column is added when it
    is missing, and every row in the range that has no prediction yet is
    processed: its image is loaded from ``imagesfolder``, the model is asked
    for the row's entity, and the answer is reduced either to a
    ``"[low, high] unit"`` range or, via ``getval``, to ``"<number> <unit>"``.
    The CSV is rewritten in place every ``hop`` processed rows and once more at
    the end, so an interrupted run can be resumed by calling this again.

    Relies on the module-level ``model``, ``tokenizer``, ``load_image``,
    ``getval``, ``loaded_data`` and ``torch``.

    Args:
        csvpath: CSV file to read and overwrite. The image URL is taken from
            the second column and the entity name from the fourth.
        i: First row position to process.
        j: End row position (exclusive); clamped to the number of rows.
        imagesfolder: Directory holding the images, named after the last path
            segment of their URL.
        hop: Save the CSV after every ``hop`` processed rows; ``0`` disables
            the intermediate saves.
    """
    df = pd.read_csv(csvpath)
    if 'prediction' not in df.columns:
        df['prediction'] = None
    # An all-empty column is read back as float64; predictions are strings.
    df['prediction'] = df['prediction'].astype(object)
    pred_col = df.columns.get_loc('prediction')

    processed = 0
    # Clamp the end so an oversized `j` cannot raise before the final save.
    for k in range(i, min(j, len(df))):
        row = df.iloc[k]

        # Skip rows that already have a prediction. isna covers both the None
        # of a freshly added column and the NaN of a column read from disk.
        if not pd.isna(row.iloc[pred_col]):
            continue

        # Same naming rule as the downloader: last path segment of the URL.
        impath = os.path.join(imagesfolder, str(row.iloc[1]).split("/")[-1])
        if not os.path.exists(impath):
            print(impath)
            continue

        entity = row.iloc[3]
        use = _PROMPT_LABELS.get(entity, entity)

        pixel_values = load_image(impath, max_num=8).to(torch.float16).cuda()
        generation_config = dict(max_new_tokens=4096, do_sample=False)
        question = f'<image>\ntell in 1 line what is the {use} of product shown in the correct units given'
        response = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=False)

        # A "low-high unit" range takes precedence over a single value.
        prediction = _format_range(response)
        if prediction is None:
            for pattern in _MIXED_FRACTION_PATTERNS:
                response = pattern.sub(_mixed_fraction_to_decimal, response)
            prediction = getval(entity, response, loaded_data)

        df.iloc[k, pred_col] = prediction
        processed += 1
        if hop and processed % hop == 0:
            print(k)
            df.to_csv(csvpath, index=False)
    df.to_csv(csvpath, index=False)
# Process rows 0..131187 of the test CSV, saving the file after every 20 processed rows.
dryrun('/content/drive/MyDrive/Amazon_Test/test.csv',0,131188,'/content/Images/Images',20)

The code above stores the results in test.csv itself. This pipeline initially achieved a score of 0.53, so we then applied several augmentations.

Augmentations:


In [None]:
import pandas as pd
import re

# Function to extract numeric value from prediction
def extract_numeric_value(prediction):
    """Return the first unsigned number found in ``prediction`` as a float.

    Args:
        prediction: Prediction text such as ``"12.5 centimetre"``. Anything
            that is not a string (for example the NaN pandas uses for an empty
            cell, or ``None``) is treated as having no number.

    Returns:
        The first run of digits, with an optional decimal part, converted to
        ``float``; ``None`` when there is none.
    """
    if not isinstance(prediction, str):
        return None
    match = re.search(r'\d+\.?\d*', prediction)
    return float(match.group()) if match else None

# Load the CSV file
def _clear_duplicate_predictions(frame, entity):
    """Blank one of two equal predictions on adjacent rows of the same group.

    Walks over each pair of consecutive rows. When both rows share a
    ``group_id`` and carry the same ``prediction``, the prediction of the row
    whose ``entity_name`` equals ``entity`` is set to ``None`` (the upper row
    is checked first; at most one row per pair is changed).

    Returns the number of predictions that were blanked.
    """
    cleared = 0
    for upper in range(len(frame) - 1):
        lower = upper + 1
        same_group = frame.loc[upper, 'group_id'] == frame.loc[lower, 'group_id']
        if not same_group:
            continue
        same_prediction = frame.loc[upper, 'prediction'] == frame.loc[lower, 'prediction']
        if not same_prediction:
            continue
        for row in (upper, lower):
            if entity == frame.loc[row, 'entity_name']:
                frame.loc[row, 'prediction'] = None
                cleared += 1
                break
    return cleared


# Load the CSV file
df = pd.read_csv('test0.csv')

# One full pass for 'depth', then a second full pass for 'height' that already
# sees the predictions blanked by the first one.
count = 0
for entity_to_clear in ('depth', 'height'):
    count += _clear_duplicate_predictions(df, entity_to_clear)

# Save the modified CSV
print(count)
df.to_csv('test0_without_depth_and_height.csv', index=False)

print("Depth predictions removed where applicable.")


some other augmentations like promptings