In [1]:
import pandas as pd

# Training annotations table; later cells read its 'label' column and row count.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

In [2]:
import re
# Raw annotation strings grouped by the material class they collapse into.
# The raw strings (including their typos and question marks) are lookup keys
# and must match the annotation text exactly.
_material_groups = {
    'plastic': [
        'plastic',
        'pvc',
        'silicon',
        'plastic and metal',
        'plastic metals',  # Assuming primary material is plastic
        'faux leather',
        'leather / some plastic?',
    ],
    'glass': [
        'glass',
        'glass and metal?',  # Assuming primary material is glass
    ],
    'ceramic': [
        'ceramic',
        'clay',
        'bricks',
        'bricks (maybe it goes into ceramic)',
    ],
    'metal': [
        'metal',
        'aliminium',
        'alluminium',
        'steel',
        'tin',
        'gold',
        'silver',
        'can',
        'metal and plastic?',  # Assuming primary material is metal
        'metals / plastic?',
    ],
    'wood': [
        'wood',
        'wood and metals?',  # Assuming primary material is wood
    ],
    'paper': [
        'paper',
        'cardboard',
        'sticky paper?',
        'cellulose',
    ],
    'fabric': [
        'fabric',
        'cotton',
        'silk',
        'wool',
        'synthetic fabric',
        'polyester',
        'nylon',
        'leather',
        'cowhide?',
        'rubber fabric composite',
    ],
    'food': [
        'food',
        'food / for shaker its glass and metal',
        'food / for shaker its glass and metal?',
    ],
    'unknown': [
        'unknown',
        'not sure',
        'na',
    ],
    'other': [
        'plant',
        'plant natural',
        'plant organic',
        'plant?',
        'real plant. natural',
        'foliage',
        'straw',
        'composite',
        'rubber',
        'latex',
        'foam',
        'liquid',
        'water',
        'componets eletronic',
        'wax',
        'pearl',
        'pile',
        'tape',
        'organic material',
        'stone',
    ],
}

# Material class mapping dictionary: raw annotation string -> material class.
material_mapping = {
    raw_label: material_class
    for material_class, raw_labels in _material_groups.items()
    for raw_label in raw_labels
}

# Matches one or more consecutive whitespace characters.
_WHITESPACE_RUN = re.compile(r'\s+')


def remove_extra_whitespace(text):
    """
    Collapse whitespace runs to a single space and trim both ends.

    Parameters:
    -----------
    text : str
        Input string with potential extra whitespace

    Returns:
    --------
    str
        The input with every run of whitespace replaced by one space and
        with leading/trailing whitespace removed
    """
    collapsed = _WHITESPACE_RUN.sub(' ', text)
    return collapsed.strip()

# Normalise whitespace in every raw annotation before looking it up.
cleaned_labels = [remove_extra_whitespace(y) for y in df['label'].to_list()]

# Report every unmapped annotation at once instead of failing with a bare
# KeyError on the first one, so material_mapping can be extended in one pass.
unmapped_labels = sorted(set(cleaned_labels).difference(material_mapping))
if unmapped_labels:
    raise KeyError(f"Labels missing from material_mapping: {unmapped_labels}")

# One material class name per row of df, in row order.
labels = [material_mapping[name] for name in cleaned_labels]

In [None]:
from pathlib import Path

from PIL import Image

# Target (width, height) in pixels for every resized image.
new_size = (64, 64)  # Replace with desired dimensions

# Image.save() does not create missing directories, so make sure the output
# folder exists before the loop starts.
output_dir = Path('resized_img_64')
output_dir.mkdir(parents=True, exist_ok=True)

# One source image per row of df, read from img/<row position>.jpg.
for i in range(df.shape[0]):
    # Open inside a context manager so the source file handle is released
    # as soon as the resized copy has been produced.
    with Image.open(f'img/{i}.jpg') as image:
        resized_image = image.resize(new_size)

    # Save the resized image
    resized_image.save(output_dir / f'{i}.jpg')


In [3]:
import json

# Prompt text stored in the "query" field of every record.
query = '<image>\nWhat material is this object made of? Respond unknown if you are not sure. Answer only with the name of the material.\nShort answer:'
# Alternative prompt templates kept for reference (currently unused):
# query = f"You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: <image_placeholder>\nWhat material is this object made of? Respond unknown if you are not sure. Answer only with the name of the material.\nShort answer:\n\nAssistant:"
# query = f"['You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\nUser: <image_placeholder>What material is this object made of? Respond unknown if you are not sure. Answer only with the name of the material.\nShort answer:\n\nAssistant:']"

# Destination of the JSON Lines training file.
output_file = 'train_internvl.jsonl'


def _make_record(index, label):
    """Build one training record: prompt, image path list and target label."""
    # NOTE(review): paths point at 'resized_img/', whereas the resize cell in
    # this notebook saves to 'resized_img_64/' — confirm which is intended.
    return {"query": query, "images": [f'resized_img/{index}.jpg'], "response": label}


# One record per entry of `labels`; the position in the list is the image id.
data = [_make_record(i, label) for i, label in enumerate(labels)]

# Each record is serialised as one JSON object per line.
with open(output_file, 'w') as file:
    file.writelines(json.dumps(item) + '\n' for item in data)

print(f"Data written to {output_file}")


Data written to train_internvl.jsonl


# HF Dataset

In [None]:
from datasets import load_dataset

ds = load_dataset("Erland/AI701_project")
test_images = ds['test']['resized_image_64']

# Integer class id -> material class name.
id2label = {
    0: 'ceramic',
    1: 'fabric',
    2: 'food',
    3: 'glass',
    4: 'metal',
    5: 'other',
    6: 'paper',
    7: 'plastic',
    8: 'unknown',
    9: 'wood'
}


def _with_label_names(split):
    """Return a copy of ``split`` whose 'label' column holds class names.

    A ``datasets.Dataset`` does not support item assignment, so
    ``split['label'] = ...`` raises TypeError. Instead the integer column is
    dropped and a new column of names is added; note that this moves 'label'
    to the end of the column list.
    """
    label_names = [id2label[y] for y in split['label']]
    return split.remove_columns('label').add_column('label', label_names)


# Replace each split inside the DatasetDict with its relabelled copy.
ds['train'] = _with_label_names(ds['train'])
ds['test'] = _with_label_names(ds['test'])

In [None]:
# Preview the first few test labels instead of dumping the whole column.
ds['test']['label'][:10]