<a href="https://colab.research.google.com/github/Arindaym1/Entity-Information-ML-project/blob/main/Amazon_ML_Challenge_Submission_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import requests
import os
from tqdm import tqdm
import easyocr
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Concatenate, Lambda, GlobalAveragePooling2D, GlobalAveragePooling1D, Flatten, Embedding
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [None]:
# Load dataset
# Reads the training CSV from /content into `train_df`; later cells rebind and
# extend this frame (sampling, OCR text, encoded columns).
train_df = pd.read_csv('/content/train.csv')

In [None]:
# Step 1: Balance the dataset
def balance_sample(df, sample_size=1500):
    """Draw ``sample_size`` rows spread across ``group_id`` and ``entity_name``.

    For each of the two columns, up to ``sample_size // n_groups`` rows (at
    least one) are sampled from every group with a fixed seed. The two samples
    are concatenated, exact duplicate rows are dropped, and ``sample_size``
    rows are drawn from that pool.

    Args:
        df: DataFrame with ``group_id`` and ``entity_name`` columns.
        sample_size: Number of rows to return.

    Returns:
        A DataFrame with ``sample_size`` rows that keeps the index labels of
        ``df``. Rows repeat only when the de-duplicated pool holds fewer than
        ``sample_size`` rows. An empty ``df`` yields an empty frame.
    """
    if df.empty:
        return df.copy()

    def _sample_per_group(column):
        # At least one row per group: a plain floor division gives 0 when
        # there are more groups than sample_size, which would empty the pool.
        quota = max(1, sample_size // df[column].nunique(dropna=False))
        # Iterating the groupby keeps the grouping column inside each group
        # on every pandas version (groupby.apply's handling of it has changed).
        parts = [
            group.sample(n=min(len(group), quota), random_state=42)
            for _, group in df.groupby(column)
        ]
        return pd.concat(parts) if parts else df.iloc[0:0]

    pool = pd.concat(
        [_sample_per_group('group_id'), _sample_per_group('entity_name')]
    ).drop_duplicates()
    if pool.empty:
        return pool

    # Sample with replacement only when the pool cannot supply enough distinct
    # rows; otherwise the de-duplication above would be undone.
    return pool.sample(n=sample_size, random_state=42, replace=len(pool) < sample_size)

# Rebinds `train_df` to the sampled subset. Re-running this cell on its own
# samples from the already-sampled frame, so re-run the load cell first.
train_df = balance_sample(train_df)

In [None]:
# Step 2: Download images based on image URLs
# Directory that receives the downloaded training images; created if missing.
drive_image_dir = '/content/images'
os.makedirs(drive_image_dir, exist_ok=True)

def download_image(url, file_path, timeout=30):
    """Download ``url`` and write the response body to ``file_path``.

    Failures are reported with ``print`` and never raised, so a bulk download
    loop keeps going past individual bad links.

    Args:
        url: Address of the image to fetch.
        file_path: Destination path for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was written, False if the download or write failed.
    """
    try:
        # Without a timeout a single stalled connection blocks the whole loop.
        response = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx responses; otherwise the server's error page would be
        # saved under the image name and later treated as a valid image.
        response.raise_for_status()
        with open(file_path, 'wb') as handler:
            handler.write(response.content)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

# Fetch every training image that is not on disk yet. Files are named
# "<group_id>_<last URL path segment>".
for group_id, image_url in tqdm(zip(train_df['group_id'], train_df['image_link']), total=len(train_df)):
    target_path = os.path.join(drive_image_dir, f"{group_id}_{image_url.split('/')[-1]}")
    if os.path.exists(target_path):
        continue  # already downloaded on an earlier iteration or run
    download_image(image_url, target_path)

100%|██████████| 1500/1500 [00:19<00:00, 78.10it/s] 


In [None]:
# Step 3: Extract text from images using EasyOCR
# English-only reader, created once and reused by every OCR call below.
reader = easyocr.Reader(['en'])
# Global looked up by extract_text_from_image at call time to locate image files.
image_directory = '/content/images'

def extract_text_from_image(group_id, image_name):
    """Return the OCR text of ``<image_directory>/<group_id>_<image_name>``.

    The second element of every ``reader.readtext`` result is joined with
    single spaces. A missing file or any OCR error is printed and yields "".
    """
    image_path = os.path.join(image_directory, f"{group_id}_{image_name}")

    if os.path.exists(image_path):
        try:
            detections = reader.readtext(image_path)
            return ' '.join(detection[1] for detection in detections)
        except Exception as e:
            print(f"Error reading image {image_path}: {e}")
            return ""

    # The image was never downloaded (or the download failed).
    print(f"Image file not found: {image_path}")
    return ""

# File name of each image: the last path segment of its URL.
train_df['image_name'] = train_df['image_link'].apply(lambda x: x.split('/')[-1])

# OCR is the slowest step and `train_df` may hold repeated rows (it is sampled
# with replacement), so run OCR once per distinct (group_id, image_name) pair
# and look the text up for every row.
unique_images = train_df[['group_id', 'image_name']].drop_duplicates()
ocr_text_by_image = {
    (group_id, image_name): extract_text_from_image(group_id, image_name)
    for group_id, image_name in zip(unique_images['group_id'], unique_images['image_name'])
}
train_df['extracted_text'] = [
    ocr_text_by_image[key] for key in zip(train_df['group_id'], train_df['image_name'])
]




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [None]:
# Step 4: Parse and normalize numeric values
import re
# A leading decimal number followed by one or more alphabetic words (the unit).
_ENTITY_VALUE_RE = re.compile(r'\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-zA-Z]+(?:\s+[a-zA-Z]+)*)')


def parse_entity_value(value):
    """Split a value such as ``"2.5 fluid ounce"`` into number and unit.

    Args:
        value: Raw ``entity_value`` cell. Non-string input (e.g. NaN) is
            treated as unparseable.

    Returns:
        ``pd.Series([number, unit])`` where ``number`` is a float and ``unit``
        is the alphabetic text after it with inner whitespace collapsed to
        single spaces, or ``pd.Series([nan, nan])`` if ``value`` does not start
        with a well-formed number followed by a unit.
    """
    if not isinstance(value, str):
        return pd.Series([np.nan, np.nan])

    match = _ENTITY_VALUE_RE.match(value)
    if match is None:
        return pd.Series([np.nan, np.nan])

    # Multi-word units ("fluid ounce", "cubic foot") must be kept whole so they
    # can match the allowed-unit names used later in the notebook.
    unit = ' '.join(match.group(2).split())
    return pd.Series([float(match.group(1)), unit])

# Split every entity_value into a number and a unit, then discard rows where
# the split failed (both columns are NaN in that case).
parsed_entity_values = train_df['entity_value'].apply(parse_entity_value)
train_df[['numeric_value', 'unit']] = parsed_entity_values
train_df.dropna(subset=['numeric_value', 'unit'], inplace=True)

# Scale the numeric targets with a MinMaxScaler fitted on the training rows;
# the fitted scaler is kept so predictions can be mapped back later.
numeric_scaler = MinMaxScaler()
numeric_scaler.fit(train_df[['numeric_value']])
train_df['numeric_value_normalized'] = numeric_scaler.transform(train_df[['numeric_value']])


In [None]:
# Step 5: Encode categorical features
# Each column gets its own LabelEncoder, fitted on the training rows and kept
# as a global so the same mapping can be reused later.
group_id_encoder = LabelEncoder()
train_df['group_id_encoded'] = group_id_encoder.fit(train_df['group_id']).transform(train_df['group_id'])

entity_name_encoder = LabelEncoder()
train_df['entity_name_encoded'] = entity_name_encoder.fit(train_df['entity_name']).transform(train_df['entity_name'])


In [None]:
# Step 6: Constrain unit predictions based on entity_name
# Maps each entity_name to the set of unit names accepted for it.
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint",
                    "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Sorted list of every distinct unit across all entities; its length is later
# used as the number of unit classes for the model.
unit_labels = sorted(set(unit for units in entity_unit_map.values() for unit in units))
# Encoder over the full unit vocabulary. NOTE(review): LabelEncoder orders its
# classes by sorted value, so encoded ids should line up with positions in
# `unit_labels` - confirm before relying on that.
unit_encoder = LabelEncoder()
unit_encoder.fit(unit_labels)

def get_unit_options(entity_name):
    """Return the set of units allowed for ``entity_name`` (empty if unknown)."""
    try:
        return entity_unit_map[entity_name]
    except KeyError:
        return set()

def encode_unit(entity_name, unit):
    """Return the encoded id of ``unit``, or NaN if it is not allowed for ``entity_name``."""
    if unit not in get_unit_options(entity_name):
        return np.nan
    # transform() works on sequences, so wrap the unit and unwrap the result.
    return unit_encoder.transform([unit])[0]

# Encode each row's unit; encode_unit returns NaN when the unit is not in the
# allowed set for that row's entity_name.
train_df['unit_encoded'] = train_df.apply(lambda row: encode_unit(row['entity_name'], row['unit']), axis=1)
# Drop the rows that received NaN above.
train_df.dropna(subset=['unit_encoded'], inplace=True)

In [None]:
# Step 7: Preprocess images
def load_and_preprocess_image(image_path, target_size=(224, 224)):
    """Load an image as an RGB float array scaled to [0, 1].

    Args:
        image_path: Path of the image file to read.
        target_size: ``(width, height)`` passed to PIL's ``resize``.

    Returns:
        Array of shape ``(height, width, 3)``. If the image cannot be read or
        processed, the error is printed and an all-zero array of that same
        shape is returned so callers can still stack the results.
    """
    try:
        # The context manager closes the file handle as soon as the pixels
        # have been converted; convert() returns an independent image.
        with Image.open(image_path) as source_image:
            img = source_image.convert('RGB')
        img = img.resize(target_size)
        return np.array(img) / 255.0
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        # PIL sizes are (width, height) while arrays are (height, width, 3);
        # swap so the placeholder matches the success path for non-square sizes.
        return np.zeros((target_size[1], target_size[0], 3))

# One preprocessed array per remaining training row, stacked into a single array.
train_images = np.array([
    load_and_preprocess_image(os.path.join(image_directory, f"{group_id}_{image_name}"))
    for group_id, image_name in zip(train_df['group_id'], train_df['image_name'])
])


In [None]:
# Step 8: Preprocess text
def preprocess_text(text_data, max_vocab_size=10000, max_seq_length=100):
    """Fit a TextVectorization layer on ``text_data`` and vectorize it.

    Args:
        text_data: Texts used both to adapt the layer and to be vectorized.
        max_vocab_size: Passed to the layer as ``max_tokens``.
        max_seq_length: Passed to the layer as ``output_sequence_length``.

    Returns:
        ``(vectorizer, vectorized_texts)``: the adapted layer and its output
        for ``text_data``.
    """
    vectorizer = tf.keras.layers.TextVectorization(
        output_sequence_length=max_seq_length,
        max_tokens=max_vocab_size,
    )
    vectorizer.adapt(text_data)
    return vectorizer, vectorizer(text_data)

# OCR text of the remaining training rows.
train_texts = train_df['extracted_text'].values
# Keep the adapted vectorizer: it holds the vocabulary built from the training text.
text_vectorizer, train_texts_preprocessed = preprocess_text(train_texts)


In [None]:
# Step 9: Define the multitask model
def build_multitask_model(image_input_shape, text_input_shape, num_units, vocab_size=10000):
    """Build and compile the two-input (image + text) multitask model.

    NOTE: a later cell redefines ``build_multitask_model`` with an additional
    entity-name input; that later definition is the one used for training.

    Args:
        image_input_shape: Shape of one image, used for the image ``Input``
            and for ``ResNet50``.
        text_input_shape: Shape of one vectorized text sequence.
        num_units: Number of unit classes for the softmax head.
        vocab_size: ``input_dim`` of the text embedding; should match the
            ``max_tokens`` used when vectorizing the text.

    Returns:
        A compiled Keras ``Model`` with outputs ``numeric_output`` (one linear
        unit, MSE loss) and ``unit_output`` (softmax, sparse categorical
        cross-entropy loss).
    """
    image_input = Input(shape=image_input_shape)
    resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=image_input_shape)
    x = resnet_model(image_input)
    x = GlobalAveragePooling2D()(x)
    # The pooled tensor is already (batch, channels); Flatten is kept only to
    # leave the existing layer graph unchanged.
    x = Flatten()(x)

    text_input = Input(shape=text_input_shape)
    # `input_length` is deprecated in current Keras and not needed here: the
    # sequence length comes from `text_input`.
    text_features = Embedding(input_dim=vocab_size, output_dim=64)(text_input)
    text_features = GlobalAveragePooling1D()(text_features)

    combined_features = Concatenate()([x, text_features])

    numeric_output = Dense(1, name='numeric_output')(combined_features)
    unit_output = Dense(num_units, activation='softmax', name='unit_output')(combined_features)

    model = Model(inputs=[image_input, text_input], outputs=[numeric_output, unit_output])
    model.compile(optimizer='adam',
                  loss={'numeric_output': 'mean_squared_error', 'unit_output': 'sparse_categorical_crossentropy'},
                  metrics={'numeric_output': 'mae', 'unit_output': 'accuracy'})
    return model

# Per-sample shapes taken from the prepared training arrays.
image_input_shape = train_images.shape[1:]
text_input_shape = (train_texts_preprocessed.shape[1],)
# One softmax class per known unit name.
num_units = len(unit_labels)

# NOTE(review): this two-input model is replaced a few cells below, where
# `model` is rebuilt with an extra entity input before training.
model = build_multitask_model(image_input_shape, text_input_shape, num_units)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step




In [None]:
# Integer-encoded entity names, passed to the model as its third input.
train_entity_encoded = train_df['entity_name_encoded'].values

In [None]:
# Step 9: Define the multitask model
def build_multitask_model(image_input_shape, text_input_shape, entity_input_shape, num_units,
                          vocab_size=10000, num_entities=100):
    """Build and compile the three-input (image + text + entity) multitask model.

    Args:
        image_input_shape: Shape of one image, used for the image ``Input``
            and for ``ResNet50``.
        text_input_shape: Shape of one vectorized text sequence.
        entity_input_shape: Shape of one encoded entity sample, e.g. ``(1,)``
            for a single integer id.
        num_units: Number of unit classes for the softmax head.
        vocab_size: ``input_dim`` of the text embedding; should match the
            ``max_tokens`` used when vectorizing the text.
        num_entities: ``input_dim`` of the entity embedding; must be larger
            than the highest encoded entity id.

    Returns:
        A compiled Keras ``Model`` taking ``[image, text, entity]`` and
        producing ``numeric_output`` (one linear unit, MSE loss) and
        ``unit_output`` (softmax, sparse categorical cross-entropy loss).
    """
    image_input = Input(shape=image_input_shape)
    resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=image_input_shape)
    x = resnet_model(image_input)
    x = GlobalAveragePooling2D()(x)
    # The pooled tensor is already (batch, channels); Flatten is kept only to
    # leave the existing layer graph unchanged.
    x = Flatten()(x)

    text_input = Input(shape=text_input_shape)
    # `input_length` is deprecated in current Keras and not needed here: the
    # sequence length comes from `text_input`.
    text_features = Embedding(input_dim=vocab_size, output_dim=64)(text_input)
    text_features = GlobalAveragePooling1D()(text_features)

    # Honour the caller-supplied shape instead of a hard-coded (1,).
    entity_input = Input(shape=entity_input_shape)
    entity_features = Embedding(input_dim=num_entities, output_dim=8)(entity_input)
    entity_features = Flatten()(entity_features)

    combined_features = Concatenate()([x, text_features, entity_features])

    numeric_output = Dense(1, name='numeric_output')(combined_features)
    unit_output = Dense(num_units, activation='softmax', name='unit_output')(combined_features)

    model = Model(inputs=[image_input, text_input, entity_input], outputs=[numeric_output, unit_output])
    model.compile(optimizer='adam',
                  loss={'numeric_output': 'mean_squared_error', 'unit_output': 'sparse_categorical_crossentropy'},
                  metrics={'numeric_output': 'mae', 'unit_output': 'accuracy'})
    return model

# Per-sample shapes taken from the prepared training arrays.
image_input_shape = train_images.shape[1:]
text_input_shape = (train_texts_preprocessed.shape[1],)
# One encoded entity id per sample.
entity_input_shape = (1,)
# One softmax class per known unit name.
num_units = len(unit_labels)

# Rebinds `model` to the three-input version used for training and prediction.
model = build_multitask_model(image_input_shape, text_input_shape, entity_input_shape, num_units)



In [None]:
# Step 10: Set up checkpoint and training
# File the model is written to; every save overwrites the previous one.
checkpoint_path = '/content/final_model_1034.keras'
# save_freq=20 is an integer, i.e. a save every 20 batches rather than once per
# epoch (the recorded training log shows saves in the middle of each epoch).
# NOTE(review): with save_best_only=False, `monitor` presumably has no effect - confirm.
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path, save_best_only=False, monitor='loss', save_freq=20, verbose=1)

In [None]:
# Inputs in the order the model was built with: image, text, entity.
training_inputs = [train_images, train_texts_preprocessed, train_entity_encoded]
# Targets keyed by the names of the model's two output layers.
training_targets = {
    'numeric_output': train_df['numeric_value_normalized'].values,
    'unit_output': train_df['unit_encoded'].values,
}

history = model.fit(
    training_inputs,
    training_targets,
    epochs=10,
    batch_size=32,
    callbacks=[checkpoint_callback],
)


Epoch 1/10
[1m19/45[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m7s[0m 291ms/step - loss: 7.1599 - numeric_output_mae: 1.2121 - unit_output_accuracy: 0.2286
Epoch 1: saving model to /content/final_model_1034.keras
[1m39/45[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m4s[0m 749ms/step - loss: 5.8691 - numeric_output_mae: 0.9501 - unit_output_accuracy: 0.2527
Epoch 1: saving model to /content/final_model_1034.keras
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 2s/step - loss: 5.5489 - numeric_output_mae: 0.8833 - unit_output_accuracy: 0.2588
Epoch 2/10
[1m14/45[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m8s[0m 284ms/step - loss: 1.9492 - numeric_output_mae: 0.1008 - unit_output_accuracy: 0.3286
Epoch 2: saving model to /content/final_model_1034.keras
[1m34/45[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m16s[0m 1s/step - loss: 1.9275 - numeric_output_mae: 0.1002 - unit_output_accuracy: 0.3447
Epoch 2: saving model to /content/final_model_1034.keras
[1

In [None]:
import pandas as pd
test_df = pd.read_csv('/content/test.csv')
# Keep the first 1500 complete rows. The explicit copy makes `test` an
# independent frame, so the columns added to it in later cells do not trigger
# SettingWithCopyWarning or depend on `test_df`.
test = test_df.dropna().head(1500).copy()

In [None]:
# Directory that receives the downloaded test images; created if missing.
drive_image_dir = '/content/test_images_2'
os.makedirs(drive_image_dir, exist_ok=True)

# Files are named "<group_id>_<last URL path segment>" so names stay unique
# across groups; images already on disk are not fetched again.
for group_id, image_url in tqdm(zip(test['group_id'], test['image_link']), total=len(test)):
    image_path = os.path.join(drive_image_dir, f"{group_id}_{image_url.split('/')[-1]}")
    if os.path.exists(image_path):
        continue
    download_image(image_url, image_path)

print(f"Images saved in: {drive_image_dir}")

100%|██████████| 1500/1500 [00:28<00:00, 53.44it/s]

Images saved in: /content/test_images_2





In [None]:
# Directory containing the downloaded images
# Rebinds the global that extract_text_from_image reads, pointing it at the
# test images instead of the training images.
image_directory = '/content/test_images_2'

# Function to extract text from an image file
def extract_text_from_image(group_id, image_name):
    """Return the OCR text of ``<image_directory>/<group_id>_<image_name>``.

    Mirrors the error handling of the training-time definition: a missing file
    or an OCR failure is printed and yields "" instead of aborting the whole
    ``DataFrame.apply`` over the test set.
    """
    filename = f"{group_id}_{image_name}"
    image_path = os.path.join(image_directory, filename)
    # A failed download leaves no file behind; skip it instead of raising.
    if not os.path.exists(image_path):
        print(f"Image file not found: {image_path}")
        return ""

    try:
        result = reader.readtext(image_path)
        # Each result's second element is used as the recognised text.
        return ' '.join([res[1] for res in result])
    except Exception as e:
        print(f"Error reading image {image_path}: {e}")
        return ""

# File name of each test image: the last path segment of its URL.
test['image_name'] = test['image_link'].map(lambda link: link.rsplit('/', 1)[-1])

# Run OCR row by row, in frame order.
test['extracted_text'] = [
    extract_text_from_image(group_id, image_name)
    for group_id, image_name in zip(test['group_id'], test['image_name'])
]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the test entity names with the encoder that was fitted on the
# training data. Fitting a fresh encoder here would assign ids from the test
# set's own label order, which need not match the ids the model was trained on,
# and would overwrite the fitted training encoder. transform() raises
# ValueError if the test set contains an entity name never seen in training.
test['entity_name_encoded'] = entity_name_encoder.transform(test['entity_name'])

# group_id is not one of the model's inputs (image, text, entity), so this
# column is informational only. It uses its own encoder so the training
# `group_id_encoder` stays intact.
test_group_id_encoder = LabelEncoder()
test['group_id_encoded'] = test_group_id_encoder.fit_transform(test['group_id'])

In [None]:
# Must be the directory the test images were downloaded to. With the previous
# '/content/test_images' path no file is found, and load_and_preprocess_image
# then returns an all-zero array for every row.
image_directory = '/content/test_images_2'
# Load and preprocess all images
test_images = np.array([load_and_preprocess_image(os.path.join(image_directory, f"{row['group_id']}_{row['image_link'].split('/')[-1]}")) for _, row in test.iterrows()])

In [None]:
# OCR text of the test rows, as a NumPy array.
test_texts = test['extracted_text'].values

In [None]:
# Vectorize the test text with the vectorizer adapted on the training text.
# Calling preprocess_text here would adapt a new vocabulary on the test set, so
# token ids would no longer line up with the embedding the model was trained
# with, and the training vectorizer would be overwritten.
test_texts_preprocessed = text_vectorizer(test_texts)
text_input_shape = (test_texts_preprocessed.shape[1],)

In [None]:
# Display the per-sample text shape derived from the vectorized test texts.
text_input_shape

(100,)

In [None]:
# Display the vectorized test texts (integer token ids, zero-padded).
test_texts_preprocessed

<tf.Tensor: shape=(150, 100), dtype=int64, numpy=
array([[364, 356, 335, ...,   0,   0,   0],
       [  5,  14,  16, ...,   0,   0,   0],
       [  5,  14,  16, ...,   0,   0,   0],
       ...,
       [314, 164, 226, ...,   0,   0,   0],
       [115, 104, 117, ...,   0,   0,   0],
       [115, 104, 117, ...,   0,   0,   0]])>

In [None]:
# Integer-encoded entity names of the test rows, passed as the model's third input.
test_entity_encoded = test['entity_name_encoded'].values

In [None]:
# Force the shape to (n_rows, 100). NOTE(review): the displayed tensor already
# has 100 columns, so this presumably leaves it unchanged - confirm and remove.
test_texts_preprocessed = tf.reshape(test_texts_preprocessed, [test_texts_preprocessed.shape[0], 100])

In [None]:
# Inputs in the order the model was built with: image, text, entity.
test_inputs = [test_images, test_texts_preprocessed, test_entity_encoded]
predictions = model.predict(test_inputs)

# Extract numeric and unit predictions
# predictions[0] is the numeric head; predictions[1] holds the per-class
# probabilities of the unit head, reduced here to the most likely class id.
predicted_numeric_values = predictions[0]
predicted_units = predictions[1].argmax(axis=-1)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 904ms/step


In [None]:
# Summarise the two output arrays instead of dumping every row of both.
numeric_preview, unit_probability_preview = predictions
print(f"numeric_output: shape={numeric_preview.shape}, first values={numeric_preview[:5].ravel()}")
print(f"unit_output: shape={unit_probability_preview.shape}")

[array([[0.45435172],
       [0.45422116],
       [0.4559934 ],
       [0.45652092],
       [0.45617333],
       [0.45564583],
       [0.45387357],
       [0.44267216],
       [0.45315883],
       [0.4546415 ],
       [0.49043813],
       [0.47529367],
       [0.4681591 ],
       [0.4676316 ],
       [0.46585938],
       [0.46552303],
       [0.45234406],
       [0.49666563],
       [0.5125043 ],
       [0.44614315],
       [0.43596604],
       [0.45569202],
       [0.45934695],
       [0.44229558],
       [0.44459534],
       [0.466106  ],
       [0.46840575],
       [0.46242484],
       [0.43848684],
       [0.4792185 ],
       [0.47994784],
       [0.47527254],
       [0.47265702],
       [0.47495678],
       [0.47442928],
       [0.49886468],
       [0.45455104],
       [0.42755786],
       [0.49391904],
       [0.46545509],
       [0.46315533],
       [0.4554018 ],
       [0.46420962],
       [0.46598187],
       [0.44147083],
       [0.47945955],
       [0.47893205],
       [0.47

In [None]:
# Display the unit head's output: one row of softmax class probabilities per test sample.
predictions[1]

array([[1.8329001e-06, 7.4842197e-01, 2.6800840e-06, ..., 1.2977362e-01,
        3.1612720e-02, 7.5050998e-06],
       [2.6125679e-06, 6.5449202e-01, 3.4965890e-06, ..., 1.6361904e-01,
        4.7846925e-02, 1.0128152e-05],
       [2.8043683e-06, 6.2789905e-01, 3.7112327e-06, ..., 1.7445916e-01,
        5.0971381e-02, 1.0796457e-05],
       ...,
       [3.7333797e-07, 7.9334784e-01, 7.5725666e-07, ..., 1.6493054e-01,
        1.7959699e-02, 1.6902201e-06],
       [3.1118209e-06, 6.3517159e-01, 3.7815478e-06, ..., 1.7065001e-01,
        5.6326315e-02, 1.0484154e-05],
       [3.4252103e-06, 5.8761466e-01, 4.0760651e-06, ..., 1.9483495e-01,
        6.2240440e-02, 1.1357541e-05]], dtype=float32)

In [None]:
# Most likely unit class id for every test row.
max_values = predictions[1].argmax(axis=1)
print(max_values)

[ 1  1  1  1  1  1  1  1  1  1  9  1  1  1  1  1  1  9  1 28  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1 28  1 28  1  1  1  1  1  1  1  1  1  1
  1  1  1  9  1  1  1  1  1  1  1  1 28  9  1  1  1  1  9  1  1  1  1  1
  1  1  1 28  1  1  1  1  9  1  1  1  1  1  1 29 28  9  1  1  1  1  9  1
  1  1  1  1  1  1  1  1  1  1  1  1 28  1  1  1  1  1  9  1  9  1  1  1
 28  1  9  1  1  1 28  1  1  9  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1]


In [None]:
# Look up the unit names at the class ids that appear in the predictions above.
unit_labels[1], unit_labels[9], unit_labels[28], unit_labels[29]

('centimetre', 'gram', 'volt', 'watt')

In [None]:
# Normalised numeric predictions, shape (n_rows, 1).
predicted_numeric_values = predictions[0]
# Class id of the most likely unit per row. Storing the raw probability matrix
# here would break decoding with `unit_encoder`, which expects 1-D class ids.
predicted_units = np.argmax(predictions[1], axis=-1)

In [None]:
# Map the normalised numeric predictions back to the original value scale.
predicted_numeric_values_denorm = numeric_scaler.inverse_transform(predicted_numeric_values)

# Decode unit names from the most likely class id per row; inverse_transform
# needs 1-D integer labels, not the (n_rows, n_classes) probability matrix.
decoded_units = unit_encoder.inverse_transform(np.argmax(predictions[1], axis=-1))