In [27]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
csv_file = 'train.csv'
data = pd.read_csv(csv_file)
image_dir = 'train_images'
os.makedirs(image_dir, exist_ok=True)
def download_image(url, group_id):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            image_name = os.path.join(image_dir, f'{group_id}_{os.path.basename(url)}')
            image = Image.open(BytesIO(response.content))
            image.save(image_name)
            print(f"Downloaded: {image_name}")
        else:
            print(f"Failed to download image from: {url}")
    except Exception as e:
        print(f"Error downloading image {url}: {e}")
for idx, row in data.iterrows():
    download_image(row['image_link'], row['group_id'])

Downloaded: train_images/748919_61I9XdN6OFL.jpg
Downloaded: train_images/916768_71gSRbyXmoL.jpg
Downloaded: train_images/459516_61BZ4zrjZXL.jpg
Downloaded: train_images/459516_612mrlqiI4L.jpg
Downloaded: train_images/731432_617Tl40LOXL.jpg
Downloaded: train_images/731432_61QsBSE7jgL.jpg
Downloaded: train_images/731432_81xsq6vf2qL.jpg
Downloaded: train_images/731432_71DiLRHeZdL.jpg
Downloaded: train_images/731432_91Cma3RzseL.jpg
Downloaded: train_images/731432_71jBLhmTNlL.jpg
Downloaded: train_images/149159_81N73b5khVL.jpg
Downloaded: train_images/308856_61oMj2iXOuL.jpg
Downloaded: train_images/281678_91LPf6OjV9L.jpg
Downloaded: train_images/281678_81fOxWWWKYL.jpg
Downloaded: train_images/281678_81dzao1Ob4L.jpg
Downloaded: train_images/281678_91-iahVGEDL.jpg
Downloaded: train_images/731432_81S2+GnYpTL.jpg
Downloaded: train_images/731432_81e2YtCOKvL.jpg
Downloaded: train_images/731432_81RNsNEM1EL.jpg


In [28]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Input, Concatenate
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import cv2
import os
def loadAndPreprocess(img_path, target_size=(224, 224)):
    img = cv2.imread(img_path)
    img = cv2.resize(img, target_size)
    img = img / 255.0
    return img


In [29]:
print(data.columns)

image_paths = [os.path.join(image_dir, f'{row["group_id"]}_{os.path.basename(row["image_link"])}') for idx, row in data.iterrows()]
images = np.array([loadAndPreprocess(img_path) for img_path in image_paths])

label_encoder = LabelEncoder()
data['entity_name_encoded'] = label_encoder.fit_transform(data['entity_name'])

import re
def extract_numeric_value(value):
    match = re.search(r"[\d\.]+", value)
    if match:
        return float(match.group(0))
    return None

data['entity_value'] = data['entity_value'].apply(extract_numeric_value)


scaler = StandardScaler()
numeric_features = ['entity_value', 'entity_name_encoded', 'group_id']
scaled_numeric_data = scaler.fit_transform(data[numeric_features])



Index(['image_link', 'group_id', 'entity_name', 'entity_value'], dtype='object')


**Scratch model**

In [32]:
image_input = Input(shape=(224, 224, 3), name='image_input')
x = Conv2D(32, (3, 3), activation='relu')(image_input)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)

structured_input = Input(shape=(len(numeric_features),), name='structured_input')
y = Dense(64, activation='relu')(structured_input)
y = Dense(32, activation='relu')(y)

combined = Concatenate()([x, y])

z = Dense(64, activation='relu')(combined)
z = Dense(32, activation='relu')(z)
output = Dense(1, activation='linear')(z) 

model = Model(inputs=[image_input, structured_input], outputs=output)

model.compile(optimizer='adam', loss='mse', metrics=['mse'])

X_images = np.array(images)
X_structured = scaled_numeric_data
y = data['entity_value'].values  
model.fit([X_images, X_structured], y, epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd7d78d2d90>

In [35]:
test_csv_file = 'test.csv'
test_data = pd.read_csv(test_csv_file)
test_data.columns = test_data.columns.str.strip()
print(test_data.columns)

test_dir = 'test_images'
os.makedirs(test_dir, exist_ok=True)

def download_image_test(url, group_id):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            image_name = os.path.join(test_dir, f'{group_id}_{os.path.basename(url)}')
            image = Image.open(BytesIO(response.content))
            image.save(image_name)
            print(f"Downloaded: {image_name}")
        else:
            print(f"Failed to download image from: {url}")
    except Exception as e:
        print(f"Error downloading image {url}: {e}")


for idx, row in test_data.iterrows():
    download_image_test(row['image_link'], row['group_id'])


Index(['index', 'image_link', 'group_id', 'entity_name'], dtype='object')
Downloaded: test_images/156839_110EibNyclL.jpg
Downloaded: test_images/792578_11TU2clswzL.jpg
Downloaded: test_images/792578_11TU2clswzL.jpg
Downloaded: test_images/792578_11TU2clswzL.jpg
Downloaded: test_images/792578_11gHj8dhhrL.jpg
Downloaded: test_images/792578_11gHj8dhhrL.jpg
Downloaded: test_images/792578_11gHj8dhhrL.jpg
Downloaded: test_images/156839_11lshEUmCrL.jpg
Downloaded: test_images/478357_21+i52HRW4L.jpg
Downloaded: test_images/478357_21-LmSmehZL.jpg
Downloaded: test_images/953313_213oP6n7jtL.jpg
Downloaded: test_images/276611_213wY3gUsmL.jpg
Downloaded: test_images/648011_214CLs1oznL.jpg
Downloaded: test_images/648011_214CLs1oznL.jpg
Downloaded: test_images/648011_214CLs1oznL.jpg
Downloaded: test_images/279307_216rjgJHAeL.jpg
Downloaded: test_images/569206_2174yonQBtL.jpg
Downloaded: test_images/348551_218BCzgKxuL.jpg
Downloaded: test_images/348551_218BCzgKxuL.jpg


In [36]:
test_path = [os.path.join(test_dir, f'{row["group_id"]}_{os.path.basename(row["image_link"])}') for idx, row in test_data.iterrows()]
test_images = np.array([loadAndPreprocess(img_path) for img_path in test_path])

test_data['entity_name_encoded'] = label_encoder.transform(test_data['entity_name'])

test_scaled_numeric_data = scaler.transform(test_data[numeric_features])

test_X_images = np.array(test_images)
test_X_structured = test_scaled_numeric_data

test_predictions = model.predict([test_X_images, test_X_structured])

print("Predicted values for test data:")
for i in range(10):
    print(f"Group ID: {test_data['group_id'].iloc[i]}, Predicted Value: {test_predictions[i][0]}")

ValueError: y contains previously unseen labels: 'height'

**Tesseract and SpaCy**

In [37]:
import pytesseract
from PIL import Image

def extract_text_from_image(image_path):
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        print(f"Error extracting text from {image_path}: {e}")
        return ""

In [39]:
import spacy

nlp = spacy.load("en_core_web_sm")

def perform_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities
