<a href="https://colab.research.google.com/github/1002Preeti/Copilot-VS-Code-Extension/blob/main/Amazon_ml_challange.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and helper modules
# read by later cells are addressed through this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pandas pillow requests numpy



In [None]:
import sys

In [None]:
# Only directories are added: sys.path entries that point at individual files
# (e.g. utils.py or test.csv) are ignored by the import system.
for module_dir in (
    # NOTE(review): only useful if `src/constants` is a directory — confirm, else drop.
    '/content/drive/MyDrive/student_resource 3/src/constants',
    '/content/drive/MyDrive/student_resource 3/src',
):
    # Guard against duplicate entries when this cell is re-run.
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import numpy as np

In [None]:
# Both CSVs live in the same Drive folder, so build each path from one prefix.
dataset_dir = "/content/drive/MyDrive/student_resource 3/dataset"
train_df, test_df = (
    pd.read_csv(f"{dataset_dir}/{csv_name}")
    for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Work on a reproducible random subset to keep image downloads and training fast.
# The size is capped at the frame length because DataFrame.sample raises when
# asked for more rows than exist (without replacement).
N_TRAIN_SAMPLES = 1000
RANDOM_STATE = 42
train_sample = train_df.sample(
    n=min(N_TRAIN_SAMPLES, len(train_df)),
    random_state=RANDOM_STATE,
)

In [None]:
# Preview the first rows of the test set (index, image_link, group_id, entity_name).
test_df.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [None]:
#train_sample = train_df.sample(n=1000, random_state=42)


In [None]:
def preprocess_image_from_url(image_url, target_size=(224, 224), timeout=10):
    """Download an image and return it as a normalized RGB float32 array.

    Args:
        image_url: URL of the image to fetch.
        target_size: ``(width, height)`` passed to PIL's ``Image.resize``.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            single stalled download cannot hang a whole ``DataFrame.apply``.

    Returns:
        A float32 array of shape ``(height, width, 3)`` with values in [0, 1].
        On any failure (bad URL, HTTP error, undecodable image) the error is
        printed and an all-zero array of the *same* shape is returned, so every
        result can be stacked into one batch.
    """
    width, height = target_size
    try:
        # Step 1: Download the image
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        # Step 2: Open the image; the context manager releases the decoder.
        with Image.open(BytesIO(response.content)) as img:
            # Step 3: Convert to RGB first so palette/alpha images are
            # resampled in colour space, then resize.
            rgb_image = img.convert('RGB').resize(target_size)

        # Step 4: Convert to NumPy array and normalize to [0, 1]
        return np.asarray(rgb_image, dtype='float32') / 255.0
    except Exception as e:
        # Deliberate best-effort: a failed image becomes a black placeholder
        # with the same shape as a successful result.
        print(f"Error processing image from {image_url}: {e}")
        return np.zeros((height, width, 3), dtype='float32')


In [None]:
# Download and preprocess every sampled training image; this issues one HTTP
# request per row via preprocess_image_from_url.
train_sample['preprocessed_image'] = train_sample['image_link'].apply(preprocess_image_from_url)

In [None]:
# Same preprocessing for the full test set; this issues one HTTP request per row,
# so the runtime grows with the size of test.csv.
test_df['preprocessed_image'] = test_df['image_link'].apply(preprocess_image_from_url)

In [None]:
# Inspect the sample now that the preprocessed_image column has been added.
train_sample.head()

In [None]:
import re
from constants import entity_unit_map, allowed_units

In [None]:
def process_entity_value_with_validation(entity_name, value):
    """Split an entity value such as ``"10.5 centimetre"`` into number and unit.

    Args:
        entity_name: Key into ``entity_unit_map`` naming the measured entity.
        value: Raw label text; non-string values (e.g. a float NaN for a
            missing label) are treated as unparseable.

    Returns:
        ``(numeric_value, unit)`` where ``numeric_value`` is a float and
        ``unit`` is one of ``entity_unit_map[entity_name]``; ``(nan, nan)``
        when the text cannot be parsed or the unit is not allowed for the
        entity. Invalid units are reported with a printed message.
    """
    if not isinstance(value, str):
        return np.nan, np.nan

    # A number, optional whitespace, then the unit text. The unit may span
    # several words ("cubic foot"), so letters and spaces are both captured.
    match = re.match(
        r"\s*([0-9]+(?:\.[0-9]*)?|\.[0-9]+)\s*([A-Za-z][A-Za-z ]*)", value
    )
    if match is None:
        return np.nan, np.nan

    numeric_value = float(match.group(1))
    unit_words = match.group(2).split()
    valid_units = entity_unit_map.get(entity_name, ())

    # Try the longest candidate first, then drop trailing words, so
    # "2 cubic foot" resolves to "cubic foot" while "10 gram approx" still
    # resolves to "gram".
    for word_count in range(len(unit_words), 0, -1):
        unit = " ".join(unit_words[:word_count])
        if unit in valid_units:
            return numeric_value, unit

    print(
        f"Error processing entity_value: {value}, "
        f"Error: Invalid unit '{' '.join(unit_words)}' for entity '{entity_name}'"
    )
    return np.nan, np.nan

In [None]:
# Parse each row's entity_value into a (numeric_value, unit) pair. Wrapping the
# returned tuple in pd.Series makes apply() produce a two-column frame, which is
# assigned to the two new columns.
train_sample[['numeric_value', 'unit']] = train_sample.apply(
    lambda row: pd.Series(process_entity_value_with_validation(row['entity_name'], row['entity_value'])),
    axis=1
)

In [None]:
# Discard rows whose entity_value could not be parsed or validated
# (the parser marks those with NaN in both columns).
train_sample = train_sample.dropna(subset=['numeric_value', 'unit'])

In [None]:
# Check the parsed numeric_value / unit columns after dropping invalid rows.
train_sample.head()

In [None]:
# Note: DataFrame.size is rows x columns (total cell count), not the row count.
train_sample.size

In [None]:
# Sanity-check the array shape of the first preprocessed image.
train_sample['preprocessed_image'].iloc[0].shape

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each unit string to an integer class id; the fitted encoder is kept so
# predicted class ids can be turned back into unit strings at inference time.
label_encoder = LabelEncoder()
train_sample['unit_encoded'] = label_encoder.fit_transform(train_sample['unit'])

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Standardize numeric_value (zero mean, unit variance). The fitted scaler is
# reused by predict_entity_value to map model outputs back to original units.
standard_scaler = StandardScaler()
train_sample['numeric_value_standardized'] = standard_scaler.fit_transform(train_sample[['numeric_value']])

In [None]:
# Inspect the encoded unit and standardized value columns.
train_sample.head(10)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Conv2D, MaxPooling2D, AveragePooling2D, Flatten, Concatenate
)
from tensorflow.keras.optimizers import Adam

# One softmax class per distinct unit in the training sample; this equals the
# number of classes produced by the LabelEncoder fitted on the same column.
units = train_sample['unit'].unique()
num_units = len(units)

# Input layer for preprocessed images
input_image = Input(shape=(224, 224, 3))  # Adjust shape based on your image dimensions

# Convolutional feature extractor: three conv + average-pooling stages.
# The Keras layer is named AveragePooling2D ("AvgPooling2D" does not exist).
x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_image)
x = AveragePooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = AveragePooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
x = AveragePooling2D((2, 2))(x)
x = Flatten()(x)

# Branch for numeric value prediction (single linear regression output)
numeric_branch = Dense(128, activation='relu')(x)
numeric_output = Dense(1, name='numeric_value')(numeric_branch)

# Branch for unit classification
unit_branch = Dense(128, activation='relu')(x)
unit_output = Dense(num_units, activation='softmax', name='unit')(unit_branch)

# Combine the branches into a single two-output model
model = Model(inputs=input_image, outputs=[numeric_output, unit_output])

# Compile the model; loss/metric dict keys must match the output layer names.
# The sparse loss takes the integer class ids in `unit_encoded` directly.
model.compile(optimizer=Adam(),
              loss={'numeric_value': 'mean_squared_error', 'unit': 'sparse_categorical_crossentropy'},
              metrics={'unit': 'accuracy'})

In [None]:
# Stack the per-row image arrays into one (n_samples, height, width, 3) batch.
# Keeping only the last three axes drops a stray leading batch axis of size 1,
# which the download-failure placeholder images may carry.
X_train_images = np.stack([
    np.asarray(img, dtype='float32').reshape(np.shape(img)[-3:])
    for img in train_sample['preprocessed_image']
])

# Regress on the standardized target: predict_entity_value maps model outputs
# back with standard_scaler.inverse_transform, so training must use the same scale.
y_train_numeric = train_sample['numeric_value_standardized'].to_numpy()
y_train_unit = train_sample['unit_encoded'].to_numpy()

In [None]:
# Train both output heads jointly; the target dict keys match the output layer
# names ('numeric_value', 'unit'). validation_split=0.2 holds out 20% of the
# samples for validation. `history` keeps the per-epoch metrics.
history = model.fit(X_train_images,
                    {'numeric_value': y_train_numeric, 'unit': y_train_unit},
                    epochs=50,
                    batch_size=32,
                    validation_split=0.2)

In [None]:
def predict_entity_value(model, image_array):
    """Predict the entity value for one image, formatted as "<number> <unit>".

    Args:
        model: Two-output model whose ``predict`` returns
            ``(numeric predictions, unit class probabilities)``.
        image_array: One preprocessed image, either ``(height, width, 3)`` or
            already batched as ``(1, height, width, 3)``.

    Returns:
        A string such as ``"12.5 centimetre"``. The number is mapped back to
        the original scale with the module-level ``standard_scaler`` and the
        unit is decoded with the module-level ``label_encoder``.
    """
    # The model expects a batch; add the leading axis for a single image.
    batch = np.asarray(image_array, dtype='float32')
    if batch.ndim == 3:
        batch = batch[np.newaxis, ...]

    # Predict using the model
    predicted_numeric_value, predicted_unit = model.predict(batch)

    # Undo the standardization applied to the regression target
    predicted_numeric_value = standard_scaler.inverse_transform(
        np.asarray(predicted_numeric_value).reshape(-1, 1)
    )

    # Convert the most probable unit class back to its string label
    predicted_unit = label_encoder.inverse_transform(np.argmax(predicted_unit, axis=-1))

    # Construct entity_value string
    entity_value = f"{predicted_numeric_value[0][0]} {predicted_unit[0]}"
    return entity_value

In [None]:
def predict_and_format_entity_value(model, image_array):
    """Return the formatted "<number> <unit>" prediction for one image.

    predict_entity_value already returns the final formatted string, so the
    result is passed through unchanged rather than being unpacked and
    re-formatted.

    Args:
        model: Trained model forwarded to predict_entity_value.
        image_array: Preprocessed image forwarded to predict_entity_value.

    Returns:
        The formatted entity value string.
    """
    return predict_entity_value(model, image_array)

In [None]:
# The preprocessed_image column already holds NumPy arrays, so they are passed
# straight to the model; eval() only accepts source text and would raise
# TypeError on an ndarray (and must never be run on untrusted data).
entity_values = [
    predict_and_format_entity_value(model, np.asarray(image_array))
    for image_array in test_df['preprocessed_image']
]

# Add the predictions to the DataFrame
test_df['entity_value'] = entity_values