# MultiCPU code

In [None]:
import numpy as np
from PIL import Image
import os
from concurrent.futures import ThreadPoolExecutor

def preprocess_image(image_path, output_path=None):
    # Open the image
    img = Image.open(image_path)
    
    # Convert to grayscale
    gray_img = img.convert('L')
    
    # Convert to numpy array
    img_array = np.array(gray_img)
    
    # Normalize pixel values to range [0, 1]
    normalized_img = img_array / 255.0
    
    # Optional: Apply additional normalization (e.g., zero mean and unit variance)
    mean = np.mean(normalized_img)
    std = np.std(normalized_img)
    standardized_img = (normalized_img - mean) / std
    
    # Convert back to PIL Image
    preprocessed_img = Image.fromarray((standardized_img * 255).astype(np.uint8))
    
    # Save the preprocessed image if output_path is provided
    if output_path:
        preprocessed_img.save(output_path)
    
    return preprocessed_img

def process_single_image(args):
    input_image_path, output_image_path = args
    # Preprocess the image
    preprocess_image(input_image_path, output_image_path)
    print(f"Processed {os.path.basename(input_image_path)} and saved to {output_image_path}")

def preprocess_images_in_folder(input_folder, output_folder, max_workers=4):
    # Ensure output directory exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Create a list of image paths
    image_paths = [
        (os.path.join(input_folder, filename), os.path.join(output_folder, filename))
        for filename in os.listdir(input_folder)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))
    ]
    
    # Use ThreadPoolExecutor to parallelize the image preprocessing
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(process_single_image, image_paths)

# Example usage
if __name__ == "__main__":
    input_folder = r"C:\Users\Abhishek\Desktop\Amazon_ML\train_images\all_images"
    output_folder = r"C:\Users\Abhishek\Desktop\Amazon_ML\processed_train_images"
    
    # Preprocess all images in the folder using multiple CPU threads
    preprocess_images_in_folder(input_folder, output_folder, max_workers=8)


# GreyScale Code

In [None]:
import os
import cv2
import numpy as np
from multiprocessing import Pool
from tqdm import tqdm

input_folder = "C:/Users/Abhishek/Desktop/Amazon_ML/train_images/all_images"
output_folder = "C:/Users/Abhishek/Desktop/Amazon_ML/grey_train_images"

# Create output directory if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Function to preprocess an image
def preprocess_image(filename):
    img_path = os.path.join(input_folder, filename)
    
    if os.path.isfile(img_path):
        try:
            # Load the image
            image = cv2.imread(img_path)
            
            if image is None:
                print(f"Warning: Image {filename} could not be loaded.")
                return

            # Convert to grayscale
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            
            # Normalize the image
            normalized_image = gray_image / 255.0
            
            # Resize the image
            resized_image = cv2.resize(normalized_image, (256, 256))
            
            # Save the preprocessed image
            output_path = os.path.join(output_folder, filename)
            cv2.imwrite(output_path, (resized_image * 255).astype(np.uint8))
            
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Process images with multiple threads
def process_images_in_parallel(image_filenames):
    with Pool(processes=8) as pool:
        for _ in tqdm(pool.imap_unordered(preprocess_image, image_filenames), total=len(image_filenames)):
            pass

def batch_process_images(batch_size=1000):
    valid_extensions = ['.jpg', '.jpeg', '.png']
    all_filenames = [f for f in os.listdir(input_folder) if os.path.splitext(f)[1].lower() in valid_extensions]
    
    for i in range(0, len(all_filenames), batch_size):
        batch_filenames = all_filenames[i:i + batch_size]
        process_images_in_parallel(batch_filenames)
        print(f"Processed batch {i // batch_size + 1}")

# Start the processing
if __name__ == '__main__':
    batch_process_images()
    print(f"Preprocessing complete. Processed images saved in {output_folder}")


  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

# Feature Extraction on RGB

In [None]:
import numpy as np
from PIL import Image
import os
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from concurrent.futures import ThreadPoolExecutor

# Load pre-trained ResNet50 model without the top classification layer
model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

def preprocess_image(image_path, output_path=None):
    # Open the image
    img = Image.open(image_path).convert('RGB')
    
    # Resize the image to 224x224 (ResNet input size)
    img = img.resize((224, 224))
    
    # Convert image to numpy array
    img_array = img_to_array(img)
    
    # Preprocess the image for ResNet50
    img_array = preprocess_input(img_array)
    
    return img_array

def extract_features(image_path):
    # Preprocess the image
    img_array = preprocess_image(image_path)
    
    # Expand dimensions to match model input
    img_array = np.expand_dims(img_array, axis=0)
    
    # Extract features using ResNet50
    features = model.predict(img_array)
    
    return features.flatten()

def process_single_image(args):
    input_image_path, output_feature_path = args
    # Extract features from the image
    features = extract_features(input_image_path)
    
    # Save features as a .npy file
    np.save(output_feature_path, features)
    print(f"Extracted features from {os.path.basename(input_image_path)} and saved to {output_feature_path}")

def extract_features_from_images(input_folder, output_folder, max_workers=4):
    # Ensure output directory exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Create a list of image paths
    image_paths = [
        (os.path.join(input_folder, filename), os.path.join(output_folder, filename.split('.')[0] + '.npy'))
        for filename in os.listdir(input_folder)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))
    ]
    
    # Use ThreadPoolExecutor to parallelize the feature extraction
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(process_single_image, image_paths)

# Example usage
if __name__ == "__main__":
    input_folder = r"C:\Users\Abhishek\Desktop\Amazon_ML\train_images\all_images"
    output_folder = r"C:\Users\Abhishek\Desktop\Amazon_ML\extracted_features"
    
    # Extract features from all images in the folder using multiple CPU threads
    extract_features_from_images(input_folder, output_folder, max_workers=12)


# training

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define your input shape and number of classes
input_shape = (256, 256, 1)  # Grayscale images of size 256x256
num_classes = 10  # Assuming you have 10 classes

# Create an instance of your model
model = create_resnet_model(input_shape=input_shape, num_classes=num_classes)

# Compile the model (define optimizer, loss function, and metrics)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Load your dataset (example with ImageDataGenerator)
# Ensure your images are grayscale and resized to (256, 256)

train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    'C:/Users/Abhishek/Desktop/Amazon_ML/output',  # Replace with the path to your dataset
    target_size=(256, 256),
    color_mode='grayscale',
    batch_size=32,
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_directory(
    'C:/Users/Abhishek/Desktop/Amazon_ML/output',
    target_size=(256, 256),
    color_mode='grayscale',
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

# Train the model
model.fit(train_generator, validation_data=validation_generator, epochs=10)

# Save the trained model
model.save('resnet_model.h5')


Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.


ValueError: Must provide at least one structure

In [7]:
import pandas as pd
df=pd.read_csv('dataset/train.csv')
df.head(3)

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram


In [8]:
x=df['entity_name']
x

0         item_weight
1         item_volume
2         item_weight
3         item_weight
4         item_weight
             ...     
263854         height
263855         height
263856         height
263857         height
263858         height
Name: entity_name, Length: 263859, dtype: object

In [10]:
from sklearn.preprocessing import LabelEncoder

In [14]:
label_encoder = LabelEncoder()
df['entity_name'] = label_encoder.fit_transform(x)

In [15]:
print(df['entity_name'])

0         3
1         2
2         3
3         3
4         3
         ..
263854    1
263855    1
263856    1
263857    1
263858    1
Name: entity_name, Length: 263859, dtype: int32


# Main

In [86]:
import numpy as np

# Initialize an empty list to hold the rows
all_arrays = []

# Loop through numbers 0 to 111
for a in range(0, 112):
    file_path = f'output/image_{a}.npy'

    # Load the .npy file
    data = np.load(file_path)
    
    # Append the 1D array to the list
    all_arrays.append(data)

# Convert the list of arrays into a 2D NumPy array
x = np.vstack(all_arrays)

# Print the resulting 2D array
print(x)


[[0.    0.    0.    ... 4.535 0.    0.   ]
 [0.    0.    0.    ... 5.21  0.    0.   ]
 [0.    0.    0.    ... 3.758 0.    0.   ]
 ...
 [0.    0.    0.    ... 4.387 0.    0.   ]
 [0.    0.    0.    ... 4.54  0.    0.   ]
 [0.    0.    0.    ... 4.32  0.    0.   ]]


In [82]:
length = len(data)
print(length)

100352


In [78]:
import pandas as pd
import numpy as np

# Example data to save
df=pd.read_csv('dataset/train.csv')  # A 5x3 array of random numbers

# Convert the NumPy array to a DataFrame
df = df.iloc[:112, :]
print(df)
# Save the DataFrame to a CSV file
# df.to_csv('output/data.csv', index=False)


                                            image_link  group_id  entity_name  \
0    https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1    https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2    https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3    https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4    https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   
..                                                 ...       ...          ...   
107  https://m.media-amazon.com/images/I/61xxqfM2Ew...    991868  item_weight   
108  https://m.media-amazon.com/images/I/81yG9eUKvx...    208023  item_weight   
109  https://m.media-amazon.com/images/I/61FMOl299l...    593600  item_weight   
110  https://m.media-amazon.com/images/I/41Kn+YOyPj...    459516  item_volume   
111  https://m.media-amazon.com/images/I/51q9OE6hfg...    459516  item_volume   

         entity_value  
0  

In [79]:
df.rename(columns={'image_link': 'image_data'}, inplace=True)
print(df)

                                            image_data  group_id  entity_name  \
0    https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1    https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2    https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3    https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4    https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   
..                                                 ...       ...          ...   
107  https://m.media-amazon.com/images/I/61xxqfM2Ew...    991868  item_weight   
108  https://m.media-amazon.com/images/I/81yG9eUKvx...    208023  item_weight   
109  https://m.media-amazon.com/images/I/61FMOl299l...    593600  item_weight   
110  https://m.media-amazon.com/images/I/41Kn+YOyPj...    459516  item_volume   
111  https://m.media-amazon.com/images/I/51q9OE6hfg...    459516  item_volume   

         entity_value  
0  

In [85]:
y=df[['group_id','entity_name','entity_value']]
print(y)

     group_id  entity_name      entity_value
0      748919  item_weight        500.0 gram
1      916768  item_volume           1.0 cup
2      459516  item_weight        0.709 gram
3      459516  item_weight        0.709 gram
4      731432  item_weight    1400 milligram
..        ...          ...               ...
107    991868  item_weight      9.0 kilogram
108    208023  item_weight      2.5 kilogram
109    593600  item_weight         50.0 gram
110    459516  item_volume   10.0 millilitre
111    459516  item_volume  200.0 millilitre

[112 rows x 3 columns]


In [92]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Example data
# data = np.array([['A'], ['B'], ['A'], ['C']])

# Create OneHotEncoder instance
encoder = OneHotEncoder()
df=pd.DataFrame(y)

print(df)

     group_id  entity_name      entity_value
0      748919  item_weight        500.0 gram
1      916768  item_volume           1.0 cup
2      459516  item_weight        0.709 gram
3      459516  item_weight        0.709 gram
4      731432  item_weight    1400 milligram
..        ...          ...               ...
107    991868  item_weight      9.0 kilogram
108    208023  item_weight      2.5 kilogram
109    593600  item_weight         50.0 gram
110    459516  item_volume   10.0 millilitre
111    459516  item_volume  200.0 millilitre

[112 rows x 3 columns]


In [96]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

vals = le.fit_transform(df['entity_name'])

print("Encoded labels:", vals)
print(len(vals))

Encoded labels: [1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 2 1 1 1 1 1 1 1 1 1 1 3
 2 1 1 1 1 3 3 3 3 1 1 3 0 1 1 1 1 1 1 1 1 1 1 1 1 3 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0
 0]
112


In [97]:
y=vals

In [103]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.05,random_state=0)

In [101]:
print(xtest)

[[0.    0.    0.    ... 4.375 0.    0.   ]
 [0.    0.    0.    ... 6.027 0.    0.   ]
 [0.    0.    0.    ... 5.07  0.    0.   ]
 ...
 [0.    0.    0.    ... 4.53  0.    0.   ]
 [0.    0.    0.    ... 4.996 0.    0.   ]
 [0.    0.    0.    ... 4.38  0.    0.   ]]


In [104]:
print(ytest)

[1 3 1 0 1 1]


In [106]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Build the fully connected model
model = Sequential([
    Dense(512, activation='relu', input_shape=(100352,)),  # Adjust input_shape to match feature vector length
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')  # Number of classes in the output layer
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(xtrain, ytrain, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(xtest, ytest)
print(f'Test loss: {test_loss:.4f}')
print(f'Test accuracy: {test_accuracy:.4f}')

# Make predictions
predictions = model.predict(xtest)
print(f'Predictions shape: {predictions.shape}')


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 528ms/step - accuracy: 0.1063 - loss: 16.8584 - val_accuracy: 0.1250 - val_loss: 79.9239
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 435ms/step - accuracy: 0.1141 - loss: 81.3689 - val_accuracy: 0.0000e+00 - val_loss: 36.5894
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 441ms/step - accuracy: 0.0907 - loss: 36.3230 - val_accuracy: 0.1250 - val_loss: 52.8579
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 442ms/step - accuracy: 0.0326 - loss: 53.2074 - val_accuracy: 0.2500 - val_loss: 24.2229
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 435ms/step - accuracy: 0.1506 - loss: 24.6958 - val_accuracy: 0.1250 - val_loss: 39.9213
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 430ms/step - accuracy: 0.2478 - loss: 25.0381 - val_accuracy: 0.0000e+00 - val_loss: 39.5184
Epoch 7/10
[1m3/3[0m [32m━━━