In [1]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import re
import random
import csv

In [2]:
# Load an ImageNet-pretrained ResNet-18 that turns each image into a numeric vector.
# NOTE: the final classification layer is left in place, so the network emits one
# score per ImageNet class (1000 values per image) rather than the activations that
# feed that layer. The downstream feature_0..feature_999 columns rely on this width.
if hasattr(models, "ResNet18_Weights"):
    # Newer torchvision: `pretrained=` is deprecated in favour of explicit weight enums.
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    # Older torchvision releases only understand the boolean flag.
    model = models.resnet18(pretrained=True)
model.eval()  # Set to evaluation mode (inference behaviour for batch-norm layers)

# Define preprocessing transform: resize, centre-crop to 224x224, convert to a
# tensor and normalise each channel with the mean/std values given below.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])




In [3]:
# Function to download image from URL
def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would hang the whole notebook.

    Returns:
        A ``PIL.Image.Image`` converted to RGB mode.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
        OSError: If the response body cannot be decoded as an image.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an HTML error page to PIL.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')
    return image

# Function to extract features from an image
def extract_features(image):
    """Run one image through the notebook-level ``model`` and return a flat vector.

    The image is passed through the module-level ``preprocess`` pipeline, given a
    leading batch dimension of size one, and evaluated with gradient tracking
    disabled. The network output is returned as a 1-D NumPy array.

    Args:
        image: Image accepted by ``preprocess`` (a PIL image in this notebook).

    Returns:
        1-D ``numpy.ndarray`` holding the flattened model output.
    """
    # The model expects a batch, so wrap the single image as a batch of one.
    batch = preprocess(image).unsqueeze(0)
    with torch.no_grad():
        output = model(batch)
    flat_output = output.numpy().flatten()
    return flat_output

# Function to extract numeric value from string
def extract_numeric(value):
    """Return the first decimal number found in ``value`` as a float.

    Recognised forms are ``12``, ``12.5`` and ``.5``. Signs and exponents are
    not interpreted.

    Args:
        value: Text to search. Non-string input (e.g. ``None`` or a NaN cell)
            is treated as containing no number.

    Returns:
        The parsed float, or ``None`` when no number is present.
    """
    if not isinstance(value, str):
        return None
    # A bare run of digits and dots (e.g. "..." or "1.2.3") is not a valid float,
    # so match one well-formed decimal instead of any digit/dot sequence.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", value)
    return float(match.group()) if match else None

# Load the training sample (columns used later: image_link, entity_name, entity_value).
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced by a configurable/relative location.
train_csv_path = r'D:\React18\newshop\Feature-Extraction-from-Images-DEVMATES\dataset\sample_train.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# Initialize LabelEncoders (one per label column)
entity_name_encoder = LabelEncoder()
entity_value_encoder = LabelEncoder()

# Fit encoders on the distinct values of each column
entity_name_encoder.fit(df['entity_name'].unique())
entity_value_encoder.fit(df['entity_value'].unique())

# Encode both label columns once, up front, instead of calling transform()
# on a single-element list for every row inside the loop.
encoded_entity_names = entity_name_encoder.transform(df['entity_name'])
encoded_entity_values = entity_value_encoder.transform(df['entity_value'])

# Initialize a list to store results, plus the labels of rows that had to be skipped
results = []
failed_rows = []

# Iterate over the dataset: download each image and extract its feature vector
for row_label, image_url, encoded_entity_name, encoded_entity_value in zip(
    df.index, df['image_link'], encoded_entity_names, encoded_entity_values
):
    try:
        # Download and process image
        image = download_image(image_url)
        features = extract_features(image)
    except (requests.RequestException, OSError) as exc:
        # One unreachable URL or undecodable image should not discard the work
        # already done for every other row; record it and move on.
        failed_rows.append(row_label)
        print(f"Skipping row {row_label} ({image_url}): {exc}")
        continue

    results.append({
        'features': features.tolist(),
        'encoded_entity_name': encoded_entity_name,
        'encoded_entity_value': encoded_entity_value
    })

if failed_rows:
    print(f"{len(failed_rows)} of {len(df)} images could not be processed and were skipped.")

# Convert results to DataFrame (one row per successfully processed image)
features_df = pd.DataFrame(results)

In [5]:
# Preview the first rows: a list-valued 'features' column plus the two encoded label columns.
features_df.head()

Unnamed: 0,features,encoded_entity_name,encoded_entity_value
0,"[-3.2995729446411133, -4.209080696105957, -3.1...",3,7
1,"[-2.2146027088165283, -2.2432150840759277, -0....",7,76
2,"[-6.495326042175293, -2.995445489883423, -1.69...",1,12
3,"[-3.9437735080718994, 1.7602643966674805, 0.82...",0,40
4,"[-1.8861312866210938, -2.0388591289520264, -2....",3,10


In [6]:
# Expand the list-valued 'features' column into one numeric column per element
# (feature_0, feature_1, ...). The guard makes the cell safe to re-run: once the
# column has been expanded and dropped there is nothing left to do.
if 'features' in features_df.columns:
    feature_rows = features_df['features'].tolist()
    # Positional access to the first row, so this does not depend on an index label 0.
    n_features = len(feature_rows[0]) if feature_rows else 0
    features_expanded = pd.DataFrame(
        feature_rows,
        columns=[f'feature_{i}' for i in range(n_features)],
        index=features_df.index,  # keep rows aligned for the concat below
    )
    features_df = pd.concat([features_df.drop(columns=['features']), features_expanded], axis=1)

# Print column names and first few rows to check the expanded layout
print(features_df.columns)
print(features_df.head())

# Prepare data for modeling: every column except the target is an input,
# which includes 'encoded_entity_name' alongside the feature_* columns.
X = features_df.drop(columns=['encoded_entity_value'])
y = features_df['encoded_entity_value']


Index(['encoded_entity_name', 'encoded_entity_value', 'feature_0', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7',
       ...
       'feature_990', 'feature_991', 'feature_992', 'feature_993',
       'feature_994', 'feature_995', 'feature_996', 'feature_997',
       'feature_998', 'feature_999'],
      dtype='object', length=1002)
   encoded_entity_name  encoded_entity_value  feature_0  feature_1  feature_2  \
0                    3                     7  -3.299573  -4.209081  -3.180969   
1                    7                    76  -2.214603  -2.243215  -0.999183   
2                    1                    12  -6.495326  -2.995445  -1.694585   
3                    0                    40  -3.943774   1.760264   0.826122   
4                    3                    10  -1.886131  -2.038859  -2.236290   

   feature_3  feature_4  feature_5  feature_6  feature_7  ...  feature_990  \
0  -0.735489   0.982856  -3.855928  -6.124228

In [7]:
# Preview the frame after the feature list was expanded into feature_* columns.
features_df.head()

Unnamed: 0,encoded_entity_name,encoded_entity_value,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
0,3,7,-3.299573,-4.209081,-3.180969,-0.735489,0.982856,-3.855928,-6.124228,-2.362949,...,-2.388696,-3.69641,-4.502243,-3.338748,-4.913984,-5.171934,0.420845,-5.645026,-1.203192,2.595617
1,7,76,-2.214603,-2.243215,-0.999183,-0.998531,-0.802855,-0.843652,-2.884283,-0.114411,...,-0.262714,-2.034765,-4.008804,-3.590989,-3.246768,-2.582219,-1.996611,-2.494826,0.456023,-1.245745
2,1,12,-6.495326,-2.995445,-1.694585,-1.302688,-0.531743,-1.468706,-4.796603,-1.348032,...,-1.093952,-3.690505,-4.137578,-3.121323,-1.220875,-2.373565,-0.618034,-3.169698,-1.611693,3.783908
3,0,40,-3.943774,1.760264,0.826122,1.107996,0.007,-1.325379,-1.08626,-1.358907,...,-0.749811,-2.013833,-2.405521,-2.928203,0.183115,-2.732264,-1.353597,-2.588346,0.835229,5.307116
4,3,10,-1.886131,-2.038859,-2.23629,-0.052591,1.068134,2.215033,-2.115244,-2.361414,...,-2.190348,-4.783791,-5.093812,-3.842153,-4.416555,-1.162961,-1.455895,-3.95243,1.734921,2.158779


In [8]:
# from sklearn.preprocessing import MinMaxScaler

# # Initialize MinMaxScaler
# scaler = MinMaxScaler()

# # Select all feature columns for scaling
# feature_columns = [col for col in features_df.columns if col.startswith('feature_')]

# # Fit and transform the selected feature columns
# features_df[feature_columns] = scaler.fit_transform(features_df[feature_columns])

In [9]:
# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the candidate classifiers.
# The dict is deliberately NOT called `models` and the loop variable is NOT called
# `model`: those names belong to the torchvision import and the ResNet feature
# extractor, and overwriting them breaks the earlier cells when they are re-run.
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42)
}

# NOTE(review): the target is the label-encoded entity_value, which appears to have
# many distinct classes relative to the number of rows, so low scores are expected
# here - confirm whether classification is the intended formulation.

# Iterate over classifiers, train, and evaluate
for model_name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate metrics (weighted averages; zero_division=0 reports 0 for
    # undefined per-class scores)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Print metrics
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()  # Print a newline for better readability


Model: Random Forest
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: SVM
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: Logistic Regression
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: K-Nearest Neighbors
Accuracy: 0.0500
Precision: 0.0250
Recall: 0.0500
F1 Score: 0.0333





Model: AdaBoost
Accuracy: 0.0500
Precision: 0.0028
Recall: 0.0500
F1 Score: 0.0053



In [25]:
# Replace the hold-out split with the externally prepared test set, keeping only
# the columns the classifier takes as input.
# NOTE(review): absolute, machine-specific path; also reuses the name X_test that
# previously held the train_test_split hold-out frame.
test_csv_path = r"D:\React18\newshop\Feature-Extraction-from-Images-DEVMATES\dataset\test_data.csv"
X_test = pd.read_csv(test_csv_path).drop(
    columns=['index', 'decoded_entity', 'decoded_entity_name']
)


In [26]:
# Only the last expression of a cell is rendered, so the head() result below is
# computed and discarded; the cell output is the column index.
X_test.head()
X_test.columns

Index(['encoded_entity_name', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8',
       ...
       'feature_990', 'feature_991', 'feature_992', 'feature_993',
       'feature_994', 'feature_995', 'feature_996', 'feature_997',
       'feature_998', 'feature_999'],
      dtype='object', length=1001)

In [27]:
# Show the training columns (presumably to compare them with X_test's columns).
X_train.columns

Index(['encoded_entity_name', 'feature_0', 'feature_1', 'feature_2',
       'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7',
       'feature_8',
       ...
       'feature_990', 'feature_991', 'feature_992', 'feature_993',
       'feature_994', 'feature_995', 'feature_996', 'feature_997',
       'feature_998', 'feature_999'],
      dtype='object', length=1001)

In [28]:
# Final classifier. It gets its own name so the global `model` used by
# extract_features() is not overwritten by this cell.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set (values are label-encoded entity_value class indices)
y_pred = rf_model.predict(X_test)

# Entity to unit mapping
# Units shared by several entity types, listed once and copied into a fresh set
# per entity so that no two entries share the same set object.
_length_units = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_weight_units = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')

# Maps each entity name to the set of units considered valid for it.
entity_unit_map = {
    'width': set(_length_units),
    'depth': set(_length_units),
    'height': set(_length_units),
    'item_weight': set(_weight_units),
    'maximum_weight_recommendation': set(_weight_units),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon',
                    'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Union of every unit that appears for any entity.
allowed_units = {unit for units in entity_unit_map.values() for unit in units}

# Function to format prediction with a unit
def format_prediction(prediction, unit):
    """Render a numeric prediction with two decimals, followed by a space and the unit.

    Args:
        prediction: Number to render (anything supporting the ``.2f`` format spec).
        unit: Unit label appended after the number.

    Returns:
        String of the form ``"<value with 2 decimals> <unit>"``.
    """
    number_text = format(prediction, ".2f")
    return f"{number_text} {unit}"

# Function to assign a random unit from the allowed units of a given entity
def assign_unit(entity):
    """Pick a random unit for ``entity``.

    Units are drawn from ``entity_unit_map[entity]`` when the entity is known,
    otherwise from the full ``allowed_units`` set.

    NOTE(review): the unit is chosen at random, not derived from the data, and no
    random seed is set in this notebook, so results differ between runs.

    Args:
        entity: Entity name to look up in ``entity_unit_map``.

    Returns:
        One unit string.
    """
    # Unknown entities fall back to the pool of every allowed unit.
    candidate_units = entity_unit_map.get(entity, allowed_units)
    return random.choice(list(candidate_units))

# Example entities list based on your dataset
# NOTE(review): the 'entity' column below is still a placeholder cycle, not derived
# from the test rows; X_test['encoded_entity_name'] is the likely real source - confirm.
entities = ['width', 'height', 'item_weight', 'voltage', 'wattage', 'item_volume']  # Replace with actual entity predictions

# y_pred contains LabelEncoder class indices, not physical quantities. Formatting an
# index as "<index>.00 <random unit>" produces meaningless values, so map every
# prediction back to the original entity_value it stands for.
# NOTE(review): assumes the training entity_value entries already carry their unit,
# so no unit is appended here - confirm against the dataset.
decoded_values = entity_value_encoder.inverse_transform(y_pred)

# One record per prediction, in a shape that converts directly to a DataFrame
formatted_predictions = [
    {"index": idx, "entity": entities[idx % len(entities)], "prediction": str(value)}
    for idx, value in enumerate(decoded_values)
]

# Convert the list of formatted predictions to a DataFrame (explicit columns keep
# the CSV header stable even when there are no predictions)
predictions_df = pd.DataFrame(formatted_predictions, columns=["index", "entity", "prediction"])

# Save the DataFrame to a CSV file
predictions_df.to_csv("predicted_entity_values.csv", index=False)

print("Predicted entity values saved to 'predicted_entity_values.csv'.")


Predicted entity values saved to 'predicted_entity_values.csv'.
