In [1]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import torch
import torchvision.transforms as transforms
from torchvision import models
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import re


In [2]:
# Per-channel statistics passed to Normalize below (RGB order).
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Pretrained ResNet-18 used as the image encoder. `eval()` returns the module
# itself, so loading and switching to evaluation mode can be chained.
# NOTE(review): the network is used unmodified, so each image yields 1000
# values (see the feature_0..feature_999 columns further down) — presumably the
# classification-head outputs rather than pooled embeddings; confirm intent.
model = models.resnet18(pretrained=True).eval()

# Image -> tensor pipeline applied before every forward pass:
# resize, centre-crop to 224x224, convert to a tensor, then normalise.
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
]
preprocess = transforms.Compose(_preprocess_steps)




In [3]:
# Function to download image from URL
def download_image(image_url, timeout=30):
    """Download an image over HTTP and return it as an RGB PIL image.

    Parameters
    ----------
    image_url : str
        URL of the image to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    PIL.Image.Image
        The decoded image, converted to ``'RGB'`` mode.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    OSError
        If the response body cannot be decoded as an image by PIL.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an HTML error page to PIL,
    # which would surface as a confusing "cannot identify image file" error.
    response.raise_for_status()
    # The context manager releases the decoder's resources; convert() returns
    # an independent, fully loaded copy that outlives it.
    with Image.open(BytesIO(response.content)) as raw_image:
        return raw_image.convert('RGB')

# Function to extract features from an image
def extract_features(image):
    """Run one image through the module-level ``model`` and return a 1-D array.

    The image is passed through the module-level ``preprocess`` pipeline, given
    a leading batch dimension of size one, and fed to ``model`` with gradient
    tracking disabled. The model output is converted to NumPy and flattened.
    """
    single_image_batch = torch.unsqueeze(preprocess(image), 0)  # batch of one
    with torch.no_grad():
        model_output = model(single_image_batch)
    output_array = model_output.numpy()
    return output_array.flatten()

# Function to extract numeric value from string
def extract_numeric(value):
    """Return the first decimal number found in ``value`` as a float.

    Parameters
    ----------
    value : str
        Text that may contain a number, e.g. ``"10.5 gram"``.

    Returns
    -------
    float or None
        The first number in the text, or ``None`` when the text contains no
        digits or ``value`` is not a string (for example ``None`` or the NaN
        that pandas uses for a missing cell).
    """
    if not isinstance(value, str):
        return None
    # Match a well-formed decimal: digits with an optional fractional part, or
    # a bare fraction such as ".5". A plain character class like [\d.]+ would
    # also match "." or "1.2.3", which float() cannot parse.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", value)
    return float(match.group()) if match else None

# Load dataset
# The location is kept in a named constant so it is easy to spot and change;
# it is an absolute path on the original author's machine.
SAMPLE_TRAIN_CSV = r'D:\Hackathon\Amazon-ML-Hackathon\dataset\sample_train.csv'
df = pd.read_csv(SAMPLE_TRAIN_CSV)

In [5]:
# Label-encode both target columns once, up front. fit_transform learns the
# same sorted class list as fitting on the unique values and then transforming
# row by row, but does it in a single vectorised pass instead of two
# encoder calls per image inside the loop.
entity_name_encoder = LabelEncoder()
entity_value_encoder = LabelEncoder()
encoded_entity_names = entity_name_encoder.fit_transform(df['entity_name'])
encoded_entity_values = entity_value_encoder.fit_transform(df['entity_value'])

# One record per successfully processed image.
results = []
# Index labels of rows whose image could not be fetched or decoded, kept so
# they can be inspected or retried afterwards.
failed_rows = []

# Iterate over the dataset column-wise (cheaper than DataFrame.iterrows and
# all that is needed here: the URL plus the two pre-computed codes).
for row_label, image_url, encoded_entity_name, encoded_entity_value in zip(
    df.index, df['image_link'], encoded_entity_names, encoded_entity_values
):
    # Download and process image. A single dead link or corrupt file should
    # not throw away the features already extracted for every other row, so
    # network and decode errors are reported and the row is skipped.
    try:
        image = download_image(image_url)
        features = extract_features(image)
    except (requests.RequestException, OSError) as exc:
        failed_rows.append(row_label)
        print(f"Skipping row {row_label} ({image_url}): {exc}")
        continue

    results.append({
        'features': features.tolist(),
        'encoded_entity_name': encoded_entity_name,
        'encoded_entity_value': encoded_entity_value
    })

# Convert results to DataFrame
features_df = pd.DataFrame(results)

In [7]:
# Preview the first rows: each holds the raw feature list plus the two encoded label columns.
features_df.head()

Unnamed: 0,features,encoded_entity_name,encoded_entity_value
0,"[-3.299569606781006, -4.209078311920166, -3.18...",3,7
1,"[-2.2146050930023193, -2.2432167530059814, -0....",7,76
2,"[-6.4953227043151855, -2.9954447746276855, -1....",1,12
3,"[-3.9437732696533203, 1.7602670192718506, 0.82...",0,40
4,"[-1.886131763458252, -2.0388600826263428, -2.2...",3,10


In [8]:
# Expand features into separate columns.
# The guard makes this cell safe to re-run: once 'features' has been expanded
# and dropped, running the expansion again would raise a KeyError.
if 'features' in features_df.columns:
    feature_rows = features_df['features'].tolist()
    # Width comes from the first row positionally, not from index label 0,
    # so this does not depend on the frame having a default integer index.
    feature_names = [f'feature_{i}' for i in range(len(feature_rows[0]))]
    # Reuse the source index so concat lines rows up with their labels.
    features_expanded = pd.DataFrame(
        feature_rows, columns=feature_names, index=features_df.index
    )
    features_df = pd.concat(
        [features_df.drop(columns=['features']), features_expanded], axis=1
    )

# Print column names and first few rows to confirm the expansion
print(features_df.columns)
print(features_df.head())

# Prepare data for modeling: every column except the target is a predictor
# (this includes encoded_entity_name alongside the image features).
X = features_df.drop(columns=['encoded_entity_value'])
y = features_df['encoded_entity_value']


Index(['encoded_entity_name', 'encoded_entity_value', 'feature_0', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7',
       ...
       'feature_990', 'feature_991', 'feature_992', 'feature_993',
       'feature_994', 'feature_995', 'feature_996', 'feature_997',
       'feature_998', 'feature_999'],
      dtype='object', length=1002)
   encoded_entity_name  encoded_entity_value  feature_0  feature_1  feature_2  \
0                    3                     7  -3.299570  -4.209078  -3.180965   
1                    7                    76  -2.214605  -2.243217  -0.999182   
2                    1                    12  -6.495323  -2.995445  -1.694584   
3                    0                    40  -3.943773   1.760267   0.826120   
4                    3                    10  -1.886132  -2.038860  -2.236290   

   feature_3  feature_4  feature_5  feature_6  feature_7  ...  feature_990  \
0  -0.735484   0.982860  -3.855927  -6.124227

In [9]:
# Preview the expanded frame: label columns followed by one column per feature value.
features_df.head()

Unnamed: 0,encoded_entity_name,encoded_entity_value,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
0,3,7,-3.29957,-4.209078,-3.180965,-0.735484,0.98286,-3.855927,-6.124227,-2.362946,...,-2.388695,-3.696411,-4.502243,-3.338749,-4.913984,-5.171935,0.420844,-5.645025,-1.20319,2.595614
1,7,76,-2.214605,-2.243217,-0.999182,-0.99853,-0.802852,-0.84365,-2.884282,-0.114408,...,-0.262716,-2.034761,-4.008806,-3.590987,-3.24677,-2.58222,-1.996609,-2.494827,0.456024,-1.245744
2,1,12,-6.495323,-2.995445,-1.694584,-1.302688,-0.531744,-1.468706,-4.796602,-1.348033,...,-1.093956,-3.690507,-4.137578,-3.121324,-1.220873,-2.373567,-0.618037,-3.1697,-1.611694,3.783907
3,0,40,-3.943773,1.760267,0.82612,1.107993,0.006997,-1.325381,-1.08626,-1.358906,...,-0.749811,-2.013833,-2.40552,-2.928204,0.183117,-2.732266,-1.353598,-2.588347,0.83523,5.307117
4,3,10,-1.886132,-2.03886,-2.23629,-0.052591,1.068133,2.215033,-2.115243,-2.361414,...,-2.190347,-4.783792,-5.093812,-3.842153,-4.416555,-1.162963,-1.455896,-3.952432,1.734922,2.158779


In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # Initialize MinMaxScaler
# scaler = MinMaxScaler()

# # Select all feature columns for scaling
# feature_columns = [col for col in features_df.columns if col.startswith('feature_')]

# # Fit and transform the selected feature columns
# features_df[feature_columns] = scaler.fit_transform(features_df[feature_columns])

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by display name.
# Deliberately NOT named `models`: that would shadow the imported
# `torchvision.models` module for the rest of the session.
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42)
}

# NOTE(review): the recorded run of this cell scored at most 5% accuracy for
# every estimator. Presumably the target (label-encoded entity_value strings)
# has too many near-unique classes for this sample size — confirm before
# tuning the estimators themselves.

# Iterate over classifiers, train, and evaluate.
# The loop variable is `clf`, not `model`: `model` is the ResNet that
# extract_features() reads, and rebinding it here would break any later
# re-run of the feature-extraction cell.
for model_name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate metrics (weighted averages; zero_division=0 silences the
    # warning for classes that never appear among the predictions)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Print metrics
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print()  # Print a newline for better readability


Model: Random Forest
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: SVM
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: Logistic Regression
Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000

Model: K-Nearest Neighbors
Accuracy: 0.0500
Precision: 0.0250
Recall: 0.0500
F1 Score: 0.0333

Model: AdaBoost
Accuracy: 0.0500
Precision: 0.0028
Recall: 0.0500
F1 Score: 0.0053

