In [1]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm


In [2]:
# Load and preprocess data
category_attributes = pd.read_parquet('category_attributes.parquet')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df = train_df.drop(columns=['len'])

# Preprocess category-to-attributes dictionary
category_to_attributes = {
    row['Category']: row['Attribute_list']
    for _, row in category_attributes.iterrows()
}

In [3]:
# Fill missing values with random imputation
def random_impute(df, attribute_positions):
    for attr_col in attribute_positions.keys():
        if df[attr_col].isna().sum() > 0:
            non_na_values = df[attr_col].dropna().unique()
            df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
    return df

# Apply random imputation for each category
for _, row in category_attributes.iterrows():
    category = row['Category']
    attributes = row['Attribute_list']
    attribute_positions = {f'attr_{i+1}': attr_name for i, attr_name in enumerate(attributes)}
    category_df = train_df[train_df['Category'] == category]
    filled_category_df = random_impute(category_df, attribute_positions)
    train_df.update(filled_category_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = 

In [4]:
# Replace irrelevant attributes with 'DV' and encode attributes
for i in range(1, 11):
    attr_col = f'attr_{i}'
    train_df[attr_col].fillna('DV', inplace=True)
train_df['id'] = train_df['id'].astype('int64')

encoders = {}
for col in [f'attr_{i}' for i in range(1, 11)]:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col].astype(str))
    encoders[col] = encoder


In [5]:
# Custom ResNet-50 model with multi-layer feature extraction
def create_resnet_feature_extractor(input_shape=(256, 256, 3), reduced_dim=512, fine_tune_at_layer=100):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)

    # Fine-tune from an earlier layer to capture more details
    for layer in base_model.layers[:fine_tune_at_layer]:
        layer.trainable = False

    # Extract features from both final and intermediate layers
    intermediate_layer_model = Model(inputs=base_model.input, outputs=[
        base_model.get_layer("conv4_block1_out").output,  # Intermediate layer
        base_model.get_layer("conv5_block3_out").output   # Final ResNet layer
    ])

    # Add pooling and custom dense layers
    intermediate_output, final_output = intermediate_layer_model.output
    intermediate_output = GlobalAveragePooling2D()(intermediate_output)
    final_output = GlobalAveragePooling2D()(final_output)
    
    # Concatenate intermediate and final layer outputs
    concatenated_output = tf.keras.layers.Concatenate()([intermediate_output, final_output])
    x = Dense(reduced_dim, activation='relu')(concatenated_output)
    x = Dropout(0.5)(x)
    model = Model(inputs=base_model.input, outputs=x)

    model.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy')
    return model

# Initialize feature extractor with multi-layer features
feature_extractor = create_resnet_feature_extractor()


In [6]:
# Function to extract features from images
def extract_resnet_features(img_path, target_size=(256, 256)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0) / 255.0
    features = feature_extractor.predict(img_array)
    return features.flatten()

# Batch-wise image feature extraction
def extract_image_features_batch(df, img_dir, output_path, feature_extractor):
    features = {}
    for img_id in df['id']:
        img_path = os.path.join(img_dir, f"{int(img_id):06}.jpg")
        if os.path.exists(img_path):
            features[img_id] = extract_resnet_features(img_path)
    features_df = pd.DataFrame.from_dict(features, orient='index')
    features_df.to_csv(output_path)
    return features_df

# # Function to extract features from images
# def extract_resnet_features(img_path, feature_extractor, target_size=(256, 256)):
#     img = image.load_img(img_path, target_size=target_size)
#     img_array = image.img_to_array(img)
#     img_array = np.expand_dims(img_array, axis=0) / 255.0
#     features = feature_extractor.predict(img_array)
#     return features.flatten()

# # Batch-wise image feature extraction with progress tracking
# def extract_image_features_batch(df, img_dir, output_path, feature_extractor):
#     features = {}
#     for img_id in tqdm(df['id'], desc="Extracting Features"):
#         img_path = os.path.join(img_dir, f"{int(img_id):06}.jpg")
#         if os.path.exists(img_path):
#             features[img_id] = extract_resnet_features(img_path, feature_extractor)
#     features_df = pd.DataFrame.from_dict(features, orient='index')
#     features_df.to_csv(output_path)
#     return features_df


In [7]:
train_features_path = 'train_image_features_multilayer.csv'

if not os.path.exists(train_features_path):
    train_img_features_df = extract_image_features_batch(train_df, 'train_images', train_features_path, feature_extractor)
else:
    train_img_features_df = pd.read_csv(train_features_path, index_col=0)

In [8]:
train_img_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.022255,0.0,0.0,0.689740,0.865177,0.0,0.000000,0.624610,0.018495,0.0,...,0.0,0.232569,0.0,0.452350,1.064275,1.288730,0.0,2.001462,1.316511,0.479237
1,0.000000,0.0,0.0,0.829285,1.066783,0.0,0.003637,0.502917,0.000000,0.0,...,0.0,0.362627,0.0,0.462133,1.147076,1.244628,0.0,1.971780,1.202177,0.416088
2,0.000000,0.0,0.0,0.823437,1.120516,0.0,0.025263,0.449964,0.000000,0.0,...,0.0,0.422802,0.0,0.458623,1.199253,1.273217,0.0,1.971126,1.201153,0.351132
3,0.000000,0.0,0.0,0.920630,1.118722,0.0,0.062921,0.395592,0.000000,0.0,...,0.0,0.429789,0.0,0.491309,1.184932,1.263458,0.0,1.981495,1.223018,0.425713
4,0.000000,0.0,0.0,0.841864,1.035506,0.0,0.028875,0.489829,0.000000,0.0,...,0.0,0.369076,0.0,0.443651,1.151554,1.247658,0.0,1.969832,1.186001,0.405526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70374,0.000000,0.0,0.0,0.643625,0.967281,0.0,0.000000,0.615319,0.048219,0.0,...,0.0,0.302742,0.0,0.517153,1.073927,1.298169,0.0,1.939384,1.341032,0.420804
70375,0.000000,0.0,0.0,0.712026,1.158837,0.0,0.478972,0.444187,0.240014,0.0,...,0.0,0.672450,0.0,0.611231,1.466686,1.379860,0.0,1.798207,1.186979,0.000000
70376,0.000000,0.0,0.0,0.867143,1.032658,0.0,0.000000,0.573031,0.086450,0.0,...,0.0,0.455007,0.0,0.436562,1.088179,1.338972,0.0,1.917899,1.314930,0.400409
70377,0.011309,0.0,0.0,0.756985,0.985720,0.0,0.000000,0.574852,0.046948,0.0,...,0.0,0.297944,0.0,0.427995,1.107717,1.356371,0.0,2.000670,1.364415,0.464064


In [9]:
# Merge image features with train and test data
train_df = train_df.merge(train_img_features_df, left_on='id', right_index=True)


# Prepare data for XGBoost
train_df['Category'] = train_df['Category'].astype('category').cat.codes
X = train_df.drop(columns=['id', 'attr_1', 'attr_2', 'attr_3', 'attr_4', 'attr_5', 
                           'attr_6', 'attr_7', 'attr_8', 'attr_9', 'attr_10'])
y_attributes = train_df[['attr_1', 'attr_2', 'attr_3', 'attr_4', 'attr_5', 
                         'attr_6', 'attr_7', 'attr_8', 'attr_9', 'attr_10']]

# Split data for training and validation
X_train, X_val, y_train_attrs, y_val_attrs = train_test_split(X, y_attributes, test_size=0.2, random_state=42)


In [11]:
X_train

Unnamed: 0,Category,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
33510,4,0.007645,0.0,0.000000,0.844502,0.924122,0.0,0.000000,0.501675,0.000000,...,0.0,0.362035,0.0,0.467673,1.165974,1.293850,0.0,2.014301,1.277717,0.469044
12486,2,0.005848,0.0,0.000000,0.731659,0.939519,0.0,0.000000,0.484815,0.039116,...,0.0,0.306591,0.0,0.508091,1.130859,1.194992,0.0,1.950154,1.215159,0.431288
41888,4,0.043288,0.0,0.000000,0.791480,0.977552,0.0,0.038242,0.575311,0.098381,...,0.0,0.417813,0.0,0.472730,1.122290,1.296171,0.0,1.944820,1.318709,0.374003
64755,3,0.000000,0.0,0.000000,0.684672,1.042818,0.0,0.051466,0.603697,0.000000,...,0.0,0.461360,0.0,0.470313,1.071408,1.411013,0.0,1.844043,1.418078,0.320643
43956,4,0.000000,0.0,0.000000,0.692414,0.909816,0.0,0.000000,0.633609,0.042000,...,0.0,0.353771,0.0,0.474872,1.126492,1.340509,0.0,1.955878,1.339561,0.333649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,4,0.019845,0.0,0.016161,0.964591,1.173508,0.0,0.140219,0.314867,0.050530,...,0.0,0.427887,0.0,0.472412,1.183986,1.249412,0.0,1.956842,1.329571,0.390479
6265,1,0.000000,0.0,0.000000,0.952198,1.008503,0.0,0.337828,0.406006,0.035491,...,0.0,0.436505,0.0,0.459295,1.307488,1.178103,0.0,1.797342,1.186611,0.166288
54886,3,0.041388,0.0,0.001812,0.848586,1.039537,0.0,0.000000,0.502696,0.079170,...,0.0,0.310190,0.0,0.459634,1.081609,1.255658,0.0,1.942527,1.300690,0.468613
860,1,0.000000,0.0,0.000000,0.818041,1.054909,0.0,0.144149,0.560007,0.000000,...,0.0,0.480492,0.0,0.413958,1.213924,1.369703,0.0,1.892915,1.289477,0.283148


In [12]:
# Train an XGBoost model for each attribute
models = {}
f1_scores = []

for i, attr in enumerate(y_attributes.columns):
    print(f"\nTraining XGBoost model for {attr}")
    
    y_train = y_train_attrs[attr]
    y_val = y_val_attrs[attr]
    
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y_train)),
        eval_metric='mlogloss',
        use_label_encoder=False
    )
    
    model.fit(X_train, y_train)
    models[attr] = model
    
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    f1_scores.append(f1)
    
    print(f"F1 Score for {attr}: {f1:.4f}")

# Calculate and print average F1 score
average_f1_score = np.mean(f1_scores)
print(f"\nAverage F1 Score across all attributes: {average_f1_score:.4f}")



Training XGBoost model for attr_1


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_1: 0.4748

Training XGBoost model for attr_2


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_2: 0.6648

Training XGBoost model for attr_3


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_3: 0.7426

Training XGBoost model for attr_4


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_4: 0.6753

Training XGBoost model for attr_5


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_5: 0.6796

Training XGBoost model for attr_6


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_6: 0.6724

Training XGBoost model for attr_7


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_7: 0.6572

Training XGBoost model for attr_8


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_8: 0.6926

Training XGBoost model for attr_9


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_9: 0.7707

Training XGBoost model for attr_10


Parameters: { "use_label_encoder" } are not used.



F1 Score for attr_10: 0.7420

Average F1 Score across all attributes: 0.6772


In [13]:
test_features_path = 'test_image_features_multilayer.csv'
if not os.path.exists(test_features_path):
    test_img_features_df = extract_image_features_batch(test_df, 'test_images', test_features_path, feature_extractor)
else:
    test_img_features_df = pd.read_csv(test_features_path, index_col=0)
test_df = test_df.merge(test_img_features_df, left_on='id', right_index=True)

In [14]:
# Prepare test data and make predictions
test_df['Category'] = test_df['Category'].astype('category').cat.codes
X_test_final = test_df.drop(columns=['id'])


In [15]:
X_test_final

Unnamed: 0,Category,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,1,0.0,0.0,0.000000,0.660861,0.916551,0.0,0.127660,0.705858,0.113085,...,0.0,0.670076,0.0,0.597402,1.171146,1.415220,0.0,1.944925,1.313812,0.207337
1,1,0.0,0.0,0.000000,0.776056,0.979775,0.0,0.000000,0.514789,0.000000,...,0.0,0.346264,0.0,0.435001,1.171969,1.371264,0.0,1.930488,1.276301,0.414265
2,1,0.0,0.0,0.000000,0.722993,0.931115,0.0,0.000000,0.607304,0.030614,...,0.0,0.413042,0.0,0.531207,1.123717,1.363787,0.0,1.926084,1.337384,0.325754
3,1,0.0,0.0,0.047061,0.797601,1.088529,0.0,0.182216,0.243571,0.038832,...,0.0,0.423012,0.0,0.452084,1.076712,1.180552,0.0,1.680155,1.175131,0.460504
4,1,0.0,0.0,0.000000,0.781933,1.039646,0.0,0.199993,0.486625,0.088393,...,0.0,0.679748,0.0,0.387622,1.084767,1.231349,0.0,1.836759,1.064677,0.325395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30200,3,0.0,0.0,0.000000,0.646956,0.896937,0.0,0.000000,0.682525,0.023387,...,0.0,0.224227,0.0,0.520822,1.113395,1.365531,0.0,1.984479,1.317122,0.434052
30201,3,0.0,0.0,0.000000,0.721103,0.955345,0.0,0.000000,0.600525,0.013362,...,0.0,0.260498,0.0,0.503209,1.075390,1.245884,0.0,1.962016,1.298716,0.410173
30202,3,0.0,0.0,0.000000,0.738599,1.005875,0.0,0.000000,0.598262,0.013731,...,0.0,0.337793,0.0,0.487084,1.099078,1.362642,0.0,1.965711,1.379475,0.450563
30203,3,0.0,0.0,0.000000,0.832554,1.043268,0.0,0.000000,0.486623,0.014042,...,0.0,0.421896,0.0,0.430027,1.130291,1.344029,0.0,1.942652,1.281846,0.485166


In [16]:
predictions = {}
for i, (attr, model) in enumerate(models.items()):
    print(f"Making predictions for {attr}")
    predictions[attr] = model.predict(X_test_final)

# Decode predictions back to original label values
predicted_attributes = {
    attr: encoders[attr].inverse_transform(predictions[attr])
    for attr in y_attributes.columns
}
predicted_df = pd.DataFrame(predicted_attributes)

# Concatenate predictions with 'id' and 'Category' columns for submission
test_predictions = pd.concat([test_df[['id', 'Category']], predicted_df], axis=1)

# Map encoded 'Category' values back to original labels
original_test_df = pd.read_csv('test.csv')
category_mapping = dict(enumerate(original_test_df['Category'].astype('category').cat.categories))
test_predictions['Category'] = test_predictions['Category'].map(category_mapping)


Making predictions for attr_1
Making predictions for attr_2
Making predictions for attr_3
Making predictions for attr_4
Making predictions for attr_5
Making predictions for attr_6
Making predictions for attr_7
Making predictions for attr_8
Making predictions for attr_9
Making predictions for attr_10


In [17]:
# Save predictions to CSV
test_predictions.to_csv('submission_ft_rnt_x_516_1.csv', index=False)
print("submission_finetuned_rs_x.csv file saved successfully!")

submission_finetuned_rs_x.csv file saved successfully!
