In [1]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import f1_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tqdm import tqdm
import optuna

In [2]:
# Load and preprocess data
category_attributes = pd.read_parquet('category_attributes.parquet')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df = train_df.drop(columns=['len'])

# Preprocess category-to-attributes dictionary
category_to_attributes = {
    row['Category']: row['Attribute_list']
    for _, row in category_attributes.iterrows()
}

In [3]:
# Fill missing values with random imputation
def random_impute(df, attribute_positions):
    for attr_col in attribute_positions.keys():
        if df[attr_col].isna().sum() > 0:
            non_na_values = df[attr_col].dropna().unique()
            df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
    return df

# Apply random imputation for each category
for _, row in category_attributes.iterrows():
    category = row['Category']
    attributes = row['Attribute_list']
    attribute_positions = {f'attr_{i+1}': attr_name for i, attr_name in enumerate(attributes)}
    category_df = train_df[train_df['Category'] == category]
    filled_category_df = random_impute(category_df, attribute_positions)
    train_df.update(filled_category_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = df[attr_col].apply(lambda x: random.choice(non_na_values) if pd.isna(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[attr_col] = 

In [4]:
# Replace irrelevant attributes with 'DV' and encode attributes
for i in range(1, 11):
    attr_col = f'attr_{i}'
    train_df[attr_col].fillna('DV', inplace=True)
train_df['id'] = train_df['id'].astype('int64')

encoders = {}
for col in [f'attr_{i}' for i in range(1, 11)]:
    encoder = LabelEncoder()
    train_df[col] = encoder.fit_transform(train_df[col].astype(str))
    encoders[col] = encoder


In [5]:
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential

def create_custom_cnn(input_shape=(128, 128, 3), reduced_dim=512):
    model = Sequential([
        Conv2D(96, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Dropout(0.4),

        Conv2D(256, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.4),

        Conv2D(384, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.4),

        Conv2D(256, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.5),
        
        GlobalAveragePooling2D(),  # Pool across all spatial dimensions to get a feature vector
        Dense(reduced_dim, activation='relu'),
        Dropout(0.4)
    ])

    model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy')
    return model

# Initialize custom CNN feature extractor
feature_extractor = create_custom_cnn()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Function to extract features from images
def extract_custom_cnn_features(img_path, target_size=(128, 128)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0) / 255.0
    features = feature_extractor.predict(img_array)
    return features.flatten()


# Batch-wise image feature extraction
def extract_image_features_batch(df, img_dir, output_path, feature_extractor):
    features = {}
    for img_id in df['id']:
        img_path = os.path.join(img_dir, f"{int(img_id):06}.jpg")
        if os.path.exists(img_path):
            features[img_id] = extract_custom_cnn_features(img_path)
    features_df = pd.DataFrame.from_dict(features, orient='index')
    features_df.to_csv(output_path)
    return features_df



In [7]:
train_features_path = 'train_image_features_multilayer_2.csv'

if not os.path.exists(train_features_path):
    train_img_features_df = extract_image_features_batch(train_df, 'train_images', train_features_path, feature_extractor)
else:
    train_img_features_df = pd.read_csv(train_features_path, index_col=0)

In [8]:
train_img_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.0,0.005409,0.0,0.0,0.009460,0.0,0.007700,0.0,0.020625,0.011393,...,0.015672,0.0,0.008304,0.020765,0.003993,0.000000,0.016222,0.000164,0.017930,0.003970
1,0.0,0.003050,0.0,0.0,0.001893,0.0,0.015424,0.0,0.039768,0.012066,...,0.029899,0.0,0.009582,0.018250,0.017895,0.000000,0.010045,0.000000,0.039062,0.029877
2,0.0,0.001412,0.0,0.0,0.002201,0.0,0.021935,0.0,0.042767,0.013229,...,0.029253,0.0,0.008133,0.018146,0.015689,0.003610,0.007465,0.000000,0.043219,0.027267
3,0.0,0.004688,0.0,0.0,0.006735,0.0,0.015019,0.0,0.041632,0.014679,...,0.029391,0.0,0.009342,0.024435,0.011317,0.000000,0.012909,0.000000,0.039925,0.022227
4,0.0,0.001384,0.0,0.0,0.002471,0.0,0.019506,0.0,0.044680,0.013551,...,0.032837,0.0,0.009510,0.020025,0.018581,0.001886,0.010590,0.000000,0.045064,0.029458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70374,0.0,0.006419,0.0,0.0,0.004811,0.0,0.023222,0.0,0.043504,0.013253,...,0.027202,0.0,0.009714,0.017239,0.006776,0.006258,0.014984,0.000000,0.046916,0.016415
70375,0.0,0.006930,0.0,0.0,0.008637,0.0,0.019204,0.0,0.033372,0.017676,...,0.030136,0.0,0.011495,0.021976,0.008498,0.004063,0.017001,0.000000,0.037098,0.019618
70376,0.0,0.006383,0.0,0.0,0.005898,0.0,0.015584,0.0,0.035437,0.014044,...,0.025806,0.0,0.006933,0.016043,0.007201,0.003737,0.012383,0.000000,0.038345,0.013832
70377,0.0,0.011287,0.0,0.0,0.000000,0.0,0.017004,0.0,0.024785,0.013025,...,0.018171,0.0,0.003574,0.016109,0.003295,0.004139,0.012210,0.000000,0.025680,0.014510


In [9]:
# Merge image features with train and test data
train_df = train_df.merge(train_img_features_df, left_on='id', right_index=True)


# Prepare data for XGBoost
train_df['Category'] = train_df['Category'].astype('category').cat.codes
X = train_df.drop(columns=['id', 'attr_1', 'attr_2', 'attr_3', 'attr_4', 'attr_5', 
                           'attr_6', 'attr_7', 'attr_8', 'attr_9', 'attr_10'])
y_attributes = train_df[['attr_1', 'attr_2', 'attr_3', 'attr_4', 'attr_5', 
                         'attr_6', 'attr_7', 'attr_8', 'attr_9', 'attr_10']]

# Split data for training and validation
X_train, X_val, y_train_attrs, y_val_attrs = train_test_split(X, y_attributes, test_size=0.2, random_state=42)


In [10]:
X_train

Unnamed: 0,Category,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
33510,4,0.0,0.004107,0.0,0.0,0.008559,0.0,0.011779,0.0,0.031383,...,0.028108,0.0,0.010251,0.023820,0.002353,0.003159,0.016683,0.0,0.032450,0.014590
12486,2,0.0,0.002727,0.0,0.0,0.004705,0.0,0.019031,0.0,0.038607,...,0.028184,0.0,0.007356,0.011114,0.012857,0.004555,0.009663,0.0,0.041517,0.021430
41888,4,0.0,0.015173,0.0,0.0,0.012154,0.0,0.006233,0.0,0.028777,...,0.027621,0.0,0.007092,0.020835,0.008168,0.000000,0.023707,0.0,0.026803,0.019949
64755,3,0.0,0.001908,0.0,0.0,0.003066,0.0,0.016635,0.0,0.042017,...,0.028485,0.0,0.007496,0.017294,0.015123,0.002104,0.010007,0.0,0.044094,0.021809
43956,4,0.0,0.002575,0.0,0.0,0.002628,0.0,0.023485,0.0,0.042978,...,0.031461,0.0,0.008073,0.013138,0.016818,0.006505,0.008621,0.0,0.047203,0.027273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,4,0.0,0.005506,0.0,0.0,0.004087,0.0,0.013905,0.0,0.032411,...,0.026779,0.0,0.007513,0.018332,0.009807,0.002647,0.011420,0.0,0.034423,0.017569
6265,1,0.0,0.006141,0.0,0.0,0.006325,0.0,0.015034,0.0,0.035154,...,0.025459,0.0,0.009649,0.022311,0.007189,0.003242,0.014461,0.0,0.033343,0.018182
54886,3,0.0,0.008849,0.0,0.0,0.008235,0.0,0.010837,0.0,0.030932,...,0.026910,0.0,0.008226,0.018314,0.009991,0.000000,0.017318,0.0,0.032974,0.018676
860,1,0.0,0.000706,0.0,0.0,0.002273,0.0,0.017759,0.0,0.035360,...,0.024247,0.0,0.008418,0.015132,0.016607,0.004433,0.009365,0.0,0.039062,0.021308


In [14]:
def optimize_xgboost(trial, X_train, y_train, X_val, y_val, num_classes):
    # Suggest hyperparameters to be tuned
    params = {
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'use_label_encoder': False,
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10, log=True)
    }
    
    # Train the model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    
    # Predict and calculate F1 score
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    return -f1  # Optuna minimizes, so return negative F1 score

In [15]:
# Train an XGBoost model for each attribute with Optuna optimization
optimized_models = {}
f1_scores = []

for i, attr in enumerate(y_attributes.columns):
    print(f"\nOptimizing XGBoost model for {attr}")
    
    y_train = y_train_attrs[attr]
    y_val = y_val_attrs[attr]
    num_classes = len(np.unique(y_train))
    
    # Define an Optuna study
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: optimize_xgboost(trial, X_train, y_train, X_val, y_val, num_classes), n_trials=5)
    
    print(f"Best parameters for {attr}: {study.best_params}")
    
    # Train final model with best parameters
    best_params = study.best_params
    best_params.update({
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'use_label_encoder': False,
    })
    
    model = xgb.XGBClassifier(**best_params)
    model.fit(X_train, y_train)
    optimized_models[attr] = model
    
    # Evaluate final model
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    f1_scores.append(f1)
    
    print(f"F1 Score for {attr}: {f1:.4f}")

# Calculate and print average F1 score
average_f1_score = np.mean(f1_scores)
print(f"\nAverage F1 Score across all attributes: {average_f1_score:.4f}")

[I 2024-11-16 23:19:50,797] A new study created in memory with name: no-name-7270a0e8-3a6b-44e9-ac5c-42268fe4d616



Optimizing XGBoost model for attr_1


Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:25:23,738] Trial 0 finished with value: -0.5657634828193087 and parameters: {'eta': 0.052385660243620555, 'max_depth': 9, 'min_child_weight': 0.002463788084111508}. Best is trial 0 with value: -0.5657634828193087.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:28:01,516] Trial 1 finished with value: -0.5585216393898358 and parameters: {'eta': 0.07846989888405989, 'max_depth': 7, 'min_child_weight': 2.605622463295473}. Best is trial 0 with value: -0.5657634828193087.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:31:45,647] Trial 2 finished with value: -0.5666439980968322 and parameters: {'eta': 0.09548627608036607, 'max_depth': 8, 'min_child_weight': 0.23637747683420576}. Best is trial 2 with value: -0.5666439980968322.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:33:16,370] Trial 3 finished with value: -0.5053989569271421 and parameters: {'eta': 0.079778

Best parameters for attr_1: {'eta': 0.09548627608036607, 'max_depth': 8, 'min_child_weight': 0.23637747683420576}


Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:41:07,987] A new study created in memory with name: no-name-615fb4b0-d14e-4521-bf0b-dbef0e66597b


F1 Score for attr_1: 0.5666

Optimizing XGBoost model for attr_2


Parameters: { "use_label_encoder" } are not used.

[I 2024-11-16 23:45:20,746] Trial 0 finished with value: -0.640963410760646 and parameters: {'eta': 0.010349397990232982, 'max_depth': 10, 'min_child_weight': 0.22545118131346978}. Best is trial 0 with value: -0.640963410760646.
Parameters: { "use_label_encoder" } are not used.

[W 2024-11-16 23:45:48,724] Trial 1 failed with parameters: {'eta': 0.17530893673116546, 'max_depth': 4, 'min_child_weight': 1.1784453047233683} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\Ghosl\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Ghosl\AppData\Local\Temp\ipykernel_15456\543810778.py", line 14, in <lambda>
    study.optimize(lambda trial: optimize_xgboost(trial, X_train, y_train, X_val, y_val, num_classes), n_trials=5)
                                 ^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
test_features_path = 'test_image_features_multilayer_2.csv'
if not os.path.exists(test_features_path):
    test_img_features_df = extract_image_features_batch(test_df, 'test_images', test_features_path, feature_extractor)
else:
    test_img_features_df = pd.read_csv(test_features_path, index_col=0)
test_df = test_df.merge(test_img_features_df, left_on='id', right_index=True)

In [None]:
# Prepare test data and make predictions
test_df['Category'] = test_df['Category'].astype('category').cat.codes
X_test_final = test_df.drop(columns=['id'])


In [None]:
X_test_final

In [None]:
predictions = {}
for i, (attr, model) in enumerate(models.items()):
    print(f"Making predictions for {attr}")
    predictions[attr] = model.predict(X_test_final)

# Decode predictions back to original label values
predicted_attributes = {
    attr: encoders[attr].inverse_transform(predictions[attr])
    for attr in y_attributes.columns
}
predicted_df = pd.DataFrame(predicted_attributes)

# Concatenate predictions with 'id' and 'Category' columns for submission
test_predictions = pd.concat([test_df[['id', 'Category']], predicted_df], axis=1)

# Map encoded 'Category' values back to original labels
original_test_df = pd.read_csv('test.csv')
category_mapping = dict(enumerate(original_test_df['Category'].astype('category').cat.categories))
test_predictions['Category'] = test_predictions['Category'].map(category_mapping)


In [None]:
# Save predictions to CSV
test_predictions.to_csv('submission_vistax_amit_rs_x_finetune_2_multi_3.csv', index=False)
print("submission_finetuned_rs_x.csv file saved successfully!")

In [None]:
# # Function to extract features from images
# def extract_resnet_features(img_path, feature_extractor, target_size=(256, 256)):
#     img = image.load_img(img_path, target_size=target_size)
#     img_array = image.img_to_array(img)
#     img_array = np.expand_dims(img_array, axis=0) / 255.0
#     features = feature_extractor.predict(img_array)
#     return features.flatten()

# # Batch-wise image feature extraction with progress tracking
# def extract_image_features_batch(df, img_dir, output_path, feature_extractor):
#     features = {}
#     for img_id in tqdm(df['id'], desc="Extracting Features"):
#         img_path = os.path.join(img_dir, f"{int(img_id):06}.jpg")
#         if os.path.exists(img_path):
#             features[img_id] = extract_resnet_features(img_path, feature_extractor)
#     features_df = pd.DataFrame.from_dict(features, orient='index')
#     features_df.to_csv(output_path)
#     return features_df
