In [None]:
import os
import time
import torch
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torchvision import transforms

# Register tqdm's progress-bar variants of pandas methods (e.g. `progress_apply`).
# None of the cells shown in this notebook call them; plain `.apply` is used instead.
tqdm.pandas()

In [None]:
class Config:
    """Run-wide settings shared by the cells of this notebook.

    Attributes set here:
        device: 'cuda:0' when CUDA is available, otherwise 'cpu'.
        seed: value handed to the random number generators and to the split.
        n_val_samples: number of rows held out for validation (a count, not a fraction).
        target_columns: names of the regression target columns.
    """

    def __init__(self):
        # Pick the first CUDA device when one is visible; fall back to the CPU.
        if torch.cuda.is_available():
            self.device = 'cuda:0'
        else:
            self.device = 'cpu'

        self.seed = 999
        self.n_val_samples = 4096
        self.target_columns = [
            'X4_mean',
            'X11_mean',
            'X18_mean',
            'X26_mean',
            'X50_mean',
            'X3112_mean',
        ]

def set_seed(seed_value: int):
    """Seed every random number generator this notebook can touch.

    Seeds Python's `random` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and exports PYTHONHASHSEED.

    Args:
        seed_value: integer seed applied to all generators.
    """
    # Function-scope import: `random` is not part of the notebook's import cell.
    import random

    # PYTHONHASHSEED is only read at interpreter start-up, so this affects
    # child processes launched from here, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed_value)

# Build the shared settings object and seed the random number generators
# before any data is split or any model is created. `CONFIG` is read by the
# cells below (target columns, seed, validation size, device).
CONFIG = Config()
set_seed(CONFIG.seed)

# Report which device was selected so a silent CPU fallback is visible.
print(f"Currently using device: {CONFIG.device}")

In [None]:
# Define paths
# The project root defaults to the original author's machine. Set the
# CS680_BASE_PATH environment variable to run the notebook elsewhere without
# editing this cell.
base_path = os.environ.get('CS680_BASE_PATH', 'C:/Users/vince/PyCharmProjects/cs680')
dataset_path = os.path.join(base_path, 'data/cs-480-2024-spring/data')
train_img_dir = os.path.join(dataset_path, 'train_images')
test_img_dir = os.path.join(dataset_path, 'test_images')

print("Paths defined")

# Load data
# Each CSV gets a `file_path` column appended as its LAST column, pointing at
# '<image dir>/<id>.jpeg'. A later cell relies on that column position when it
# derives the tabular feature names.
train_fullset = pd.read_csv(os.path.join(dataset_path, 'train.csv'))
train_fullset['file_path'] = train_fullset['id'].apply(lambda s: os.path.join(train_img_dir, f'{s}.jpeg'))
print(train_fullset.head(2).to_string())

test = pd.read_csv(os.path.join(dataset_path, 'test.csv'))
test['file_path'] = test['id'].apply(lambda s: os.path.join(test_img_dir, f'{s}.jpeg'))
print(test.head(2).to_string())
print("Data loaded")

In [None]:
# Tabular feature names: every column of `test` except the first and the last.
# The last column is the `file_path` added when the CSV was loaded; the first
# is presumably `id` -- TODO confirm against test.csv.
CONFIG.FEATURE_COLUMNS = test.columns[1:-1].to_numpy()

# Hold out a fixed number of rows (CONFIG.n_val_samples) for validation and
# give both parts a fresh 0..n-1 index.
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_fullset,
        test_size=CONFIG.n_val_samples,
        shuffle=True,
        random_state=CONFIG.seed,
    )
)

print("Train-validation split complete")

In [None]:
def generate_mask(dataframe, stats_df, columns=None, lower_label='0.1%', upper_label='99%'):
    """Flag the rows whose values lie strictly inside per-column bounds.

    A row is kept only if, for every column in `columns`, its value is
    strictly greater than the lower bound and strictly less than the upper
    bound. NaN values fail both comparisons, so such rows are dropped.

    Args:
        dataframe: frame containing all of `columns`.
        stats_df: frame indexed by column name whose `lower_label` and
            `upper_label` columns hold the bounds.
        columns: columns to check; defaults to `CONFIG.target_columns`.
        lower_label: name of the lower-bound column in `stats_df`.
        upper_label: name of the upper-bound column in `stats_df`.

    Returns:
        1-D boolean NumPy array with one entry per row of `dataframe`
        (all True when `columns` is empty).
    """
    if columns is None:
        columns = CONFIG.target_columns

    keep = np.ones(len(dataframe), dtype=bool)
    for column in columns:
        column_values = dataframe[column].to_numpy()
        lower_bound = stats_df.loc[column, lower_label]
        upper_bound = stats_df.loc[column, upper_label]
        # AND the per-column test into the running row mask.
        keep &= (column_values > lower_bound) & (column_values < upper_bound)

    return keep

# Bounds come from the TRAINING split only (0.1th and 99th percentiles of each
# target, rounded to 3 decimals) and are then applied to both splits.
labels_describe_df = train[CONFIG.target_columns].describe(percentiles=[0.001, 0.99]).round(3).T

mask_train = generate_mask(train, labels_describe_df)
mask_val = generate_mask(val, labels_describe_df)

train_mask = train[mask_train].reset_index(drop=True)
val_mask = val[mask_val].reset_index(drop=True)

# Report how many rows the outlier filter removed from each split.
for mask, subset_name, data in zip([train_mask, val_mask], ['train', 'val'], [train, val]):
    subset_size = len(data)
    masked_count = subset_size - len(mask)
    # Guard against an empty split so the report cannot divide by zero.
    masked_percentage = (masked_count / subset_size) * 100 if subset_size else 0.0
    print(f"{subset_name}: masked {masked_count} of {subset_size} rows ({masked_percentage:.2f}%)")

# Standardise the tabular features; the scaler is fit on the filtered training
# rows and reused unchanged for validation and test.
feature_scaler = StandardScaler()
train_features_scaled = feature_scaler.fit_transform(train_mask[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))
val_features_scaled = feature_scaler.transform(val_mask[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))
test_features_scaled = feature_scaler.transform(test[CONFIG.FEATURE_COLUMNS].values.astype(np.float32))

# NOTE: despite the `_scaled` suffix, the targets are used as-is (no scaling
# is applied here), so model predictions are in the original target units.
y_train_scaled = train_mask[CONFIG.target_columns].values
y_val_scaled = val_mask[CONFIG.target_columns].values


In [None]:
def extract_image_embeddings(model, preprocess_pipeline, batch_size, dataframe):
    """Run `model` over the images listed in `dataframe['file_path']`.

    Images are loaded in batches of `batch_size`, converted to RGB, passed
    through `preprocess_pipeline`, stacked, moved to `CONFIG.device` and fed to
    `model` with gradients disabled.

    Args:
        model: callable taking a stacked image tensor and returning a tensor
            whose first axis is the batch (assumed 2-D, (batch, dim) -- TODO
            confirm for the backbone in use).
        preprocess_pipeline: callable mapping a PIL image to a tensor; all
            tensors in a batch must share one shape for `torch.stack`.
        batch_size: number of images per forward pass.
        dataframe: frame with a `file_path` column of image paths.

    Returns:
        List with one NumPy array per image, in the row order of `dataframe`
        (empty list for an empty frame).
    """
    # Plain list => unambiguous positional slicing regardless of the frame's index.
    file_paths = dataframe['file_path'].tolist()

    embeddings = []
    for start_idx in tqdm(range(0, len(file_paths), batch_size)):
        batch_images = []
        for path in file_paths[start_idx:start_idx + batch_size]:
            # Close the file handle deterministically, and force 3 channels so
            # grayscale / CMYK / RGBA files do not break a 3-channel Normalize.
            with Image.open(path) as image:
                batch_images.append(preprocess_pipeline(image.convert('RGB')))
        image_tensor = torch.stack(batch_images).to(CONFIG.device)
        with torch.no_grad():
            batch_embeddings = model(image_tensor)
        # extend() iterates the first axis, appending one array per image.
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Load the DINOv2 'dinov2_vitg14_reg' backbone from torch.hub and use it as a
# frozen feature extractor (eval mode; gradients are disabled inside
# extract_image_embeddings).
# NOTE: the name `model` is rebound to a CatBoost regressor in the training
# cell, so this cell must be re-run before extracting embeddings again.
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(CONFIG.device)
model.eval()
preprocess_pipeline = transforms.Compose([
    # Explicit enum instead of the magic integer 3 (PIL's bicubic code).
    # NOTE(review): Resize(224) only fixes the shorter side; batching assumes
    # all images share one aspect ratio -- confirm, or add a CenterCrop.
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

batch_size = 64
file_suffix = 'image_embs_dinov2_vitg14_reg'
# Embeddings are computed for the outlier-filtered train/val frames and for the
# full test frame, and each set is also written to '<split>_<file_suffix>.npy'
# in the current working directory.
train_image_embeddings = extract_image_embeddings(model, preprocess_pipeline, batch_size, train_mask)
np.save(f'train_{file_suffix}', np.array(train_image_embeddings))
val_image_embeddings = extract_image_embeddings(model, preprocess_pipeline, batch_size, val_mask)
np.save(f'val_{file_suffix}', np.array(val_image_embeddings))
test_image_embeddings = extract_image_embeddings(model, preprocess_pipeline, batch_size, test)
np.save(f'test_{file_suffix}', np.array(test_image_embeddings))

In [None]:
# Keep only the first `poly_features` columns of the degree-2 expansion.
poly_features = 2000

# Fit the expander once on the training features and reuse it, so val/test are
# guaranteed the same column layout (a column-count mismatch now raises).
poly_expander = PolynomialFeatures(2).fit(train_features_scaled)

train_poly_feats = poly_expander.transform(train_features_scaled)[:, :poly_features]
train_features_mask_all = np.concatenate((train_poly_feats, train_image_embeddings), axis=1)

val_poly_feats = poly_expander.transform(val_features_scaled)[:, :poly_features]
val_features_mask_all = np.concatenate((val_poly_feats, val_image_embeddings), axis=1)

test_poly_feats = poly_expander.transform(test_features_scaled)[:, :poly_features]
test_features_all = np.concatenate((test_poly_feats, test_image_embeddings), axis=1)

# Each frame holds the numeric columns (polynomial features followed by the
# embedding values) plus an 'emb' column carrying the per-row embedding vector,
# which the training cell passes to CatBoost as an embedding feature.
# NOTE(review): the embeddings therefore enter the model twice (as flat numeric
# columns and as 'emb') -- confirm this duplication is intended.
train_features_mask_df = pd.DataFrame(train_features_mask_all)
train_features_mask_df['emb'] = train_image_embeddings

val_features_mask_df = pd.DataFrame(val_features_mask_all)
val_features_mask_df['emb'] = val_image_embeddings

test_features_mask_df = pd.DataFrame(test_features_all)
test_features_mask_df['emb'] = test_image_embeddings

In [None]:
# Train one CatBoost regressor per target column and record its metrics.
models = {}
scores = {}
for i, col in tqdm(enumerate(CONFIG.target_columns), total=len(CONFIG.target_columns)):
    # Column i of the label matrices holds the values for target `col`.
    y_curr = y_train_scaled[:, i]
    y_curr_val = y_val_scaled[:, i]
    train_pool = Pool(train_features_mask_df, y_curr, embedding_features=['emb'])
    val_pool = Pool(val_features_mask_df, y_curr_val, embedding_features=['emb'])

    # NOTE: this rebinds the global name `model`, which an earlier cell used
    # for the image backbone.
    # Training stops early after 100 rounds without improvement on val_pool and
    # keeps the best iteration.
    model = CatBoostRegressor(iterations=2500, learning_rate=0.06, loss_function='RMSE', verbose=1000, random_state=CONFIG.seed)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=100)
    models[col] = model

    y_train_pred = model.predict(train_pool)
    y_curr_val_pred = model.predict(val_pool)

    mae = mean_absolute_error(y_curr, y_train_pred)
    r2 = r2_score(y_curr, y_train_pred)
    val_mae = mean_absolute_error(y_curr_val, y_curr_val_pred)
    r2_col = r2_score(y_curr_val, y_curr_val_pred)
    # 'loss' / 'val_loss' are aliases of the MAE values, kept for the report.
    loss = mae
    val_loss = val_mae

    scores[col] = {
        'MAE': mae,
        'R2' : r2,
        'val_MAE': val_mae,
        'val_R2': r2_col,
        'loss': loss,
        'val_loss': val_loss
    }
    print(f'Target: {col} | MAE: {mae:.3f} | val_MAE: {val_mae:.3f} |r2: {r2:.3f}| val_R2: {r2_col:.3f} | loss (MAE): {loss:.3f} | val_loss (MAE): {val_loss:.3f}')

# Average each metric over the targets and actually show the result, so the
# headline numbers are visible in the cell output.
mean_scores = {
    metric: np.mean([score[metric] for score in scores.values()])
    for metric in ('MAE', 'R2', 'val_MAE', 'val_R2')
}
print('Mean over targets | ' + ' | '.join(f'{metric}: {value:.3f}' for metric, value in mean_scores.items()))

In [None]:
# Start from the test ids and pre-create one column per target (this fixes the
# column order), then strip the '_mean' suffix from the column names.
submission = pd.DataFrame({'id': test['id']})
submission[CONFIG.target_columns] = 0
submission.columns = submission.columns.str.replace('_mean', '', regex=False)

# The test pool does not depend on the target, so build it once and reuse it
# for every per-target model.
test_pool = Pool(test_features_mask_df, embedding_features=['emb'])
for col in CONFIG.target_columns:
    submission[col.replace('_mean', '')] = models[col].predict(test_pool)

# Timestamped file name so repeated runs never overwrite an earlier submission.
submission_file = os.path.join(base_path, f'submission_{int(time.time())}.csv')
submission.to_csv(submission_file, index=False)

print(f"\nSubmission saved to: {submission_file}")
print("\nSubmission head:")
print(submission.head().to_string())

print("\nSubmission column ranges:")
for col in submission.columns[1:]:  # Skip the 'id' column
    print(f"{col}: [{submission[col].min():.3f}, {submission[col].max():.3f}]")

print("Script completed successfully!")