In [2]:
!cp '/content/drive/MyDrive/Colab Notebooks/Kaggle/cs-480-2024-spring.zip' /content/

In [None]:
!unzip cs-480-2024-spring.zip

In [None]:
pip install timm

In [None]:
pip install catboost

In [6]:
import os
import timm
import torch
from torchvision import transforms
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load ancillary data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Ensure the 'id' column in train_df and test_df is of type string
train_df['id'] = train_df['id'].astype(str)
test_df['id'] = test_df['id'].astype(str)

# List all image paths based on the original data
train_images_dir = 'data/train_images'
test_images_dir = 'data/test_images'

train_image_paths = [os.path.join(train_images_dir, f"{img_id}.jpeg") for img_id in train_df['id'].values]
test_image_paths = [os.path.join(test_images_dir, f"{img_id}.jpeg") for img_id in test_df['id'].values]

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DINOv2 model using torch.hub and move it to the GPU if available
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14_reg').to(device)
model.eval()

# Define augmentation pipeline
augmentation_pipeline = transforms.Compose([
    transforms.Resize((224, 224)),  # Adjusted to match the input size for the model
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

def load_and_preprocess_images(image_paths, transform):
    images = []
    for img_path in image_paths:
        img = Image.open(img_path).convert('RGB')
        img = transform(img)
        images.append(img)
    return torch.stack(images).to(device)

def extract_vit_embeddings_batch(image_paths, model, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Extracting Image Embeddings in Batches"):
        batch_paths = image_paths[i:i+batch_size]
        batch_images = load_and_preprocess_images(batch_paths, augmentation_pipeline)
        with torch.no_grad():
            batch_embeddings = model(batch_images).cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

# Extract embeddings for all train and test images in batches
train_image_embeddings = extract_vit_embeddings_batch(train_image_paths, model, batch_size=32)
test_image_embeddings = extract_vit_embeddings_batch(test_image_paths, model, batch_size=32)

# Create DataFrame for image embeddings
train_image_feature_df = pd.DataFrame(train_image_embeddings, index=train_df['id'].values, columns=[f'img_feat_{i}' for i in range(train_image_embeddings.shape[1])])
test_image_feature_df = pd.DataFrame(test_image_embeddings, index=test_df['id'].values, columns=[f'img_feat_{i}' for i in range(test_image_embeddings.shape[1])])

# Merge image embeddings with ancillary data
train_full_feature_df = pd.merge(train_df, train_image_feature_df, left_on='id', right_index=True)
test_full_feature_df = pd.merge(test_df, test_image_feature_df, left_on='id', right_index=True)

# Select all columns for ancillary features (1 to 163) and image embeddings (from 170 onward)
X = train_full_feature_df.iloc[:, 1:164].join(train_full_feature_df.iloc[:, 170:]).values  # Features: ancillary + image embeddings
y = train_df.iloc[:, 164:170].values

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitg14_reg4_pretrain.pth
100%|██████████| 4.23G/4.23G [01:14<00:00, 61.2MB/s]
Extracting Image Embeddings in Batches: 100%|██████████| 1356/1356 [2:05:14<00:00,  5.54s/it]
Extracting Image Embeddings in Batches: 100%|██████████| 200/200 [18:29<00:00,  5.55s/it]


In [None]:
np.save('dino_train_embeddings.npy', train_image_embeddings)
np.save('dino_test_embeddings.npy', test_image_embeddings)

In [14]:
# Train XGBoost models
xgboost_models = [xgb.XGBRegressor(n_estimators=1500, learning_rate=0.1, max_depth=8, objective='reg:squarederror',  tree_method = "hist", device = "cuda") for _ in range(6)]
xgboost_predictions_val = []

for i in tqdm(range(6), desc="Training XGBoost Models"):
    xgboost_models[i].fit(X_train, y_train[:, i])
    xgboost_predictions_val.append(xgboost_models[i].predict(X_val))



Training XGBoost Models: 100%|██████████| 6/6 [23:57<00:00, 239.58s/it]


In [22]:

# Train LightGBM models with GPU support
lightgbm_models = [lgb.LGBMRegressor(n_estimators=1500, learning_rate=0.1) for _ in range(6)]
lightgbm_predictions_val = []

for i in tqdm(range(6), desc="Training LightGBM Models"):
    lightgbm_models[i].fit(X_train, y_train[:, i])
    lightgbm_predictions_val.append(lightgbm_models[i].predict(X_val))


Training LightGBM Models:   0%|          | 0/6 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.050799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 1.036130


Training LightGBM Models:  17%|█▋        | 1/6 [12:16<1:01:20, 736.13s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.888451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 148.331981


Training LightGBM Models:  33%|███▎      | 2/6 [24:17<48:30, 727.61s/it]  

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.924319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 19701.660189


Training LightGBM Models:  50%|█████     | 3/6 [36:01<35:49, 716.57s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.735263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 3482.081355


Training LightGBM Models:  67%|██████▋   | 4/6 [47:35<23:35, 707.81s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.674433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 15.111195


Training LightGBM Models:  83%|████████▎ | 5/6 [59:21<11:47, 707.28s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.749783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431034
[LightGBM] [Info] Number of data points in the train set: 34690, number of used features: 1699
[LightGBM] [Info] Start training from score 399128.072384


Training LightGBM Models: 100%|██████████| 6/6 [1:11:05<00:00, 710.92s/it]


In [18]:

# Train CatBoost models
catboost_models = [CatBoostRegressor(iterations=1500, depth=8, learning_rate=0.1, loss_function='RMSE', task_type="GPU", devices='0') for _ in range(6)]
catboost_predictions_val = []

for i in tqdm(range(6), desc="Training CatBoost Models"):
    catboost_models[i].fit(X_train, y_train[:, i], eval_set=(X_val, y_val[:, i]), verbose=100)
    catboost_predictions_val.append(catboost_models[i].predict(X_val))





0:	learn: 0.1339849	test: 0.1350534	best: 0.1350534 (0)	total: 622ms	remaining: 15m 32s
100:	learn: 0.1006300	test: 0.1082108	best: 0.1082108 (100)	total: 48.1s	remaining: 11m 5s
200:	learn: 0.0939557	test: 0.1059489	best: 0.1059489 (200)	total: 1m 32s	remaining: 9m 55s
300:	learn: 0.0891647	test: 0.1047983	best: 0.1047983 (300)	total: 2m 14s	remaining: 8m 57s
400:	learn: 0.0850538	test: 0.1041561	best: 0.1041561 (400)	total: 2m 58s	remaining: 8m 8s
500:	learn: 0.0815046	test: 0.1036296	best: 0.1036296 (500)	total: 3m 41s	remaining: 7m 21s
600:	learn: 0.0781212	test: 0.1030955	best: 0.1030908 (598)	total: 4m 24s	remaining: 6m 36s
700:	learn: 0.0752208	test: 0.1026979	best: 0.1026979 (700)	total: 5m 7s	remaining: 5m 50s
800:	learn: 0.0724929	test: 0.1024238	best: 0.1024213 (799)	total: 5m 50s	remaining: 5m 5s
900:	learn: 0.0699885	test: 0.1022207	best: 0.1022207 (900)	total: 6m 33s	remaining: 4m 21s
1000:	learn: 0.0678439	test: 0.1020323	best: 0.1020297 (997)	total: 7m 15s	remaining: 3m



0:	learn: 6.7774734	test: 6.8095600	best: 6.8095600 (0)	total: 633ms	remaining: 15m 48s
100:	learn: 5.1733128	test: 5.5136138	best: 5.5136138 (100)	total: 48.1s	remaining: 11m 6s
200:	learn: 4.8409569	test: 5.4121626	best: 5.4121626 (200)	total: 1m 31s	remaining: 9m 50s
300:	learn: 4.5901328	test: 5.3687613	best: 5.3687502 (299)	total: 2m 14s	remaining: 8m 55s
400:	learn: 4.3816990	test: 5.3406836	best: 5.3406836 (400)	total: 2m 57s	remaining: 8m 5s
500:	learn: 4.2025242	test: 5.3160710	best: 5.3160710 (500)	total: 3m 39s	remaining: 7m 17s
600:	learn: 4.0421294	test: 5.3017980	best: 5.3016658 (597)	total: 4m 21s	remaining: 6m 31s
700:	learn: 3.8890773	test: 5.2883907	best: 5.2883907 (700)	total: 5m 4s	remaining: 5m 46s
800:	learn: 3.7546236	test: 5.2775280	best: 5.2775280 (800)	total: 5m 46s	remaining: 5m 2s
900:	learn: 3.6357616	test: 5.2693526	best: 5.2693526 (900)	total: 6m 27s	remaining: 4m 17s
1000:	learn: 3.5138333	test: 5.2614898	best: 5.2614898 (1000)	total: 7m 10s	remaining: 3



0:	learn: 4.1472702	test: 4.0812729	best: 4.0812729 (0)	total: 579ms	remaining: 14m 28s
100:	learn: 2.6034550	test: 2.8147015	best: 2.8147015 (100)	total: 47.6s	remaining: 10m 59s
200:	learn: 2.3788192	test: 2.7791819	best: 2.7790733 (199)	total: 1m 30s	remaining: 9m 44s
300:	learn: 2.2057596	test: 2.7622349	best: 2.7622349 (300)	total: 2m 12s	remaining: 8m 49s
400:	learn: 2.0628063	test: 2.7453909	best: 2.7453270 (399)	total: 2m 55s	remaining: 8m 2s
500:	learn: 1.9426868	test: 2.7355841	best: 2.7355841 (500)	total: 3m 38s	remaining: 7m 16s
600:	learn: 1.8339024	test: 2.7287259	best: 2.7287259 (600)	total: 4m 21s	remaining: 6m 30s
700:	learn: 1.7315664	test: 2.7196225	best: 2.7196225 (700)	total: 5m 3s	remaining: 5m 46s
800:	learn: 1.6484695	test: 2.7147135	best: 2.7146175 (798)	total: 5m 46s	remaining: 5m 2s
900:	learn: 1.5719502	test: 2.7108324	best: 2.7108006 (890)	total: 6m 28s	remaining: 4m 18s
1000:	learn: 1.4991558	test: 2.7080356	best: 2.7079604 (999)	total: 7m 11s	remaining: 3



0:	learn: 66.0860625	test: 65.1477491	best: 65.1477491 (0)	total: 618ms	remaining: 15m 26s
100:	learn: 50.5178944	test: 57.3415171	best: 57.3415171 (100)	total: 46s	remaining: 10m 36s
200:	learn: 45.2422377	test: 57.1039638	best: 57.1039638 (200)	total: 1m 28s	remaining: 9m 30s
300:	learn: 41.2014645	test: 56.9366275	best: 56.9118191 (295)	total: 2m 10s	remaining: 8m 39s
400:	learn: 37.9383427	test: 56.8696921	best: 56.8591565 (398)	total: 2m 51s	remaining: 7m 50s
500:	learn: 35.2936436	test: 56.8104233	best: 56.7726019 (488)	total: 3m 32s	remaining: 7m 3s
600:	learn: 33.2316468	test: 56.7977371	best: 56.7726019 (488)	total: 4m 12s	remaining: 6m 17s
700:	learn: 31.3292339	test: 56.7491217	best: 56.7253472 (656)	total: 4m 53s	remaining: 5m 34s
800:	learn: 29.7135202	test: 56.7730122	best: 56.7253472 (656)	total: 5m 33s	remaining: 4m 51s
900:	learn: 28.3472883	test: 56.7187246	best: 56.7155634 (874)	total: 6m 13s	remaining: 4m 8s
1000:	learn: 27.0213052	test: 56.7146649	best: 56.7025226 



0:	learn: 0.5830389	test: 0.5959107	best: 0.5959107 (0)	total: 590ms	remaining: 14m 44s
100:	learn: 0.4706864	test: 0.5131795	best: 0.5131795 (100)	total: 47.9s	remaining: 11m 3s
200:	learn: 0.4405814	test: 0.5057743	best: 0.5057726 (199)	total: 1m 31s	remaining: 9m 53s
300:	learn: 0.4179110	test: 0.5013489	best: 0.5013489 (300)	total: 2m 15s	remaining: 8m 58s
400:	learn: 0.3995557	test: 0.4983643	best: 0.4983398 (398)	total: 2m 57s	remaining: 8m 6s
500:	learn: 0.3825981	test: 0.4959363	best: 0.4959363 (500)	total: 3m 39s	remaining: 7m 18s
600:	learn: 0.3673403	test: 0.4942846	best: 0.4942846 (600)	total: 4m 22s	remaining: 6m 32s
700:	learn: 0.3526498	test: 0.4926736	best: 0.4926574 (696)	total: 5m 5s	remaining: 5m 48s
800:	learn: 0.3400903	test: 0.4914163	best: 0.4914163 (800)	total: 5m 47s	remaining: 5m 3s
900:	learn: 0.3278293	test: 0.4903967	best: 0.4903922 (899)	total: 6m 30s	remaining: 4m 19s
1000:	learn: 0.3170176	test: 0.4894530	best: 0.4894530 (1000)	total: 7m 12s	remaining: 3



0:	learn: 2213.2243723	test: 2135.4450335	best: 2135.4450335 (0)	total: 618ms	remaining: 15m 27s
100:	learn: 1616.6180507	test: 1713.6141079	best: 1713.6141079 (100)	total: 46.4s	remaining: 10m 42s
200:	learn: 1483.8709870	test: 1696.5594408	best: 1696.5352225 (198)	total: 1m 28s	remaining: 9m 32s
300:	learn: 1379.7298407	test: 1687.1669368	best: 1687.1669368 (300)	total: 2m 10s	remaining: 8m 39s
400:	learn: 1300.5809462	test: 1682.4933023	best: 1681.9631933 (397)	total: 2m 51s	remaining: 7m 50s
500:	learn: 1231.5904992	test: 1677.5886281	best: 1677.5886281 (500)	total: 3m 33s	remaining: 7m 4s
600:	learn: 1173.4193478	test: 1675.4056597	best: 1674.9213832 (593)	total: 4m 14s	remaining: 6m 20s
700:	learn: 1128.0237988	test: 1672.7107236	best: 1672.7107236 (700)	total: 4m 54s	remaining: 5m 35s
800:	learn: 1084.3512112	test: 1671.1206151	best: 1671.1206151 (800)	total: 5m 34s	remaining: 4m 52s
900:	learn: 1045.9263922	test: 1669.2194967	best: 1669.2194967 (900)	total: 6m 15s	remaining: 4m

Training CatBoost Models: 100%|██████████| 6/6 [1:06:52<00:00, 668.71s/it]


In [37]:
from itertools import combinations
from sklearn.metrics import r2_score
import numpy as np

# Generate a list of all models' validation predictions
all_predictions_val = [np.array(catboost_predictions_val).T,
                       np.array(xgboost_predictions_val).T,
                       np.array(lightgbm_predictions_val).T]

# Create a list of model names for reference
model_names = ['CatBoost', 'XGBoost', 'LightGBM']

# Initialize a dictionary to store R2 scores for each combination
r2_scores = {}

# Iterate over all possible non-empty combinations of the 3 models
for r in range(1, 4):
    for combo in combinations(range(3), r):
        # Select the corresponding predictions
        selected_predictions = [all_predictions_val[i] for i in combo]

        # Combine selected predictions into meta-features
        val_meta_features = np.column_stack(selected_predictions)

        # Train the meta-model (e.g., Linear Regression)
        meta_model = LinearRegression()
        meta_model.fit(val_meta_features, y_val)

        # Predict using the meta-model on the validation set
        val_meta_pred = meta_model.predict(val_meta_features)

        # Evaluate the performance of the meta-model
        meta_r2_score = r2_score(y_val, val_meta_pred)

        # Store the R2 score with the corresponding model names
        combo_names = [model_names[i] for i in combo]
        r2_scores[', '.join(combo_names)] = meta_r2_score

# Display the R2 scores for all combinations
for combo, score in r2_scores.items():
    print(f"Combination: {combo} | R2 Score: {score:.4f}")


Combination: CatBoost | R2 Score: 0.4237
Combination: XGBoost | R2 Score: 0.4062
Combination: LightGBM | R2 Score: 0.4238
Combination: CatBoost, XGBoost | R2 Score: 0.4378
Combination: CatBoost, LightGBM | R2 Score: 0.4395
Combination: XGBoost, LightGBM | R2 Score: 0.4369
Combination: CatBoost, XGBoost, LightGBM | R2 Score: 0.4448


In [38]:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Combine predictions for validation set using all three models
val_meta_features = np.column_stack((
    np.array(catboost_predictions_val).T,
    np.array(xgboost_predictions_val).T,
    np.array(lightgbm_predictions_val).T
))

# Train the meta-model (e.g., Linear Regression)
meta_model = LinearRegression()
meta_model.fit(val_meta_features, y_val)

# Predict using the meta-model on the validation set
val_meta_pred = meta_model.predict(val_meta_features)

# Evaluate the performance of the meta-model
meta_r2_score = r2_score(y_val, val_meta_pred)
print(f"Meta-model R2 Score (using all 3 models): {meta_r2_score:.4f}")


Meta-model R2 Score (using all 3 models): 0.4448


In [41]:
# Step 1: Generate predictions for the test set using each model
test_predictions_catboost = []
test_predictions_xgboost = []
test_predictions_lightgbm = []

X_test = test_full_feature_df.iloc[:, 1:]

for i in range(6):
    test_predictions_catboost.append(catboost_models[i].predict(X_test.values))
    test_predictions_xgboost.append(xgboost_models[i].predict(X_test.values))
    test_predictions_lightgbm.append(lightgbm_models[i].predict(X_test.values))

# Step 2: Combine the test predictions to create meta-features
test_meta_features = np.column_stack((
    np.array(test_predictions_catboost).T,
    np.array(test_predictions_xgboost).T,
    np.array(test_predictions_lightgbm).T
))

# Step 3: Generate final predictions using the meta-model
test_meta_pred = meta_model.predict(test_meta_features)

# Step 4: Prepare the submission file
submission_df = pd.DataFrame(test_meta_pred, columns=['X4', 'X11', 'X18', 'X26', 'X50', 'X3112'])
submission_df.insert(0, 'id', test_df['id'])
submission_df.to_csv('submission.csv', index=False)
print("Submission file generated successfully.")



Submission file generated successfully.
