In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torchvision import models, transforms

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Load tabular data
train = pd.read_excel("data/train.xlsx")
test  = pd.read_excel("data/test.xlsx")

# Make sure id exists
if "id" not in train.columns:
    train["id"] = train.index.astype(str)
if "id" not in test.columns:
    test["id"] = test.index.astype(str)

# Image folders
train_img_dir = "images/train"
test_img_dir  = "images/test"

# Image transforms (match ResNet expectations)
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std =[0.229, 0.224, 0.225]),
])

# Load pretrained ResNet18 and remove last classification layer
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = nn.Identity()   # output becomes 512-d embedding
resnet = resnet.to(device)
resnet.eval()

@torch.no_grad()
def get_embedding(img_path):
    img = Image.open(img_path).convert("RGB")
    x = transform(img).unsqueeze(0).to(device)  # (1,3,224,224)
    emb = resnet(x).squeeze(0).cpu().numpy()    # (512,)
    return emb

def build_embeddings(df, img_dir):
    embs = []
    kept_ids = []
    for _id in tqdm(df["id"].astype(str).tolist()):
        path = os.path.join(img_dir, f"{_id}.png")
        if not os.path.exists(path):
            continue
        embs.append(get_embedding(path))
        kept_ids.append(_id)
    return np.vstack(embs), kept_ids

# Build embeddings
train_emb, train_ids = build_embeddings(train, train_img_dir)
test_emb,  test_ids  = build_embeddings(test,  test_img_dir)

print("Train embeddings:", train_emb.shape)
print("Test embeddings:", test_emb.shape)

# Save for reuse
np.save("train_img_emb.npy", train_emb)
np.save("test_img_emb.npy", test_emb)
pd.Series(train_ids).to_csv("train_ids_used.csv", index=False)
pd.Series(test_ids).to_csv("test_ids_used.csv", index=False)

print("Saved embeddings âœ…")


In [1]:
print("kernel alive")


kernel alive


In [2]:
# ===================== IMPORTS =====================
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torchvision import models, transforms

# ===================== DEVICE =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ===================== LOAD DATA =====================
train = pd.read_excel("data/train.xlsx")
test  = pd.read_excel("data/test.xlsx")

if "id" not in train.columns:
    train["id"] = train.index.astype(str)
if "id" not in test.columns:
    test["id"] = test.index.astype(str)

train_img_dir = "images/train"
test_img_dir  = "images/test"

# ===================== TRANSFORMS =====================
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    ),
])

# ===================== MODEL =====================
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = nn.Identity()   # remove classifier â†’ embeddings
resnet = resnet.to(device)
resnet.eval()

# ===================== EMBEDDING FUNCTIONS =====================
@torch.no_grad()
def get_embedding(img_path):
    img = Image.open(img_path).convert("RGB")
    x = transform(img).unsqueeze(0).to(device)
    emb = resnet(x).squeeze(0).cpu().numpy()
    return emb

def build_embeddings(df, img_dir):
    embs = []
    ids  = []
    for _id in tqdm(df["id"].astype(str)):
        path = os.path.join(img_dir, f"{_id}.png")
        if not os.path.exists(path):
            continue
        embs.append(get_embedding(path))
        ids.append(_id)
    return np.vstack(embs), ids

# ===================== RUN =====================
train_emb, train_ids = build_embeddings(train, train_img_dir)
test_emb,  test_ids  = build_embeddings(test,  test_img_dir)

print("Train embeddings shape:", train_emb.shape)
print("Test embeddings shape:", test_emb.shape)

# ===================== SAVE =====================
np.save("train_img_emb.npy", train_emb)
np.save("test_img_emb.npy", test_emb)

pd.Series(train_ids).to_csv("train_ids_used.csv", index=False)
pd.Series(test_ids).to_csv("test_ids_used.csv", index=False)

print("Saved embeddings âœ…")


Device: cpu
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\asus/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 44.7M/44.7M [00:13<00:00, 3.36MB/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16209/16209 [02:47<00:00, 96.89it/s]  
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5404/5404 [00:54<00:00, 99.34it/s]  

Train embeddings shape: (3040, 512)
Test embeddings shape: (1001, 512)
Saved embeddings âœ…





In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Load original data
train_df = pd.read_excel("data/train.xlsx")
test_df  = pd.read_excel("data/test.xlsx")

if "id" not in train_df.columns:
    train_df["id"] = train_df.index.astype(str)
if "id" not in test_df.columns:
    test_df["id"] = test_df.index.astype(str)

# Load embeddings
train_emb = np.load("train_img_emb.npy")
test_emb  = np.load("test_img_emb.npy")

train_ids = pd.read_csv("train_ids_used.csv").iloc[:,0].astype(str).tolist()
test_ids  = pd.read_csv("test_ids_used.csv").iloc[:,0].astype(str).tolist()

# Keep only rows that have embeddings
train_small = train_df[train_df["id"].astype(str).isin(train_ids)].copy()
test_small  = test_df[test_df["id"].astype(str).isin(test_ids)].copy()

# Sort to match embedding order
train_small = train_small.set_index("id").loc[train_ids].reset_index()
test_small  = test_small.set_index("id").loc[test_ids].reset_index()

print("Tabular train rows:", train_small.shape, "Embeddings:", train_emb.shape)
print("Tabular test rows :", test_small.shape,  "Embeddings:", test_emb.shape)

# Target
y = train_small["price"]
X_tab = train_small.drop(columns=["price"])

# We'll add embeddings later after preprocessing


KeyError: "None of [Index(['9117000170', '6700390210', '7212660540', '8562780200', '7760400350',\n       '464001025', '3432500486', '1126059095', '3876500290', '1865400075',\n       ...\n       '9407110710', '3523069060', '1788800630', '526059224', '2023049218',\n       '4302201085', '3293700496', '6623400187', '5132000140', '1954420170'],\n      dtype='object', name='id', length=3040)] are in the [index]"

In [4]:
train_df = pd.read_excel("data/train.xlsx")
print("train columns:", train_df.columns.tolist())
print("train id dtype:", train_df["id"].dtype if "id" in train_df.columns else "NO id column")

train_ids = pd.read_csv("train_ids_used.csv").iloc[:,0].astype(str)
print("sample train_ids_used:", train_ids.head().tolist())

if "id" in train_df.columns:
    print("sample train_df id:", train_df["id"].head().astype(str).tolist())


train columns: ['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
train id dtype: int64
sample train_ids_used: ['9117000170', '6700390210', '7212660540', '8562780200', '7760400350']
sample train_df id: ['9117000170', '6700390210', '7212660540', '8562780200', '7760400350']


In [5]:
import numpy as np
import pandas as pd

train_df = pd.read_excel("data/train.xlsx")
test_df  = pd.read_excel("data/test.xlsx")

# Make ids string everywhere
train_df["id"] = train_df["id"].astype(str)
test_df["id"]  = test_df["id"].astype(str)

train_emb = np.load("train_img_emb.npy")
test_emb  = np.load("test_img_emb.npy")

train_ids = pd.read_csv("train_ids_used.csv").iloc[:,0].astype(str)
test_ids  = pd.read_csv("test_ids_used.csv").iloc[:,0].astype(str)

# Check duplicates
print("Duplicate train ids:", train_df["id"].duplicated().sum())
print("Duplicate test ids :", test_df["id"].duplicated().sum())

# âœ… Build mapping id -> row index (first occurrence)
train_map = train_df.drop_duplicates("id").set_index("id")
test_map  = test_df.drop_duplicates("id").set_index("id")

# âœ… Keep only ids that exist in tabular
train_ids_ok = [i for i in train_ids if i in train_map.index]
test_ids_ok  = [i for i in test_ids if i in test_map.index]

print("ids with embeddings:", len(train_ids), len(test_ids))
print("ids found in excel :", len(train_ids_ok), len(test_ids_ok))

# âœ… Align in same order as embeddings (but only ids that exist)
train_small = train_map.loc[train_ids_ok].reset_index()
test_small  = test_map.loc[test_ids_ok].reset_index()

# âœ… Trim embeddings to same length/order
# We must trim embeddings using the same mask
train_mask = [i in set(train_ids_ok) for i in train_ids]
test_mask  = [i in set(test_ids_ok) for i in test_ids]

train_emb_ok = train_emb[train_mask]
test_emb_ok  = test_emb[test_mask]

print("Final aligned shapes:")
print("train_small:", train_small.shape, "train_emb_ok:", train_emb_ok.shape)
print("test_small :", test_small.shape,  "test_emb_ok :", test_emb_ok.shape)


Duplicate train ids: 99
Duplicate test ids : 8
ids with embeddings: 3040 1001
ids found in excel : 3040 1001
Final aligned shapes:
train_small: (3040, 21) train_emb_ok: (3040, 512)
test_small : (1001, 20) test_emb_ok : (1001, 512)


In [6]:
import numpy as np
import pandas as pd

train_df = pd.read_excel("data/train.xlsx")
test_df  = pd.read_excel("data/test.xlsx")

# Make ids string everywhere
train_df["id"] = train_df["id"].astype(str)
test_df["id"]  = test_df["id"].astype(str)

train_emb = np.load("train_img_emb.npy")
test_emb  = np.load("test_img_emb.npy")

train_ids = pd.read_csv("train_ids_used.csv").iloc[:,0].astype(str)
test_ids  = pd.read_csv("test_ids_used.csv").iloc[:,0].astype(str)

# Check duplicates
print("Duplicate train ids:", train_df["id"].duplicated().sum())
print("Duplicate test ids :", test_df["id"].duplicated().sum())

# âœ… Build mapping id -> row index (first occurrence)
train_map = train_df.drop_duplicates("id").set_index("id")
test_map  = test_df.drop_duplicates("id").set_index("id")

# âœ… Keep only ids that exist in tabular
train_ids_ok = [i for i in train_ids if i in train_map.index]
test_ids_ok  = [i for i in test_ids if i in test_map.index]

print("ids with embeddings:", len(train_ids), len(test_ids))
print("ids found in excel :", len(train_ids_ok), len(test_ids_ok))

# âœ… Align in same order as embeddings (but only ids that exist)
train_small = train_map.loc[train_ids_ok].reset_index()
test_small  = test_map.loc[test_ids_ok].reset_index()

# âœ… Trim embeddings to same length/order
# We must trim embeddings using the same mask
train_mask = [i in set(train_ids_ok) for i in train_ids]
test_mask  = [i in set(test_ids_ok) for i in test_ids]

train_emb_ok = train_emb[train_mask]
test_emb_ok  = test_emb[test_mask]

print("Final aligned shapes:")
print("train_small:", train_small.shape, "train_emb_ok:", train_emb_ok.shape)
print("test_small :", test_small.shape,  "test_emb_ok :", test_emb_ok.shape)


Duplicate train ids: 99
Duplicate test ids : 8
ids with embeddings: 3040 1001
ids found in excel : 3040 1001
Final aligned shapes:
train_small: (3040, 21) train_emb_ok: (3040, 512)
test_small : (1001, 20) test_emb_ok : (1001, 512)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

y = train_small["price"]
X_tab = train_small.drop(columns=["price"])

cat_cols = X_tab.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X_tab.columns if c not in cat_cols]

numeric_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])

X_tab_processed = preprocess.fit_transform(X_tab)
X_tab_dense = X_tab_processed.toarray() if hasattr(X_tab_processed, "toarray") else X_tab_processed

X = np.hstack([X_tab_dense, train_emb_ok])
print("Multimodal X shape:", X.shape)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

pred = model.predict(X_val)
rmse = mean_squared_error(y_val, pred) ** 0.5
r2 = r2_score(y_val, pred)

print("Multimodal RMSE:", rmse)
print("Multimodal R2:", r2)


Multimodal X shape: (3040, 3839)
Multimodal RMSE: 232377.74057089243
Multimodal R2: 0.682903635612464


In [8]:
print("Multimodal RandomForest RMSE:", rf_rmse)
print("Multimodal RandomForest R2:", rf_r2)


NameError: name 'rf_rmse' is not defined

In [9]:
print("Multimodal RMSE:", rmse)
print("Multimodal R2:", r2)


Multimodal RMSE: 232377.74057089243
Multimodal R2: 0.682903635612464


In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

xgb_model = XGBRegressor(
    n_estimators=800,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_val)

xgb_rmse = mean_squared_error(y_val, xgb_pred) ** 0.5
xgb_r2 = r2_score(y_val, xgb_pred)

print("ðŸ”¥ XGBoost Multimodal RMSE:", xgb_rmse)
print("ðŸ”¥ XGBoost Multimodal R2:", xgb_r2)


ðŸ”¥ XGBoost Multimodal RMSE: 237397.52504185884
ðŸ”¥ XGBoost Multimodal R2: 0.6690559387207031


In [11]:
print("ðŸ“Š MODEL COMPARISON")
print("-" * 40)
print(f"RandomForest RMSE: {rf_rmse:.2f}")
print(f"RandomForest R2  : {rf_r2:.4f}")
print()
print(f"XGBoost RMSE     : {xgb_rmse:.2f}")
print(f"XGBoost R2       : {xgb_r2:.4f}")


ðŸ“Š MODEL COMPARISON
----------------------------------------


NameError: name 'rf_rmse' is not defined

In [12]:
rmse = mean_squared_error(y_val, pred) ** 0.5
r2 = r2_score(y_val, pred)

print("Multimodal RMSE:", rmse)
print("Multimodal R2:", r2)


Multimodal RMSE: 232377.74057089243
Multimodal R2: 0.682903635612464


In [13]:
# Save RF metrics with proper names
rf_rmse = rmse
rf_r2 = r2


In [14]:
print("ðŸ“Š MODEL COMPARISON")
print("-" * 40)

print(f"RandomForest RMSE: {rf_rmse:.2f}")
print(f"RandomForest R2  : {rf_r2:.4f}")
print()

print(f"XGBoost RMSE     : {xgb_rmse:.2f}")
print(f"XGBoost R2       : {xgb_r2:.4f}")


ðŸ“Š MODEL COMPARISON
----------------------------------------
RandomForest RMSE: 232377.74
RandomForest R2  : 0.6829

XGBoost RMSE     : 237397.53
XGBoost R2       : 0.6691


In [15]:
## PCA on Image Embeddings (Dimensionality Reduction)


In [16]:
from sklearn.decomposition import PCA

# Reduce CNN embeddings from 512 â†’ 64
pca = PCA(n_components=64, random_state=42)

train_img_pca = pca.fit_transform(train_emb_ok)
test_img_pca  = pca.transform(test_emb_ok)

print("PCA explained variance:", pca.explained_variance_ratio_.sum())
print("Train PCA shape:", train_img_pca.shape)
print("Test PCA shape :", test_img_pca.shape)


PCA explained variance: 0.75626576
Train PCA shape: (3040, 64)
Test PCA shape : (1001, 64)


In [17]:
import numpy as np

# Combine tabular + PCA image features
X_train_mm_pca = np.hstack([X_tab_train.values, train_img_pca])
X_test_mm_pca  = np.hstack([X_tab_test.values, test_img_pca])

print("Final multimodal train shape:", X_train_mm_pca.shape)
print("Final multimodal test shape :", X_test_mm_pca.shape)


NameError: name 'X_tab_train' is not defined

In [18]:
# Tabular features (drop target)
X_tab_train = train_small.drop("price", axis=1)
X_tab_test  = test_small.drop("price", axis=1)

print("Tabular train shape:", X_tab_train.shape)
print("Tabular test shape :", X_tab_test.shape)


KeyError: "['price'] not found in axis"

In [19]:
# Train tabular features + target
X_tab_train = train_small.drop("price", axis=1)
y_train_tab = train_small["price"]

# Test tabular features (NO price column)
X_tab_test = test_small.copy()

print("Tabular train shape:", X_tab_train.shape)
print("Tabular test shape :", X_tab_test.shape)


Tabular train shape: (3040, 20)
Tabular test shape : (1001, 20)


In [20]:
import numpy as np

X_train_mm_pca = np.hstack([X_tab_train.values, train_img_pca])
X_test_mm_pca  = np.hstack([X_tab_test.values, test_img_pca])

print("Final multimodal train shape:", X_train_mm_pca.shape)
print("Final multimodal test shape :", X_test_mm_pca.shape)


Final multimodal train shape: (3040, 84)
Final multimodal test shape : (1001, 84)


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_mm_pca, y_train_tab, test_size=0.2, random_state=42
)

rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)

rmse = mean_squared_error(y_val, pred) ** 0.5
r2 = r2_score(y_val, pred)

print("ðŸ”¥ PCA Multimodal RF RMSE:", rmse)
print("ðŸ”¥ PCA Multimodal RF R2 :", r2)


ValueError: could not convert string to float: '20141009T000000'

In [22]:
print(X_train_mm_pca.shape)
print(len(y_train_tab))
print(type(X_train_mm_pca), type(y_train_tab))
print(np.isnan(X_train_mm_pca).sum())
print(y_train_tab.isna().sum() if hasattr(y_train_tab, "isna") else np.isnan(y_train_tab).sum())


(3040, 84)
3040
<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [23]:
import numpy as np

# y (target)
y_train_tab = train_small["price"].astype(float).values

# Tabular features: drop target, then keep only numeric columns
X_tab_train = train_small.drop(columns=["price"], errors="ignore")
X_tab_test  = test_small.copy()

X_tab_train_num = X_tab_train.select_dtypes(include=["number"]).fillna(0)
X_tab_test_num  = X_tab_test.select_dtypes(include=["number"]).fillna(0)

print("Tab train numeric:", X_tab_train_num.shape)
print("Tab test numeric :", X_tab_test_num.shape)


Tab train numeric: (3040, 18)
Tab test numeric : (1001, 18)


In [24]:
X_train_mm_pca = np.hstack([X_tab_train_num.values, train_img_pca])
X_test_mm_pca  = np.hstack([X_tab_test_num.values,  test_img_pca])

print("Final X train:", X_train_mm_pca.shape)
print("dtype:", X_train_mm_pca.dtype)   # should NOT be object


Final X train: (3040, 82)
dtype: float64


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_mm_pca, y_train_tab, test_size=0.2, random_state=42
)

rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_tr, y_tr)

pred = rf.predict(X_val)
rmse = mean_squared_error(y_val, pred) ** 0.5
r2   = r2_score(y_val, pred)

print("ðŸ”¥ PCA Multimodal RF RMSE:", rmse)
print("ðŸ”¥ PCA Multimodal RF R2  :", r2)


ðŸ”¥ PCA Multimodal RF RMSE: 199669.9256974981
ðŸ”¥ PCA Multimodal RF R2  : 0.7658859241493154


In [26]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

final_rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

final_rf.fit(X_train_mm_pca, y_train_tab)

test_pred = final_rf.predict(X_test_mm_pca)

print("Pred done âœ…", test_pred.shape, "min/max:", test_pred.min(), test_pred.max())


Pred done âœ… (1001,) min/max: 172997.115 2204384.7666666666


In [27]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

final_rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

final_rf.fit(X_train_mm_pca, y_train_tab)

test_pred = final_rf.predict(X_test_mm_pca)

print("Pred done âœ…", test_pred.shape, "min/max:", test_pred.min(), test_pred.max())


Pred done âœ… (1001,) min/max: 172997.115 2204384.7666666666


In [28]:
submission = pd.DataFrame({
    "id": test_small["id"].astype(str).values,
    "price": test_pred
})

submission.to_csv("submission_rf_pca.csv", index=False)
submission.head()


Unnamed: 0,id,price
0,2591820310,375878.4
1,7974200820,810414.9
2,7701450110,1140396.0
3,9522300010,1697151.0
4,9510861140,716120.4


In [29]:
import joblib
joblib.dump(final_rf, "final_rf_pca.joblib")
print("Saved model âœ…")


Saved model âœ…


In [30]:
submission = pd.DataFrame({
    "id": test_small["id"].astype(str).values,
    "price": test_pred
})
submission.to_csv("submission_rf_pca.csv", index=False)
print("Saved âœ… submission_rf_pca.csv")
submission.head()


Saved âœ… submission_rf_pca.csv


Unnamed: 0,id,price
0,2591820310,375878.4
1,7974200820,810414.9
2,7701450110,1140396.0
3,9522300010,1697151.0
4,9510861140,716120.4


In [31]:
import os
print("CSV exists:", os.path.exists("submission_rf_pca.csv"))
print("Model exists:", os.path.exists("final_rf_pca.joblib"))


CSV exists: True
Model exists: True


In [32]:
import pandas as pd
pd.read_csv("submission_rf_pca.csv").head()


Unnamed: 0,id,price
0,2591820310,375878.4
1,7974200820,810414.9
2,7701450110,1140396.0
3,9522300010,1697151.0
4,9510861140,716120.4
