In [35]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

In [36]:
# Load the cleaned train and test tables.
train_df = pd.read_csv("../Data/train_clean.csv")
test_df  = pd.read_csv("../Data/test_clean.csv")

# Regression target.
y = train_df["price"]

# "price" is the target and "id" is used elsewhere in this notebook as a row
# key, so neither is kept as a tabular feature. errors="ignore" means a
# missing column in this list is skipped rather than raising.
Drop = ["id", "price"]
X_tab = train_df.drop(columns=Drop, errors="ignore")

# The preview must be the last expression of the cell: a bare expression in
# the middle of a cell is evaluated but never displayed.
train_df.head()

In [None]:
# ************** Tabular-only baseline model **********************

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tab,
    y,
    random_state=42,
    test_size=0.2,
)

# Hyperparameters for the tabular-only gradient-boosted regressor.
tab_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**tab_params)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows: RMSE (same units as price) and R^2.
y_pred = xgb_model.predict(X_test)
mse_tab = mean_squared_error(y_test, y_pred)
rmse_tab = np.sqrt(mse_tab)
r2_tab = r2_score(y_test, y_pred)

rmse_tab, r2_tab

(np.float64(106209.96201863552), 0.9077191352844238)

In [38]:
# ******************** Feature Extraction part (image embeddings)*****************

# device = torch.device("cpu")

# resnet = models.resnet18(weights="IMAGENET1K_V1")
# resnet.fc = nn.Identity()
# resnet = resnet.to(device)
# resnet.eval()

In [39]:
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(
#         mean=[0.485, 0.456, 0.406],
#         std=[0.229, 0.224, 0.225]
#     )
# ])

In [40]:
# from PIL import Image

# class PropertyImageDataset(Dataset):
#     def __init__(self, df, img_dir, transform):
#         self.df = df.reset_index(drop=True)
#         self.img_dir = img_dir
#         self.transform = transform

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         img_id = self.df.loc[idx, "id"]
#         img_path = os.path.join(self.img_dir, f"{img_id}.png")

#         image = Image.open(img_path).convert("RGB")
#         image = self.transform(image)

#         return image


In [41]:
# def extract_embeddings(df, img_dir, resnet, transform, batch_size=32):
#     dataset = PropertyImageDataset(df, img_dir, transform)
#     loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

#     embeddings = []

#     resnet.eval()
#     with torch.no_grad():
#         for imgs in loader:
#             imgs = imgs.to(device)
#             feats = resnet(imgs)
#             embeddings.append(feats.cpu().numpy())

#     return np.vstack(embeddings)

In [42]:
# IMG_DIR_TRAIN = "../Data/images/train"

# train_df = train_df.reset_index(drop=True)

# train_img_features = extract_embeddings(
#     df=train_df,
#     img_dir=IMG_DIR_TRAIN,
#     resnet=resnet,
#     transform=transform
# )

# np.save("../Data/image_embeddings_train.npy", train_img_features)

In [43]:
# IMG_DIR_TEST = "../Data/images/test"

# test_df = test_df.reset_index(drop=True)

# test_img_embeddings = extract_embeddings(
#     df=test_df,
#     img_dir=IMG_DIR_TEST,
#     resnet=resnet,
#     transform=transform
# )

# np.save("../Data/image_embeddings_test.npy", test_img_embeddings)

In [None]:
# *************** Tabular + Satellite Image Model part **********************

# Pre-computed image embeddings, one row per training example.
# NOTE(review): presumably saved in train_df row order (see the commented-out
# extraction cells above) -- confirm before trusting the row-wise concat.
img_features = np.load("../Data/image_embeddings_train.npy")

# np.asarray converts a DataFrame/Series to an ndarray and is a no-op on an
# ndarray, so this cell can be re-run safely. Calling `.to_numpy()` on the
# rebound names raised AttributeError on a second execution.
X_tab = np.asarray(X_tab)
y = np.asarray(y)

# Row-wise concatenation only makes sense if both blocks describe the same rows.
assert img_features.shape[0] == X_tab.shape[0], (
    f"row mismatch: {X_tab.shape[0]} tabular rows vs "
    f"{img_features.shape[0]} image embeddings"
)

# Fused feature matrix: tabular columns first, then the embedding dimensions.
X_multi = np.hstack([X_tab, img_features])

In [45]:
# 80/20 split of the fused feature matrix. It uses the same test_size and seed
# as the tabular-only split earlier in the notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X_multi,
    y,
    random_state=42,
    test_size=0.2,
)

In [46]:
# Same boosting hyperparameters as the tabular-only model, plus n_jobs=-1.
multi_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_multi = xgb.XGBRegressor(**multi_params)

# Kept as the last expression so the fitted estimator is displayed.
xgb_multi.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [47]:
# Score the multimodal model on the held-out validation rows.
y_pred = xgb_multi.predict(X_val)

mse_multi = mean_squared_error(y_val, y_pred)
rmse_multi = np.sqrt(mse_multi)
r2_multi = r2_score(y_val, y_pred)

rmse_multi, r2_multi

(np.float64(114618.87202376405), 0.8925284743309021)

In [48]:
# *********** Prediction for test data (multimodal prediction) *********************

# Pre-computed image embeddings for the cleaned test rows.
test_img_embeddings = np.load("../Data/image_embeddings_test.npy")

# xgb_multi was fit on a bare ndarray, so features are matched purely by
# position. Select the test columns in exactly the training feature order;
# a column missing from the test table raises KeyError here instead of
# silently shifting features.
train_feature_cols = train_df.drop(columns=Drop, errors="ignore").columns
X_test_tab = test_df.drop(columns=["id"], errors="ignore")[train_feature_cols]

# Both blocks must describe the same rows before they are concatenated.
assert len(X_test_tab) == test_img_embeddings.shape[0], (
    f"row mismatch: {len(X_test_tab)} tabular rows vs "
    f"{test_img_embeddings.shape[0]} image embeddings"
)

X_test_multi = np.hstack([X_test_tab.to_numpy(), test_img_embeddings])

test_preds_clean = xgb_multi.predict(X_test_multi)

# id -> prediction lookup. If an id occurs more than once in test_df, the
# last prediction wins, so duplicates collapse to a single value per id.
pred_map = dict(zip(test_df["id"], test_preds_clean))

test_original = pd.read_excel("../Data/test2.xlsx")

# Mapping by id keeps the row order of the original file. Ids with no entry
# in pred_map become NaN.
test_original["predicted_price"] = test_original["id"].map(pred_map)

# Number of original rows that received no prediction, shown as cell output
# so unmapped ids are noticed before the file is saved.
test_original["predicted_price"].isna().sum()

In [49]:
# saving the file

# final_df = test_original[["id", "predicted_price"]]

# final_df.to_csv("../24117044_file.csv", index=False)