In [2]:
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
import pandas as pd
import numpy as np


First, use the CNN to extract feature embeddings from the image data.

In [3]:
# Preprocessing applied to every image before it is passed to the CNN:
# resize to 224x224, convert to a tensor, then normalise each RGB channel.
# NOTE(review): the mean/std values look like the usual ImageNet channel
# statistics used with torchvision's pretrained models - confirm.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocess_steps)


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `pretrained=True` is deprecated in torchvision >= 0.13 and emits a warning.
# The `weights=` enum is the supported way to request the same ImageNet
# checkpoint (IMAGENET1K_V1 is what `pretrained=True` maps to). Older
# torchvision releases have no such enum, so keep the legacy call as a fallback.
_resnet50_weights = getattr(models, "ResNet50_Weights", None)
if _resnet50_weights is not None:
    resnet = models.resnet50(weights=_resnet50_weights.IMAGENET1K_V1)
else:
    resnet = models.resnet50(pretrained=True)

# Remove classification head: with an Identity in its place the forward pass
# returns whatever the network feeds into `fc`, which is used as the embedding.
resnet.fc = nn.Identity()

resnet = resnet.to(device)
# eval() switches layers such as BatchNorm to inference behaviour; it is kept
# as the last expression so the cell still displays the model summary.
resnet.eval()




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [5]:
def creating_ids(tab_df):
    """Add a ``geo_id`` join key to ``tab_df`` in place and report its cardinality.

    The key is the string form of the ``lat`` column, an underscore, and the
    string form of the ``long`` column. The number of distinct keys is printed.

    Parameters
    ----------
    tab_df : pandas.DataFrame
        Frame with ``lat`` and ``long`` columns; it is modified in place.

    Returns
    -------
    None
    """
    lat_text = tab_df["lat"].astype(str)
    long_text = tab_df["long"].astype(str)
    tab_df["geo_id"] = lat_text + "_" + long_text

    # A set collapses repeated coordinates, so this counts unique locations.
    distinct_ids = set(tab_df["geo_id"])
    print("Target samples:", len(distinct_ids))

# Load both tabular splits, then tag each with its geo_id key
# (test first, then train, so the printed counts appear in that order).
df_test = pd.read_csv('test2.csv')
df_train = pd.read_csv('train_preprocessed.csv')
for _split in (df_test, df_train):
    creating_ids(_split)

Target samples: 5357
Target samples: 14137


In [7]:
image_dir = "images_test2"
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the saved embedding CSV reproducible between runs.
image_files = sorted(os.listdir(image_dir))

embeddings = []
latitudes = []
longitudes = []
filenames = []
geo_ids = []

# Inference only: no_grad() avoids tracking gradients for every image.
with torch.no_grad():
    for file in image_files:
        if not file.endswith(".png"):
            continue

        # File names are parsed as "<lat>_<lon>.png".
        name = file.replace(".png", "")
        lat_str, lon_str = name.split("_")

        lat = float(lat_str)
        lon = float(lon_str)

        # Key used to join the embedding back onto the tabular rows.
        geo_id = f"{lat}_{lon}"

        img_path = os.path.join(image_dir, file)
        # The context manager closes the image file deterministically;
        # convert() returns an independent in-memory copy.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # unsqueeze(0) adds the batch dimension the network expects.
        img = transform(img).unsqueeze(0).to(device)

        emb = resnet(img)
        emb = emb.squeeze(0).cpu().numpy()

        # store
        embeddings.append(emb)
        latitudes.append(lat)
        longitudes.append(lon)
        filenames.append(file)
        geo_ids.append(geo_id)

# np.vstack fails with an opaque message on an empty list; say what went wrong.
if not embeddings:
    raise ValueError(f"No .png images found in {image_dir!r}")

embeddings = np.vstack(embeddings)

emb_df_test = pd.DataFrame(
    embeddings,
    columns=[f"img_emb_{i}" for i in range(embeddings.shape[1])]
)

emb_df_test["latitude"] = latitudes
emb_df_test["longitude"] = longitudes
emb_df_test["geo_id"] = geo_ids
emb_df_test["filename"] = filenames

emb_df_test.to_csv("image_embeddings_with_lat_lon_test_1.csv", index=False)

final_df_test = df_test.merge(
    emb_df_test,
    on="geo_id",
    how="inner"
)

# The inner join silently discards test rows that have no image, which would
# leave them out of the final submission - make that visible.
# NOTE(review): the training flow rebuilds geo_id from coordinates rounded to
# 5 decimals before merging, while this merge uses df_test's unrounded key;
# confirm the test image file names carry the same precision as test2.csv.
_n_unmatched = int((~df_test["geo_id"].isin(emb_df_test["geo_id"])).sum())
if _n_unmatched:
    print(
        f"Warning: {_n_unmatched} of {len(df_test)} test rows have no matching "
        "image embedding and were dropped by the inner merge"
    )


In [9]:
image_dir = "images_train(1)"
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the saved embedding CSV reproducible between runs.
image_files = sorted(os.listdir(image_dir))

embeddings = []
latitudes = []
longitudes = []
filenames = []
geo_ids = []

# Inference only: no_grad() avoids tracking gradients for every image.
with torch.no_grad():
    for file in image_files:
        if not file.endswith(".png"):
            continue

        # File names are parsed as "<lat>_<lon>.png".
        name = file.replace(".png", "")
        lat_str, lon_str = name.split("_")

        lat = float(lat_str)
        lon = float(lon_str)

        # Key used to join the embedding back onto the tabular rows.
        geo_id = f"{lat}_{lon}"

        img_path = os.path.join(image_dir, file)
        # The context manager closes the image file deterministically;
        # convert() returns an independent in-memory copy.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # unsqueeze(0) adds the batch dimension the network expects.
        img = transform(img).unsqueeze(0).to(device)

        emb = resnet(img)
        emb = emb.squeeze(0).cpu().numpy()

        # store
        embeddings.append(emb)
        latitudes.append(lat)
        longitudes.append(lon)
        filenames.append(file)
        geo_ids.append(geo_id)

# np.vstack fails with an opaque message on an empty list; say what went wrong.
if not embeddings:
    raise ValueError(f"No .png images found in {image_dir!r}")

embeddings = np.vstack(embeddings)

emb_df_train = pd.DataFrame(
    embeddings,
    columns=[f"img_emb_{i}" for i in range(embeddings.shape[1])]
)

emb_df_train["latitude"] = latitudes
emb_df_train["longitude"] = longitudes
emb_df_train["geo_id"] = geo_ids
emb_df_train["filename"] = filenames

emb_df_train.to_csv("image_embeddings_with_lat_lon.csv", index=False)






Now we will start with the regression.

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error

from xgboost import XGBRegressor

In [11]:
# Short aliases for the training data used by the regression cells.
# These are references, not copies: columns added to `df` also appear on `df_train`.
df, emb_df = df_train, emb_df_train

In [12]:
# Round the coordinates to 5 decimal places, keeping the rounded values as
# their own columns, then rebuild the merge key from the rounded values.
for _src, _dst in (("lat", "lat_round"), ("long", "lon_round")):
    df[_dst] = df[_src].round(5)

_lat_txt = df["lat_round"].astype(str)
_lon_txt = df["lon_round"].astype(str)
df["geo_id"] = _lat_txt + "_" + _lon_txt


In [13]:
# Attach the image embeddings to the tabular rows; the inner join keeps only
# rows whose geo_id appears in both frames.
final_df = pd.merge(df, emb_df, how="inner", on="geo_id")


In [14]:
target_col = "price"
img_cols = [col for col in final_df.columns if col.startswith("img_emb_")]

# Join keys, image metadata and raw embedding columns: everything that should
# not be treated as a tabular predictor.
non_tabular_cols = ["geo_id", "latitude", "longitude"]
non_tabular_cols.extend(img_cols)
non_tabular_cols.extend(["lat_round", "lon_round", "filename"])

# Keep the remaining columns, in their original order, minus the target.
tabular_features = []
for col in final_df.columns:
    if col == target_col or col in non_tabular_cols:
        continue
    tabular_features.append(col)



In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

img_cols = [name for name in final_df.columns if name.startswith("img_emb_")]

# Standardise the raw embeddings, then compress them to 50 principal components.
# NOTE(review): both transformers are fitted on every merged row, before the
# train/validation split made in a later cell, so validation rows influence
# the fitted scaling and components.
scaler = StandardScaler()
pca = PCA(n_components=50, random_state=42)

img_scaled = scaler.fit_transform(final_df[img_cols])
img_pca = pca.fit_transform(img_scaled)

_pca_col_names = [f"img_pca_{i}" for i in range(pca.n_components)]
img_pca_df = pd.DataFrame(img_pca, columns=_pca_col_names)

# Reset the tabular index so it lines up with img_pca_df's fresh 0..n-1 index.
_tabular_part = final_df[tabular_features].reset_index(drop=True)
X = pd.concat([_tabular_part, img_pca_df], axis=1)


In [16]:
# Regression target: the `price` column of the merged training frame.
y=final_df['price']

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train ,X_val ,y_train ,y_val = train_test_split(X,y ,test_size =0.2 ,random_state =42)

In [18]:
from xgboost import XGBRegressor
# Gradient-boosted regressor constructed with no arguments, i.e. library-default hyperparameters.
xgb_model=XGBRegressor()


# Fit on the training split only; the validation split is kept back for scoring.
xgb_model.fit(X_train, y_train)


In [19]:
# Predictions for the held-out validation rows.
y_pred =xgb_model.predict(X_val)

In [20]:
# Root-mean-squared error on the validation split, in the same units as `price`.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

In [21]:
# Report the validation metrics.
_val_r2 = r2_score(y_val, y_pred)
print("r2 score: ", _val_r2)
print("rmse score: ", rmse)

r2 score:  0.868109941482544
rmse score:  119592.41286971343


Now predict the prices for the test set.

In [22]:
target_col = "price"
img_cols = [col for col in final_df_test.columns if col.startswith("img_emb_")]

# Join keys, image metadata and raw embedding columns: everything that should
# not be treated as a tabular predictor.
non_tabular_cols = ["geo_id", "latitude", "longitude"]
non_tabular_cols.extend(img_cols)
non_tabular_cols.extend(["lat_round", "lon_round", "filename"])

# Keep the remaining test columns, in their original order, minus the target.
tabular_features = []
for col in final_df_test.columns:
    if col == target_col or col in non_tabular_cols:
        continue
    tabular_features.append(col)



In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

img_cols = [c for c in final_df_test.columns if c.startswith("img_emb_")]

# Reuse the StandardScaler and PCA that were fitted on the TRAINING embeddings
# (the `scaler` and `pca` objects created in the training feature cell).
# Refitting them on the test images would yield a different mean/variance and
# a different principal-component basis, so `img_pca_i` here would not mean
# what it meant when the model was trained, and the predictions would be
# computed from incompatible features.
img_scaled = scaler.transform(final_df_test[img_cols])
img_pca = pca.transform(img_scaled)

img_pca_df = pd.DataFrame(
    img_pca,
    columns=[f"img_pca_{i}" for i in range(img_pca.shape[1])]
)

# Reset the tabular index so it lines up with img_pca_df's fresh 0..n-1 index.
X_test = pd.concat(
    [final_df_test[tabular_features].reset_index(drop=True),
     img_pca_df],
    axis=1
)


In [24]:
# Remove `date` before prediction (presumably it is not among the training
# features - confirm). A non-inplace drop with errors='ignore' keeps this cell
# safe to re-run: the inplace version raised KeyError on a second execution.
X_test = X_test.drop(columns=['date'], errors='ignore')


In [25]:
# Predict on the test features; `id` is excluded from the model input here but
# stays in X_test so it can label the submission rows later.
# Note: this overwrites the validation predictions previously held in `y_pred`.
y_pred=xgb_model.predict(X_test.drop(['id'],axis=1))

In [26]:
# Wrap the raw predictions in a Series named `price` that shares X_test's index,
# so it can be concatenated column-wise onto X_test.
y_pred=pd.Series(y_pred,
    index=X_test.index,
    name='price'
)

In [27]:
# Append the predicted `price` column to the test frame.
# NOTE(review): re-running this cell appends another `price` column each time.
X_test=pd.concat([X_test,y_pred],axis=1)

In [28]:
# Submission frame: just the row identifier and its predicted price.
df_1=X_test[['id','price']]

In [29]:
# Write the submission file without the DataFrame index column.
df_1.to_csv('final_submission_1.csv',index=False)