# Housing Prices Prediction Challenge
## Data Mining - Doctorado UDP 2025
### Bastián González-Bustamante

In [1]:
## Dependencies
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import KBinsDiscretizer

## Load datasets
## The training features are split across three CSV parts that are stacked row-wise.
## Adjust the paths below if your data lives elsewhere.
parts = [f"data/X_train_part{i}.csv" for i in (1, 2, 3)]
X_train = pd.concat(map(pd.read_csv, parts), ignore_index=True)
X_val = pd.read_csv("data/X_val.csv")
y_train = pd.read_csv("data/y_train.csv")
y_val = pd.read_csv("data/y_val.csv")

In [2]:
## Dimensions of the training feature matrix as (rows, columns)
X_train.shape

(273207, 69)

In [3]:
## Dimensions of the training target as (rows, columns); the row count should match X_train
y_train.shape

(273207, 1)

In [4]:
## PLEASE MAINTAIN THIS PART OF THE CODE
## Discretising each split separately can cause problems (different thresholds per split)
## Discretising before the split could leak information from training to validation/test
## Therefore the bins are learnt on the training data only to avoid data leakage
target = "price_sqm"
labels_all = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']

## Learn quantile bins on TRAIN ONLY (q=5); only the bin edges are kept
_, bins = pd.qcut(y_train[target], q=5, retbins=True, duplicates='drop')
## Open the outermost edges so that values outside the training range still fall in the first/last bin
bins[0], bins[-1] = -np.inf, np.inf
## duplicates='drop' can leave fewer than 5 bins, so keep one label per bin actually produced
labels = labels_all[:len(bins) - 1]

## Apply the SAME thresholds to the training and validation targets
## NOTE: the numeric target column is overwritten in place with categorical labels;
## reload the data before re-running this cell
y_train[target] = pd.cut(y_train[target], bins=bins, labels=labels, include_lowest=True)
y_val[target]   = pd.cut(y_val[target],   bins=bins, labels=labels, include_lowest=True)

In [5]:
## Select features
## 'baujahr' is the year in which the housing unit was built
selected_features = ['baujahr']
X_train = X_train.loc[:, selected_features]
X_val = X_val.loc[:, selected_features]

In [6]:
## Flatten y_train and y_val to 1D arrays
## np.asarray accepts both the loaded DataFrames and arrays that are already flat,
## so this cell can be re-run safely (unlike `.values`, which only exists on pandas objects)
y_train = np.asarray(y_train).ravel()
y_val = np.asarray(y_val).ravel()

In [7]:
## Random Forest baseline: 100 trees with a fixed seed
rf_settings = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_settings)
## Keep fit() as the last expression so the notebook displays the fitted estimator
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
## Evaluate on validation set
## Precision, recall and F1 are macro-averaged; accuracy takes no averaging argument
val_predictions = model.predict(X_val)
macro_avg = {"average": "macro"}
accuracy = accuracy_score(y_val, val_predictions)
precision, recall, f1 = (
    scorer(y_val, val_predictions, **macro_avg)
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Validation Metrics: Accuracy={accuracy}, Precision={precision}, Recall={recall}, F1 Score={f1}")

Validation Metrics: Accuracy=0.28421358294616017, Precision=0.2785572794246269, Recall=0.28411113630681994, F1 Score=0.2641014042771957


In [9]:
## COMPLETE METADATA FOR SUBMISSION
## Model codes: Naive Bayes (NB), Support Vector Machine (SVM), Random Forest (RFX) where X is the number of trees, XGBoost
## These three values are joined into the submission file name
participant = "Baseline" ## Your name
model_name = "RF100" ## Model
submission_number = "1" ## Submission number, please maintain quotes

In [10]:
## Save predictions for submission
## Only the features of the test set are provided for making predictions; adjust the path if necessary
X_test = pd.read_csv("data/X_test.csv")
X_test = X_test[selected_features]
test_predictions = model.predict(X_test)
## The row index of the test set is used as the submission id
submission = pd.DataFrame({"id": X_test.index, "predicted_label": test_predictions})

## Create the submissions folder when it does not exist yet (e.g. when running outside
## the GitHub repository); otherwise to_csv fails because the target directory is missing
submissions_dir = Path("submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)
submission_file = submissions_dir / f"submission_{participant}_{model_name}_{submission_number}.csv"
submission.to_csv(submission_file, index=False)