# Housing Prices Prediction Challenge
## Data Mining - Doctorado UDP 2025
### Bastián González-Bustamante

In [1]:
## Dependencies
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
## from datetime import datetime

## Accumulator for scored submissions and the folder they are read from
results = []
submissions_dir = 'submissions'

## Targets for both splits
## (the ground truth for the test set is not available on GitHub)
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")

## Price classes are learned from the training data only, so the
## thresholds carry no information from the test set (no data leakage)
target = "price_sqm"
labels_all = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']

## Quintile edges on TRAIN ONLY; retbins=True returns the edges and
## duplicates='drop' collapses repeated edges into a single one
_, bins = pd.qcut(y_train[target], q=5, retbins=True, duplicates='drop')

## Open both outer edges so values outside the training range still fall in a bin
bins[0] = -np.inf
bins[-1] = np.inf

## Keep one label per remaining interval
labels = labels_all[:len(bins) - 1]

## Apply the SAME thresholds to both splits
for split in (y_train, y_test):
    split[target] = pd.cut(split[target], bins=bins, labels=labels, include_lowest=True)

## Known submission dates, keyed by (participant, submission number as a string)
_SUBMISSION_DATES = {
    ("Baseline", "1"): "2025-11-06",
    ("Victor", "1"): "2025-11-06",
    ("Luis", "1"): "2025-11-06",
    ("Dayana", "1"): "2025-11-07",
    ("Dayana", "2"): "2025-11-11",
    ("Dayana", "3"): "2025-11-12",
    ("Dayana", "4"): "2025-11-13",
    ("Victor", "2"): "2025-11-13",
    ("Victor", "3"): "2025-11-14",
    ("Luis", "2"): "2025-11-14",
    ("Victor", "4"): "2025-11-15",
    ("Luis", "3"): "2025-11-15",
    ("Luis", "4"): "2025-11-17",
    ("Gabriel", "1"): "2025-11-18",
    ("Gabriel", "2"): "2025-11-19",
    ("Victor", "5"): "2025-11-21",
    ("Gabriel", "3"): "2025-11-26",
    ## Add more submissions as needed
}


## Set the date based on participant and submission
def get_dynamic_date(participant, submission):
    """Return the date (YYYY-MM-DD) recorded for a participant's submission.

    Args:
        participant: Participant name, e.g. "Luis".
        submission: Submission number as a string, e.g. "2".

    Returns:
        The date registered for that (participant, submission) pair, or
        today's local date when the pair has not been registered.
    """
    ## Imported here so the fallback works even though the file's top-level
    ## imports do not bring `datetime` into scope (it used to raise NameError)
    from datetime import datetime

    known_date = _SUBMISSION_DATES.get((participant, submission))
    if known_date is not None:
        return known_date

    ## Unregistered submission: stamp it with the current date
    return datetime.now().strftime("%Y-%m-%d")

## Mapping from numbers to labels
label_mapping = {0: "Low", 1: "Medium-Low", 2: "Medium", 3: "Medium-High", 4: "High"}

## Leaderboard columns, in output order. Passing them explicitly keeps the
## frame sortable even when no submission was scored.
leaderboard_columns = [
    "Participant", "Date", "Submission", "Model",
    "Accuracy", "Precision", "Recall", "F1-Score",
]

results = []

## sorted() makes the row order independent of the order in which the
## filesystem happens to list the directory
for submission_file in sorted(os.listdir(submissions_dir)):
    if not submission_file.endswith(".csv"):
        continue

    ## Extract participant name, model name, and submission number from the file name
    ## (fields 1, 2 and 3 of the underscore-separated name; field 0 is ignored)
    parts = submission_file.split("_")
    participant_name = parts[1]
    model_name = parts[2]
    submission_number = parts[3].split(".")[0]

    ## Load participant's submission
    submission = pd.read_csv(os.path.join(submissions_dir, submission_file))

    ## Numeric labels to categorical labels if necessary.
    ## A dtype-family check is used instead of `dtype in [int, float]`, whose
    ## result depends on the platform's default integer width; bool is
    ## excluded so that only genuinely numeric codes are mapped.
    predicted = submission["predicted_label"]
    if pd.api.types.is_numeric_dtype(predicted) and not pd.api.types.is_bool_dtype(predicted):
        submission["predicted_label"] = predicted.map(label_mapping)

    ## Compute metrics (macro average: every class weighs the same)
    y_true = y_test["price_sqm"]
    y_pred = submission["predicted_label"]
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")
    f1 = f1_score(y_true, y_pred, average="macro")

    ## Record results
    results.append({
        "Participant": participant_name,
        "Date": get_dynamic_date(participant_name, submission_number),
        "Submission": submission_number,
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

## Sort by F1-Score
leaderboard = pd.DataFrame(results, columns=leaderboard_columns)
leaderboard = leaderboard.sort_values(by="F1-Score", ascending=False)

## Update Leaderboard
leaderboard.to_csv("leaderboard.csv", index=False)
print(leaderboard)

   Participant        Date Submission  Model  Accuracy  Precision    Recall  \
11        Luis  2025-11-17          4  RF500  0.691314   0.689726  0.691510   
10        Luis  2025-11-15          3  RF500  0.688376   0.686957  0.688566   
9         Luis  2025-11-14          2  RF100  0.686737   0.685288  0.686923   
7      Gabriel  2025-11-26          3  RF500  0.569989   0.565791  0.570350   
6      Gabriel  2025-11-19          2  RF300  0.560407   0.553337  0.560802   
5      Gabriel  2025-11-18          1  RF300  0.532616   0.527531  0.532952   
16      Victor  2025-11-21          5  RF700  0.527902   0.521993  0.528215   
15      Victor  2025-11-15          4  RF100  0.525835   0.520008  0.526140   
8         Luis  2025-11-06          1  RF100  0.416278   0.417357  0.416419   
2       Dayana  2025-11-11          2  RF100  0.416159   0.417243  0.416301   
14      Victor  2025-11-14          3  RF100  0.386779   0.383570  0.386998   
13      Victor  2025-11-13          2  RF100  0.3487