# GDP PPP Prediction Challenge
## Data Mining - Doctorado UDP 2024
**Bastián González-Bustamante**

In [1]:
## Dependencies
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime

## Folder that holds the participants' CSV submissions
submissions_dir = 'submissions'
results = []

## Ground truth for the test set (file is not available on GitHub)
y_test = pd.read_csv("data/y_test.csv")

## Replace the raw 'NY.GDP.MKTP.PP.KD' values with five quantile-based
## categories, labelled from the lowest bin to the highest
gdp_labels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']
y_test['NY.GDP.MKTP.PP.KD'] = pd.qcut(
    y_test['NY.GDP.MKTP.PP.KD'],
    q=len(gdp_labels),
    labels=gdp_labels,
)

## Set the date based on participant and submission
def get_dynamic_date(participant, submission):
    """Return the recorded date of a known submission, or today's date.

    Args:
        participant: Participant name, e.g. "Nando".
        submission: Submission number as a string, e.g. "2".

    Returns:
        A "YYYY-MM-DD" string. Known (participant, submission) pairs map to
        fixed dates; any other pair gets the current date.
    """
    ## Fixed dates of the submissions registered so far
    known_dates = (
        ("Baseline", "1", "2024-11-08"),
        ("Fabian", "1", "2024-11-08"),
        ("Laura", "1", "2024-11-08"),
        ("Nando", "1", "2024-11-08"),
        ("Nando", "2", "2024-11-10"),
        ("Nando", "3", "2024-11-11"),
        ## Add more entries as needed
    )
    for name, number, date in known_dates:
        if participant == name and submission == number:
            return date

    ## Unregistered submission: use the day the leaderboard is being built
    return datetime.now().strftime("%Y-%m-%d")

## Mapping from numeric class codes to categorical labels
label_mapping = {0: "Low", 1: "Medium-Low", 2: "Medium", 3: "Medium-High", 4: "High"}

## Column order of the leaderboard; passing it explicitly keeps the table
## well-formed (and sortable) even when no submission was scored
leaderboard_columns = [
    "Participant", "Date", "Submission", "Model",
    "Accuracy", "Precision", "Recall", "F1-Score",
]

results = []

## Ground-truth labels every submission is scored against
y_true = y_test["NY.GDP.MKTP.PP.KD"]

## os.listdir() returns entries in arbitrary order; sort them so the row
## index and the order of tied scores are reproducible across runs
for submission_file in sorted(os.listdir(submissions_dir)):
    if not submission_file.endswith(".csv"):
        continue

    ## Participant, model and submission number are the 2nd, 3rd and 4th
    ## "_"-separated pieces of the file name; skip files that lack them
    ## instead of aborting the whole leaderboard with an IndexError
    parts = submission_file.split("_")
    if len(parts) < 4:
        print(f"Skipping '{submission_file}': unexpected file name format")
        continue
    participant_name = parts[1]
    model_name = parts[2]
    submission_number = parts[3].split(".")[0]

    ## Load participant's submission
    submission = pd.read_csv(os.path.join(submissions_dir, submission_file))

    ## Fail early, naming the offending file, when the number of predictions
    ## does not match the number of ground-truth rows
    if len(submission) != len(y_true):
        raise ValueError(
            f"'{submission_file}' has {len(submission)} rows, "
            f"expected {len(y_true)}"
        )

    ## Numeric labels to categorical labels if necessary. Comparing the dtype
    ## against the Python types int/float only matches the platform-default
    ## widths, so other numeric dtypes (e.g. int32) would be left unmapped
    if pd.api.types.is_numeric_dtype(submission["predicted_label"]):
        submission["predicted_label"] = submission["predicted_label"].map(label_mapping)

    y_pred = submission["predicted_label"]

    ## Compute metrics (macro average: every class weighs the same)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")
    f1 = f1_score(y_true, y_pred, average="macro")

    ## Record results; the date comes from the registry of known submissions
    results.append({
        "Participant": participant_name,
        "Date": get_dynamic_date(participant_name, submission_number),
        "Submission": submission_number,
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
    })

## Sort by F1-Score, best first
leaderboard = pd.DataFrame(results, columns=leaderboard_columns)
leaderboard = leaderboard.sort_values(by="F1-Score", ascending=False)

## Update Leaderboard
leaderboard.to_csv("leaderboard.csv", index=False)
print(leaderboard)

  Participant        Date Submission    Model  Accuracy  Precision    Recall  \
1      Fabian  2024-11-08          1   RF1000  0.897230   0.898401  0.897205   
3       Nando  2024-11-08          1    RF500  0.892761   0.895967  0.892669   
2       Laura  2024-11-08          1    RF100  0.806971   0.810881  0.806907   
5       Nando  2024-11-11          3  XGBoost  0.724754   0.773685  0.724768   
4       Nando  2024-11-10          2  XGBoost  0.695264   0.747207  0.695207   
0    Baseline  2024-11-08          1    RF100  0.529937   0.537815  0.529781   

   F1-Score  
1  0.897364  
3  0.892200  
2  0.808453  
5  0.715252  
4  0.678995  
0  0.532958  
