# GDP PPP Prediction Challenge
## Data Mining - Doctorado UDP 2024
**Bastián González-Bustamante**

In [1]:
## Dependencies
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import KBinsDiscretizer

## Load datasets
## Paths are relative to the working directory; adjust them if the data live elsewhere
X_train, y_train = pd.read_csv("data/X_train.csv"), pd.read_csv("data/y_train.csv")
X_val, y_val = pd.read_csv("data/X_val.csv"), pd.read_csv("data/y_val.csv")

In [2]:
## PLEASE MAINTAIN THIS PART OF THE CODE
## Discretize the target variable
## pd.qcut with q=5 splits the continuous target into five quantile-based bins and
## replaces the column with the labels 'Low' through 'High'.
## Note: the bin edges are computed separately for y_train and y_val, so the same
## label may correspond to different raw value ranges in the two sets.
y_train['NY.GDP.MKTP.PP.KD'] = pd.qcut(y_train['NY.GDP.MKTP.PP.KD'], q=5, labels=['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High'])
y_val['NY.GDP.MKTP.PP.KD'] = pd.qcut(y_val['NY.GDP.MKTP.PP.KD'], q=5, labels=['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High'])

In [3]:
## Select features
## The same column list is applied to both splits (and reused later for the test set)
selected_features = ['SP.POP.TOTL']
X_train, X_val = X_train[selected_features], X_val[selected_features]

In [4]:
## Flatten y_train and y_val to 1D label arrays, as expected by scikit-learn estimators
## Wrapping in pd.DataFrame first keeps this cell safe to re-run: calling `.values`
## directly raised AttributeError on a second execution, because by then y_train and
## y_val had already been replaced by NumPy arrays (which have no `.values`).
y_train = pd.DataFrame(y_train).to_numpy().ravel()
y_val = pd.DataFrame(y_val).to_numpy().ravel()

In [5]:
## Random Forest baseline
## Hyperparameters are kept together so they are easy to spot and tweak;
## random_state is fixed so repeated fits give the same forest
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [6]:
## Evaluate on validation set
val_predictions = model.predict(X_val)

## Accuracy takes no averaging argument; the other three metrics are macro-averaged,
## i.e. computed per class and then averaged with equal weight
accuracy = accuracy_score(y_val, val_predictions)
precision, recall, f1 = (
    scorer(y_val, val_predictions, average="macro")
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Validation Metrics: Accuracy={accuracy}, Precision={precision}, Recall={recall}, F1 Score={f1}")

Validation Metrics: Accuracy=0.575134168157424, Precision=0.5713410760184536, Recall=0.5749119154388213, F1 Score=0.5728072909169908


In [7]:
## COMPLETE METADATA FOR SUBMISSION
## These three strings are joined into the submission file name when the predictions are saved
## Model labels: Naive Bayes (NB), Support Vector Machine (SVM), Random Forest (RF-X) where X is the number of trees, XGBoost
participant = "Baseline" ## Your name
model_name = "RF100" ## Model label (see the list above)
submission_number = "1" ## Submission number; keep the quotes, it is concatenated with the other strings

In [8]:
## Save predictions for submission
## Only the features of the test set are provided for making predictions; change the path if necessary
X_test = pd.read_csv("data/X_test.csv")
X_test = X_test[selected_features]
test_predictions = model.predict(X_test)

## "id" is the row index of X_test as loaded from the CSV
submission = pd.DataFrame({"id": X_test.index, "predicted_label": test_predictions})

## Create the output folder when it is missing, so the export no longer fails with
## an OSError when the notebook is run outside the GitHub repository
submissions_dir = "submissions"
os.makedirs(submissions_dir, exist_ok=True)

## File name pattern: submission_<participant>_<model_name>_<submission_number>.csv
submission_path = f"{submissions_dir}/submission_{participant}_{model_name}_{submission_number}.csv"
submission.to_csv(submission_path, index=False)