## Imported Libraries

In [3]:
import pandas as pd
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

## Setup

In [4]:
# Source dataset: "Monthly Gold Prices (1979-2021)" from Kaggle, stored next to the notebook.
gold_data_path = os.path.abspath('gold.csv')

# Read the CSV and discard every row that contains at least one missing value.
gold_data = pd.read_csv(gold_data_path).dropna(axis=0)

# Columns fed to the model as inputs; 'Close' is the value being predicted.
gold_data_features = ['Open', 'Low', 'Volume']
x = gold_data[gold_data_features]
y = gold_data['Close']

## Validation Testing

In [5]:
# Hold out part of the rows for validation; random_state=1 keeps the split reproducible.
train_x, validation_x, train_y, validation_y = train_test_split(
    x,
    y,
    random_state=1,
)

# Baseline decision tree with no explicit limit on its size.
gold_model = DecisionTreeRegressor(random_state=1)
gold_model.fit(train_x, train_y)

# Predict the held-out rows and compare against their actual closing values.
validation_predictions = gold_model.predict(validation_x)
MAE = mean_absolute_error(validation_y, validation_predictions)

print(f"Non-Optimized Model MAE: {MAE} \n")

Non-Optimized Model MAE: 10.26907122905028 



## Optimized Model

In [6]:
# model optimization: checks how many leaf nodes are needed for a better MAE
def getMAE(leaf_nodes, train_x, train_y, val_x, val_y):
    """Return the validation MAE of a decision tree capped at `leaf_nodes` leaves.

    Args:
        leaf_nodes: value passed as `max_leaf_nodes` to the regressor.
        train_x: features the tree is fitted on.
        train_y: targets the tree is fitted on.
        val_x: features used to produce the predictions being scored.
        val_y: actual target values the predictions are compared against.

    Returns:
        The mean absolute error between `val_y` and the tree's predictions
        for `val_x`.
    """
    # random_state is pinned so repeated calls with the same inputs use the same seed.
    candidate_tree = DecisionTreeRegressor(max_leaf_nodes=leaf_nodes, random_state=1)
    candidate_tree.fit(train_x, train_y)
    predicted = candidate_tree.predict(val_x)
    return mean_absolute_error(val_y, predicted)


leafs_to_try = [10, 30, 50, 70, 100, 150, 200, 300, 400, 500, 600, 1000]    # candidate max_leaf_nodes values

MAEvalue = {}   # maps each candidate tree size to its validation MAE

for i in leafs_to_try:
    # Train and score each candidate exactly once, then report the stored score.
    # Calling getMAE again inside the print would refit the same tree a second time.
    MAEvalue[i] = getMAE(i, train_x, train_y, validation_x, validation_y)
    print(f"Max Leaf Node: {i}  \t\t MAE: {MAEvalue[i]}")

# min() over the dict iterates its keys; key=MAEvalue.get ranks them by their MAE,
# so the result is the tree size with the lowest validation error.
optimal_tree_size = min(MAEvalue, key=MAEvalue.get)
print(f"\n Optimal Tree Size: {optimal_tree_size}")

Max Leaf Node: 10  		 MAE: 39.7833661572689
Max Leaf Node: 30  		 MAE: 17.47898893612397
Max Leaf Node: 50  		 MAE: 13.91380965239011
Max Leaf Node: 70  		 MAE: 13.106381307462183
Max Leaf Node: 100  		 MAE: 12.585506401119742
Max Leaf Node: 150  		 MAE: 11.762034122868641
Max Leaf Node: 200  		 MAE: 10.780831027206021
Max Leaf Node: 300  		 MAE: 10.28359831470839
Max Leaf Node: 400  		 MAE: 10.206088972992944
Max Leaf Node: 500  		 MAE: 10.235753685456274
Max Leaf Node: 600  		 MAE: 10.208380659156218
Max Leaf Node: 1000  		 MAE: 10.439191604150114

 Optimal Tree Size: 400


## Final Model

In [8]:
# Refit on every available row, using the leaf limit chosen during optimization;
# no validation split is held back at this stage.
final_model = DecisionTreeRegressor(
    max_leaf_nodes=optimal_tree_size,
    random_state=1,
)
final_model.fit(x, y)

# NOTE: these predictions are for the same rows the model was just fitted on.
final_gold_prediction = final_model.predict(x)

## Output to CSV

In [9]:
# Pair each row's date with the model's predicted closing value and save without the index column.
output = pd.DataFrame(
    {
        'Date': gold_data['Date'],
        'Close': final_gold_prediction,
    }
)
output.to_csv('gold_predictions_DT.csv', index=False)