In [2]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't work on your side
%matplotlib inline

In [3]:
# Read the competition's training split into `dataset_df` and report its size.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"
dataset_df = pd.read_csv(train_file_path)
n_rows, n_cols = dataset_df.shape
print("Full train dataset shape is {}".format((n_rows, n_cols)))

Full train dataset shape is (1460, 81)


In [4]:
# Preview the first three rows to sanity-check the columns that were loaded.
dataset_df.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [8]:
# Remove the 'Id' column before training. `errors='ignore'` keeps this cell
# re-runnable: `dataset_df` is rebound in place, so a second execution would
# otherwise raise KeyError because 'Id' has already been dropped.
dataset_df = dataset_df.drop(columns='Id', errors='ignore')


Hold out roughly 20% of the rows as validation data; this helps with finding good hyperparameters and with detecting overfitting.

In [9]:
import numpy as np

def split_dataset(dataset, test_ratio=0.20, seed=None):
  """Randomly partition a DataFrame into two disjoint subsets.

  Each row is assigned independently to the held-out subset with probability
  `test_ratio`, so the resulting sizes are only approximately proportional
  to the ratio.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability that a given row lands in the held-out subset.
    seed: Optional seed for a dedicated random generator, making the split
      reproducible. When None, NumPy's global random state is used, exactly
      as before this parameter existed.

  Returns:
    A `(kept, held_out)` tuple of DataFrames whose rows together make up
    `dataset`.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global generator so the train/validation split is identical on
# every "Restart & Run All"; split_dataset draws from that global state when
# called without further arguments.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
# The held-out rows serve as the validation set, so label them as such.
print("{} examples in training, {} examples in validation.".format(
    len(train_ds_pd), len(valid_ds_pd)))

1034 examples in training, 426 examples in testing.


A strong point of TensorFlow Decision Forests is that they accept a mix of numerical and non-numerical features directly.

In [10]:
# Convert both pandas splits into TensorFlow datasets for a regression task,
# using the same label column for each.
label = 'SalePrice'
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(
        frame, label=label, task=tfdf.keras.Task.REGRESSION)
    for frame in (train_ds_pd, valid_ds_pd)
)

Stop at 170 decision trees; the plot of out-of-bag RMSE against the number of trees further down shows why.

In [11]:
# Random-forest regressor capped at NUM_TREES trees; MSE is registered as the
# Keras metric.
NUM_TREES = 170
rf = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,
    num_trees=NUM_TREES,
)
rf.compile(metrics=["mse"])

Use /tmp/tmpn0sey966 as temporary training directory


In [13]:
# Train the forest on the training split only.
# NOTE(review): the captured log below reports 300 root(s) although the model
# is configured with num_trees=170 — the saved output looks stale; re-run
# the notebook from a fresh kernel to confirm.
# NOTE(review): `valid_ds` is built earlier but never evaluated in the visible
# cells; consider checking validation metrics before predicting on the test set.
rf.fit(x=train_ds)

Reading training dataset...
Training dataset read in 0:00:00.065466. Found 1034 examples.
Training model...


[INFO 23-11-23 20:23:06.7733 UTC kernel.cc:1243] Loading model from path /tmp/tmpn0sey966/model/ with prefix f6a141dd5c0f4e71


Model trained in 0:00:02.002453
Compiling model...


[INFO 23-11-23 20:23:07.1609 UTC decision_forest.cc:660] Model loaded with 300 root(s), 98306 node(s), and 73 input feature(s).
[INFO 23-11-23 20:23:07.1610 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-11-23 20:23:07.1611 UTC kernel.cc:1075] Use fast generic engine


Model compiled.


<keras.src.callbacks.History at 0x7af43c3ce0b0>

In [None]:
# Plot how the RMSE recorded in the training logs evolves as trees are added.
# `plt` is already imported in the notebook's first cell, so it is not
# re-imported here.
logs = rf.make_inspector().training_logs()
tree_counts = [log.num_trees for log in logs]
oob_rmse = [log.evaluation.rmse for log in logs]

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(tree_counts, oob_rmse)
ax.set_xlabel("Number of trees")
ax.set_ylabel("RMSE (out-of-bag)")
ax.set_title("Random forest RMSE vs. number of trees")
plt.show()

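This is why: the plot above shows how the out-of-bag RMSE changes as trees are added, which is what motivates capping the forest at 170 trees.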

In [14]:
# Load the competition test split; the 'Id' column is removed from the
# features and kept aside for the submission file.
test_file_path = "../input/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# No label is passed here: the test split is only used for prediction.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data, task=tfdf.keras.Task.REGRESSION)

# Squeeze the model output before placing it next to the ids.
preds = rf.predict(test_ds)
sale_prices = preds.squeeze()
output = pd.DataFrame({'Id': ids, 'SalePrice': sale_prices})

output.head()



Unnamed: 0,Id,SalePrice
0,1461,126864.226562
1,1462,153227.53125
2,1463,181783.75
3,1464,184474.859375
4,1465,196864.25


In [15]:
# Write the predictions to 'submission.csv' in the working directory;
# index=False keeps the DataFrame index out of the file.
output.to_csv('submission.csv', index=False)

Submit this file and enjoy a score of 0.14.