In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't work on your side
%matplotlib inline

In [None]:
# Comment this out if the data visualisations don't work on your side
%matplotlib inline
# Report the versions of the two libraries this notebook is built on.
print("TensorFlow v", tf.__version__, sep="")
print("TensorFlow Decision Forests v", tfdf.__version__, sep="")

In [None]:
# Load the dataset: read the competition's training CSV into a DataFrame and
# print its shape.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))
# Expected output: Full train dataset shape is (1460, 81)
# The data is composed of 81 columns and 1460 entries. We can see all 81 dimensions of our dataset by printing out the first 3 entries using the following code:

In [None]:
# Preview the first three rows of the freshly loaded frame.
dataset_df.head(3)

In [None]:
# Remove the 'Id' column before modelling. errors='ignore' keeps this cell
# safe to re-run: without it, a second execution raises KeyError because the
# column has already been dropped from dataset_df.
dataset_df = dataset_df.drop(columns='Id', errors='ignore')
dataset_df.head(3)

In [None]:
# Print pandas' per-column summary of the frame.
dataset_df.info()

In [None]:
#House Price Distribution
#Now let us take a look at how the house prices are distributed.

print(dataset_df['SalePrice'].describe())
plt.figure(figsize=(9, 8))
# sns.distplot is deprecated and scheduled for removal in seaborn v0.14.0.
# histplot with kde=True and stat='density' draws the same kind of figure
# (density-normalised histogram plus a KDE curve); kde_kws={'cut': 3} keeps
# the curve extending past the data range the way distplot drew it, and
# alpha is now passed directly instead of through hist_kws.
sns.histplot(dataset_df['SalePrice'], color='g', bins=100, alpha=0.4,
             kde=True, stat='density', kde_kws={'cut': 3});

In [None]:
#`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

#Please adapt your code to use either `displot` (a figure-level function with
#similar flexibility) or `histplot` (an axes-level function for histograms).

#For a guide to updating your code to use the new functions, please see
#https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

 # This is separate from the ipykernel package so we can avoid doing imports until

In [None]:
#Numerical data distribution
#We will now take a look at how the numerical features are distributed. In order to do this, let us first list all the types of data from our dataset and select only the numerical ones.

# Collect the distinct column dtypes of the frame; set() removes duplicates.
list(set(dataset_df.dtypes.tolist()))

In [None]:
# Keep only the float64 and int64 columns; every other dtype is excluded.
df_num = dataset_df.select_dtypes(include = ['float64', 'int64'])
df_num.head()

In [None]:
##Now let us plot the distribution for all the numerical features.

# One histogram per column of df_num; the trailing ';' suppresses the repr of
# the returned value in the cell output.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [None]:
#Prepare the dataset
#This dataset contains a mix of numeric, categorical and missing features. TF-DF supports all these feature types natively, and no preprocessing is required. This is one advantage of tree-based models, making them a great entry point to Tensorflow and ML.

#Now let us split the dataset into training and testing datasets:

import numpy as np

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly partition `dataset` into a training part and a held-out part.

  Every row is assigned independently: a uniform number in [0, 1) is drawn
  per row and the row goes to the held-out part when that number is below
  `test_ratio`. The part sizes are therefore only approximately
  `test_ratio` / `1 - test_ratio` of the input.

  Args:
    dataset: Object supporting `len()` and boolean-mask indexing, such as a
      pandas DataFrame.
    test_ratio: Probability for each row to land in the held-out part.
    seed: Optional seed. When given, the draws come from a private
      `np.random.default_rng(seed)` generator, so the same seed always
      yields the same split. When None (the default), NumPy's global
      generator is used as before, so `np.random.seed(...)` still applies.

  Returns:
    A `(train, test)` tuple holding the rows not selected and the rows
    selected for the held-out part, respectively.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    draws = np.random.default_rng(seed).random(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global generator before splitting: split_dataset draws from it,
# so without a seed the train/validation membership (and every metric computed
# from it) changes on each run of the notebook.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_ds_pd), len(valid_ds_pd)))
# Example output from an earlier, unseeded run:
# 1010 examples in training, 450 examples in testing.

In [None]:
#There's one more step required before we can train the model. We need to convert the dataset from Pandas format (pd.DataFrame) into TensorFlow Datasets format (tf.data.Dataset).

#TensorFlow Datasets is a high performance data loading library which is helpful when training neural networks with accelerators like GPUs and TPUs.

#By default the Random Forest Model is configured to train classification tasks. Since this is a regression problem, we will specify the type of the task (tfdf.keras.Task.REGRESSION) as a parameter here.

# Column handed to the converter as the `label` argument.
label = 'SalePrice'

# Convert each pandas split into a tf.data.Dataset, flagging the task as
# regression in both cases.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    valid_ds_pd,
    label=label,
    task=tfdf.keras.Task.REGRESSION,
)

In [None]:
#Select a Model
#There are several tree-based models for you to choose from.

#RandomForestModel
#GradientBoostedTreesModel
#CartModel
#DistributedGradientBoostedTreesModel
#To start, we'll work with a Random Forest. This is the most well-known of the Decision Forest training algorithms.

#A Random Forest is a collection of decision trees, each trained independently on a random subset of the training dataset (sampled with replacement). The algorithm is unique in that it is robust to overfitting, and easy to use.

#We can list all the available models in TensorFlow Decision Forests using the following code:

In [None]:
# Display the models reported by this TF-DF installation.
tfdf.keras.get_all_models()

In [None]:
# Create the Random Forest with the task set explicitly to regression, then
# attach mean squared error as an evaluation metric.
rf = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
rf.compile(metrics=["mse"]) # Optional, you can use this to include a list of eval metrics

In [None]:
#Train the model
#We will train the model using a one-liner.

#Note: you may see a warning about Autograph. You can safely ignore this, it will be fixed in the next release.

# Train on the training split only; valid_ds is held back for the evaluate()
# call later in the notebook.
rf.fit(x=train_ds)

In [None]:
#Visualize the model
#One benefit of tree-based models is that you can easily visualize them. The default number of trees used in the Random Forests is 300. We can select a tree to display below.

# Plot the tree at index 0 of the trained forest, with the display limited
# via max_depth=3.
tfdf.model_plotter.plot_model_in_colab(rf, tree_idx=0, max_depth=3)

In [None]:
#Evaluate the model on the Out of bag (OOB) data and the validation dataset
#Before training the model, we manually set aside roughly 30% of the dataset (test_ratio=0.30 in split_dataset) for validation, named valid_ds.
#
#We can also use the Out of bag (OOB) score to validate our RandomForestModel. To train a Random Forest model, each tree is fitted on a random sample of the training set drawn with replacement; the examples that were not drawn for a given tree are known as its Out of bag (OOB) data and can be used to evaluate the model. The OOB score is computed on this OOB data.
#
#Read more about OOB data here. (TODO: the hyperlink target was lost when this text was pasted; restore it.)
#
#The training logs show the Root Mean Squared Error (RMSE) evaluated on the out-of-bag dataset according to the number of trees in the model. Let us plot this.
#
#Note: Smaller values are better for this metric.

In [None]:
import matplotlib.pyplot as plt
# Fetch the training log entries from the model inspector and split them into
# the x series (tree count) and y series (RMSE recorded for that entry).
logs = rf.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]
oob_rmse = [entry.evaluation.rmse for entry in logs]

plt.plot(tree_counts, oob_rmse)
plt.xlabel("Number of trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()

In [None]:
#We can also see some general stats on the OOB dataset:

# Keep the inspector in a variable: later cells reuse it to read the
# variable importances.
inspector = rf.make_inspector()
inspector.evaluation()

In [None]:
#Example output of inspector.evaluation() from an earlier run:
#Evaluation(num_examples=1010, accuracy=None, loss=None, rmse=29660.363022492173, ndcg=None, aucs=None, auuc=None, qini=None)
#Now, let us run an evaluation using the validation dataset.

# Score the trained model on the validation split. return_dict=True is
# requested so the result can be iterated as name/value pairs below, each
# printed with four decimals.
evaluation = rf.evaluate(x=valid_ds,return_dict=True)

for name, value in evaluation.items():
  print(f"{name}: {value:.4f}")
    

In [None]:
#Variable importances
#Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us list the available Variable Importances for Decision Trees:

# Print the name of every variable-importance measure the inspector exposes.
# The header is a plain string: it has no placeholders, so the f-prefix the
# original carried was superfluous.
print("Available variable importances:")
for importance in inspector.variable_importances().keys():
  print("\t", importance)

In [None]:
#As an example, let us display the important features for the Variable Importance NUM_AS_ROOT.
#
#The larger the importance score for NUM_AS_ROOT, the more impact it has on the outcome of the model.
#
#By default, the list is sorted from the most important to the least. From the output you can infer that the feature at the top of the list is used as the root node in more trees of the random forest than any other feature.

In [None]:
# Show the entries recorded under the NUM_AS_ROOT importance measure.
inspector.variable_importances()["NUM_AS_ROOT"]

In [None]:
plt.figure(figsize=(12, 4))

# Importance measure to plot. NUM_AS_ROOT reflects how often a feature is used
# as the root node of a tree. (The previous comment and title spoke of "AUC"
# and "class 1 vs the others", which does not apply to this regression model.)
variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# Extract the feature name and importance values.
#
# `variable_importances` is a list of <feature, importance> tuples.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]
# The features are ordered in decreasing importance value.
feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importances, label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks, feature_names)
# Flip the axis so rank 0 (the most important feature) is drawn at the top.
plt.gca().invert_yaxis()

# TODO: Replace with "plt.bar_label()" when available.
# Label each bar with values
for importance, patch in zip(feature_importances, bar.patches):
  plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
# Build the title from the metric name so it always matches what is plotted.
plt.title(f"Variable importance: {variable_importance_metric}")
plt.tight_layout()
plt.show()

In [None]:
#Submission
#Finally predict on the competition test data using the model.

test_file_path = "../input/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_file_path)
# pop() removes 'Id' from test_data in place and returns it, so the ids stay
# available for the submission table without being passed to the model.
ids = test_data.pop('Id')

# No `label` argument is given here, unlike for the train/validation sets.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    test_data,
    task = tfdf.keras.Task.REGRESSION)

preds = rf.predict(test_ds)
# squeeze() drops length-1 axes; presumably predict() returns an (n, 1)
# array here -- TODO confirm.
output = pd.DataFrame({'Id': ids,
                       'SalePrice': preds.squeeze()})

output.head()

In [None]:

sample_submission_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
# Reuse `preds` from the previous cell instead of running a second, identical
# rf.predict(test_ds) pass, and flatten it with squeeze() so a 1-D array is
# assigned to the column (the same treatment `output` received above).
sample_submission_df['SalePrice'] = preds.squeeze()
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()
