In [13]:
import pandas as pd 
import seaborn as sns
import numpy as np
import keras_tuner as kt
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [14]:
train_data = pd.read_csv('titanic/train.csv')
test_data = pd.read_csv('titanic/test.csv')

In [15]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [17]:
train_data['People'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['People'] = test_data['SibSp'] + test_data['Parch'] + 1

In [18]:
train_data = train_data.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch','Ticket', 'Cabin'])
test_data = test_data.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch','Ticket', 'Cabin'])

In [19]:
train_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'People'], dtype='object')

In [20]:
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mode()[0])
test_data['Age'] = train_data['Age'].fillna(train_data['Age'].mode()[0])
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

In [21]:
train_data = pd.get_dummies(train_data, dtype='int')
test_data = pd.get_dummies(test_data, dtype='int')

In [22]:
X_train = train_data.drop(columns=['Survived'])
y_train = train_data['Survived'] 

In [51]:
def model_builder(hp):
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape=(9,)))

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  for i in range(hp.Int('No. of Layers', min_value=1, max_value=11, step=1)):
    model.add(keras.layers.Dense(units=hp.Int('n '+str(i), min_value = 32, max_value=512, step=32), activation='relu'))
  dr = hp.Float('dropout', min_value=0, max_value=0.5, step=0.1)
  model.add(keras.layers.Dropout(dr))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.BinaryCrossentropy(),
                metrics=['accuracy'])

  return model

In [52]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     directory='my_dir',
                     factor = 7,
                     project_name='intro_to_kt')

In [53]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [54]:
tuner.search(X_train, y_train, epochs=35, validation_split=0.2, callbacks=[stop_early])
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('No. of Layers')}, dropout was {best_hps.get('dropout')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


Trial 98 Complete [00h 00m 08s]
val_accuracy: 0.832402229309082

Best val_accuracy So Far: 0.8547486066818237
Total elapsed time: 00h 05m 44s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 6, dropout was 0.1 and the optimal learning rate for the optimizer
is 0.001.



In [55]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2)
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 41


In [56]:
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/41
Epoch 2/41
Epoch 3/41
Epoch 4/41
Epoch 5/41
Epoch 6/41
Epoch 7/41
Epoch 8/41
Epoch 9/41
Epoch 10/41
Epoch 11/41
Epoch 12/41
Epoch 13/41
Epoch 14/41
Epoch 15/41
Epoch 16/41
Epoch 17/41
Epoch 18/41
Epoch 19/41
Epoch 20/41
Epoch 21/41
Epoch 22/41
Epoch 23/41
Epoch 24/41
Epoch 25/41
Epoch 26/41
Epoch 27/41
Epoch 28/41
Epoch 29/41
Epoch 30/41
Epoch 31/41
Epoch 32/41
Epoch 33/41
Epoch 34/41
Epoch 35/41
Epoch 36/41
Epoch 37/41
Epoch 38/41
Epoch 39/41
Epoch 40/41
Epoch 41/41


<keras.src.callbacks.History at 0x78e7aa472e50>

In [57]:
predictions = hypermodel.predict(test_data)



In [58]:
survived = np.round(predictions,0).astype('int')

In [59]:
t = pd.read_csv('titanic/test.csv')
# print(t["PassengerId"].shape, survived.shape)
answer = pd.DataFrame({"PassengerId" : t["PassengerId"], "Survived":survived.ravel()})
answer.to_csv('titanic/predictions.csv',index=False)

In [60]:
best_hps.values

{'No. of Layers': 6,
 'n 0': 64,
 'dropout': 0.1,
 'learning_rate': 0.001,
 'n 1': 480,
 'n 2': 128,
 'n 3': 128,
 'n 4': 384,
 'n 5': 448,
 'n 6': 32,
 'n 7': 192,
 'n 8': 96,
 'n 9': 128,
 'n 10': 224,
 'tuner/epochs': 100,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [76]:
import tensorflow_decision_forests as tfdf
from tensorflow_decision_forests.keras import pd_dataframe_to_tf_dataset 

In [81]:
train_ds = pd_dataframe_to_tf_dataset(train_data, label='Survived')
# test_ds = test_data.insert(column="Survived",value=[0]*418, loc=9)

In [82]:
test_ds = pd_dataframe_to_tf_dataset(test_data, label='Survived')

In [87]:
model = tfdf.keras.RandomForestModel()
model.fit(train_ds, validation_split=0.2)

Use /tmp/tmplnkxsq9h as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.111907. Found 891 examples.
Training model...


[INFO 24-02-21 09:31:05.5436 UTC kernel.cc:1233] Loading model from path /tmp/tmplnkxsq9h/model/ with prefix 90256cc6f9df4912


Model trained in 0:00:00.114095
Compiling model...
Model compiled.


[INFO 24-02-21 09:31:05.5822 UTC decision_forest.cc:660] Model loaded with 300 root(s), 47030 node(s), and 9 input feature(s).
[INFO 24-02-21 09:31:05.5822 UTC abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-02-21 09:31:05.5823 UTC kernel.cc:1061] Use fast generic engine


<keras.src.callbacks.History at 0x78e76bf0e650>

In [88]:
model.compile(["accuracy"])
test_accuracy = model.evaluate(train_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy without hyper-parameter tuning: {test_accuracy:.4f}")

Test accuracy without hyper-parameter tuning: 0.9046


In [89]:
tuner = tfdf.tuner.RandomSearch(num_trials=110)
tuner.choice("min_examples", [2, 5, 7, 10, 14, 21])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8, 10, 12])
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256])
tuner.choice("use_hessian_gain", [True, False])
tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",
                     ["NONE", "STANDARD_DEVIATION", "MIN_MAX"])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5])

<tensorflow_decision_forests.component.tuner.tuner.SearchSpace at 0x78e764151010>

In [90]:
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=2)


Use /tmp/tmp0pnh9t9n as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'Pclass': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'Age': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Fare': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'People': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'Sex_female': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'Sex_male': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'Embarked_C': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'Embarked_Q': <tf.Tensor 'data_7:0' shape=(None,) dtype=int64>, 'Embarked_S': <tf.Tensor 'data_8:0' shape=(None,) dtype=int64>}
Label: Tensor("data_9:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'Pclass': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'Age': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_1:0' shape=(None,) dtype=float32

[INFO 24-02-21 09:31:31.8639 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 24-02-21 09:31:31.8639 UTC kernel.cc:772] Collect training examples
[INFO 24-02-21 09:31:31.8639 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-02-21 09:31:31.8640 UTC kernel.cc:391] Number of batches: 1
[INFO 24-02-21 09:31:31.8640 UTC kernel.cc:392] Number of examples: 891
[INFO 24-02-21 09:31:31.8640 UTC kernel.cc:792] Training dataset:
Number of records: 891
Number of columns: 10

Number of columns by type:
	NUMERICAL: 9 (90%)
	CATEGORICAL: 1 (10%)

Columns:

NUMERICAL: 9 (90%)
	0: "Age" NUMERICAL mean:28.567 min:0.42 max:80 sd:13.1922
	1: "Embarked_C"

Model trained in 0:00:04.204566
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x78e764586c90>

In [91]:
tuned_model.compile(["accuracy"])
tuned_test_accuracy = tuned_model.evaluate(train_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy with the TF-DF hyper-parameter tuner: {tuned_test_accuracy:.4f}")

Test accuracy with the TF-DF hyper-parameter tuner: 0.9675


In [92]:
tuner2 = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)

# Define and train the model.
tuned_model2 = tfdf.keras.GradientBoostedTreesModel(tuner=tuner2)
tuned_model2.fit(train_ds, verbose=2)


Use /tmp/tmpsby758fw as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'Pclass': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'Age': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Fare': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'People': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'Sex_female': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'Sex_male': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'Embarked_C': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'Embarked_Q': <tf.Tensor 'data_7:0' shape=(None,) dtype=int64>, 'Embarked_S': <tf.Tensor 'data_8:0' shape=(None,) dtype=int64>}
Label: Tensor("data_9:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'Pclass': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'Age': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_1:0' shape=(None,) dtype=float32

[INFO 24-02-21 09:33:15.4768 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 24-02-21 09:33:15.4768 UTC kernel.cc:772] Collect training examples
[INFO 24-02-21 09:33:15.4768 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-02-21 09:33:15.4768 UTC kernel.cc:391] Number of batches: 1
[INFO 24-02-21 09:33:15.4768 UTC kernel.cc:392] Number of examples: 891
[INFO 24-02-21 09:33:15.4769 UTC kernel.cc:792] Training dataset:
Number of records: 891
Number of columns: 10

Number of columns by type:
	NUMERICAL: 9 (90%)
	CATEGORICAL: 1 (10%)

Columns:

NUMERICAL: 9 (90%)
	0: "Age" NUMERICAL mean:28.567 min:0.42 max:80 sd:13.1922
	1: "Embarked_C"

Model trained in 0:00:08.415172
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x78e69cf05290>

In [94]:
tuned_model2.compile(["accuracy"])
tuned_test_accuracy2 = tuned_model2.evaluate(train_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy with the TF-DF hyper-parameter tuner: {tuned_test_accuracy:.4f}")






Test accuracy with the TF-DF hyper-parameter tuner: 0.9675


In [95]:
predictions_rfm = tuned_model.predict(test_ds)







In [99]:
answer = pd.DataFrame({"PassengerId" : t["PassengerId"], "Survived":np.round(predictions_rfm,0).astype('int').ravel()})
answer.to_csv('titanic/predictions.csv',index=False)

In [100]:
tuned_model.summary()

Model: "gradient_boosted_trees_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (9):
	Age
	Embarked_C
	Embarked_Q
	Embarked_S
	Fare
	Pclass
	People
	Sex_female
	Sex_male

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.        "Age"  0.404580 ################
    2.       "Fare"  0.214968 ####
    3. "Embarked_C"  0.203683 ###
    4. "Embarked_Q"  0.184145 ##
    5.     "Pclass"  0.184070 ##
    6. "Sex_female"  0.179725 #
    7. "Embarked_S"  0.179184 #
    8.     "People"  0.152488 
    9.   "Sex_male"  0.147862 

Variable Importance: NUM_AS_ROOT:
    1.        "Age" 19.000000 ################
    2. "Sex_female" 11.000000 ########
    3. "Em