In [96]:
import pandas as pd 
import seaborn as sns
import numpy as np
import keras_tuner as kt
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import F1Score

In [97]:
train_data = pd.read_csv('titanic/train.csv')
test_data = pd.read_csv('titanic/test.csv')

In [98]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [99]:
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [100]:
train_data['People'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['People'] = test_data['SibSp'] + test_data['Parch'] + 1

In [101]:
train_data['Size'] = train_data['Name'] .apply(lambda x : len(x))
test_data['Size'] = test_data['Name'] .apply(lambda x : len(x))
train_data = train_data.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch','Ticket', 'Cabin'])
test_data = test_data.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch','Ticket', 'Cabin'])

In [102]:
train_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'People',
       'Size'],
      dtype='object')

In [103]:
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mode()[0])
test_data['Age'] = train_data['Age'].fillna(train_data['Age'].mode()[0])
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

In [104]:
train_data = pd.get_dummies(train_data, dtype='int')
test_data = pd.get_dummies(test_data, dtype='int')

In [105]:
X_train = train_data.drop(columns=['Survived'])
y_train = train_data['Survived'] 

In [106]:
X_train.columns

Index(['Pclass', 'Age', 'Fare', 'People', 'Size', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [116]:
def model_builder(hp):
  model = keras.Sequential()
  model.add(keras.layers.Flatten(input_shape=(10,)))

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  for i in range(hp.Int('No. of Layers', min_value=1, max_value=7, step=1)):
    model.add(keras.layers.Dense(units=hp.Int('n '+str(i), min_value = 32, max_value=512, step=32), activation='relu'))
  dr = hp.Float('dropout', min_value=0, max_value=0.5, step=0.1)
  model.add(keras.layers.Dropout(dr))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss=keras.losses.BinaryCrossentropy(),
                metrics=['accuracy'])

  return model

In [117]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     directory='my_dir',
                     factor = 70,
                     project_name='intro_to_kt')

In [118]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [119]:
tuner.search(X_train, y_train, epochs=35, validation_split=0.2, callbacks=[stop_early])
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('No. of Layers')}, dropout was {best_hps.get('dropout')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


Trial 110 Complete [00h 00m 06s]
val_accuracy: 0.832402229309082

Best val_accuracy So Far: 0.8491619825363159
Total elapsed time: 00h 03m 37s

The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 1, dropout was 0.30000000000000004 and the optimal learning rate for the optimizer
is 0.01.



In [120]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2)
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Best epoch: 35


In [121]:
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch, validation_split=0.2)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.src.callbacks.History at 0x704aa297a850>

In [122]:
m = F1Score()
y_true = y_train[:178].to_numpy().reshape(178,1)
y_pred = np.round(hypermodel.predict(X_train[:178])).astype('int')
m.update_state(y_true, y_pred)
res = m.result()
res.numpy()



array([0.75000006], dtype=float32)

In [123]:
predictions = hypermodel.predict(test_data)



In [124]:
survived = np.round(predictions,0).astype('int')

In [125]:
t = pd.read_csv('titanic/test.csv')
# print(t["PassengerId"].shape, survived.shape)
answer = pd.DataFrame({"PassengerId" : t["PassengerId"], "Survived":survived.ravel()})
answer.to_csv('titanic/predictions.csv',index=False)

In [114]:
best_hps.values

{'No. of Layers': 1,
 'n 0': 384,
 'dropout': 0.2,
 'learning_rate': 0.01,
 'n 1': 480,
 'n 2': 64,
 'n 3': 288,
 'n 4': 64,
 'n 5': 128,
 'n 6': 192,
 'n 7': 224,
 'n 8': 64,
 'n 9': 288,
 'n 10': 384,
 'tuner/epochs': 100,
 'tuner/initial_epoch': 2,
 'tuner/bracket': 1,
 'tuner/round': 1,
 'tuner/trial_id': '0015'}

In [126]:
import tensorflow_decision_forests as tfdf
from tensorflow_decision_forests.keras import pd_dataframe_to_tf_dataset 

In [129]:
train_ds = pd_dataframe_to_tf_dataset(train_data, label='Survived')
test_ds = test_data.insert(column="Survived",value=[0]*418, loc=10)

In [130]:
test_data

Unnamed: 0,Pclass,Age,Fare,People,Size,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,3,22.0,7.2500,1,16,0,1,0,0,1,0
1,3,38.0,71.2833,2,32,1,0,1,0,0,0
2,2,26.0,7.9250,1,25,0,1,0,0,1,0
3,3,35.0,53.1000,1,16,0,1,0,0,1,0
4,3,35.0,8.0500,3,44,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
413,3,24.0,0.0000,1,18,0,1,0,0,1,0
414,1,44.0,7.9250,1,28,1,0,0,0,1,0
415,3,24.0,8.0500,1,28,0,1,0,0,1,0
416,3,34.0,32.5000,1,19,0,1,0,0,1,0


In [131]:
# test_ds = pd_dataframe_to_tf_dataset(train_data[:178], label='Survived')
test_ds = pd_dataframe_to_tf_dataset(test_data, label='Survived')

In [30]:
model = tfdf.keras.RandomForestModel()
model.fit(train_ds, validation_split=0.2)

Use /tmp/tmp8emijaf_ as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.110156. Found 713 examples.
Training model...
Model trained in 0:00:00.085700
Compiling model...


[INFO 24-02-22 06:48:38.1120 UTC kernel.cc:1233] Loading model from path /tmp/tmp8emijaf_/model/ with prefix cce4213c9f9e4ee2
[INFO 24-02-22 06:48:38.1404 UTC decision_forest.cc:660] Model loaded with 300 root(s), 37318 node(s), and 9 input feature(s).
[INFO 24-02-22 06:48:38.1404 UTC abstract_model.cc:1344] Engine "RandomForestOptPred" built
[INFO 24-02-22 06:48:38.1404 UTC kernel.cc:1061] Use fast generic engine


Model compiled.


<keras.src.callbacks.History at 0x704bdc60b950>

In [31]:
model.compile(["accuracy"])
test_accuracy = model.evaluate(test_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy without hyper-parameter tuning: {test_accuracy:.4f}")

Test accuracy without hyper-parameter tuning: 0.8202


In [132]:
tuner = tfdf.tuner.RandomSearch(num_trials=1100)
tuner.choice("min_examples", [2, 5, 7, 10, 14, 21])
tuner.choice("categorical_algorithm", ["CART", "RANDOM"])
local_search_space = tuner.choice("growing_strategy", ["LOCAL"])
local_search_space.choice("max_depth", [3, 4, 5, 6, 8, 10, 12, 14])
global_search_space = tuner.choice("growing_strategy", ["BEST_FIRST_GLOBAL"], merge=True)
global_search_space.choice("max_num_nodes", [16, 32, 64, 128, 256, 512])
tuner.choice("use_hessian_gain", [True, False])
tuner.choice("shrinkage", [0.02, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30])
tuner.choice("num_candidate_attributes_ratio", [0.2, 0.5, 0.9, 1.0])
tuner.choice("split_axis", ["AXIS_ALIGNED"])
oblique_space = tuner.choice("split_axis", ["SPARSE_OBLIQUE"], merge=True)
oblique_space.choice("sparse_oblique_normalization",
                     ["NONE", "STANDARD_DEVIATION", "MIN_MAX"])
oblique_space.choice("sparse_oblique_weights", ["BINARY", "CONTINUOUS"])
oblique_space.choice("sparse_oblique_num_projections_exponent", [1.0, 1.5, 2.0, 2.5])

<tensorflow_decision_forests.component.tuner.tuner.SearchSpace at 0x70493c4c9610>

In [133]:
tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
tuned_model.fit(train_ds, verbose=2)


Use /tmp/tmpmzhz5c9s as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'Pclass': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'Age': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Fare': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'People': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'Size': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'Sex_female': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'Sex_male': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'Embarked_C': <tf.Tensor 'data_7:0' shape=(None,) dtype=int64>, 'Embarked_Q': <tf.Tensor 'data_8:0' shape=(None,) dtype=int64>, 'Embarked_S': <tf.Tensor 'data_9:0' shape=(None,) dtype=int64>}
Label: Tensor("data_10:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'Pclass': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'Age': SemanticTensor(semantic=<Semantic.NUMERICAL: 1



Training dataset read in 0:00:00.121300. Found 891 examples.
Training model...


[INFO 24-02-22 07:36:33.1665 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 24-02-22 07:36:33.1665 UTC kernel.cc:772] Collect training examples
[INFO 24-02-22 07:36:33.1665 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-02-22 07:36:33.1666 UTC kernel.cc:391] Number of batches: 1
[INFO 24-02-22 07:36:33.1666 UTC kernel.cc:392] Number of examples: 891
[INFO 24-02-22 07:36:33.1666 UTC kernel.cc:792] Training dataset:
Number of records: 891
Number of columns: 11

Number of columns by type:
	NUMERICAL: 10 (90.9091%)
	CATEGORICAL: 1 (9.09091%)

Columns:

NUMERICAL: 10 (90.9091%)
	0: "Age" NUMERICAL mean:28.567 min:0.42 max:80 sd:13.1922

Model trained in 0:02:36.372082
Compiling model...
Model compiled.




<keras.src.callbacks.History at 0x704ac09199d0>

In [134]:
tuned_model.compile(["accuracy"])
tuned_test_accuracy = tuned_model.evaluate(test_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy with the TF-DF hyper-parameter tuner: {tuned_test_accuracy:.4f}")





Test accuracy with the TF-DF hyper-parameter tuner: 0.6388


In [25]:
tuner2 = tfdf.tuner.RandomSearch(num_trials=500, use_predefined_hps=True)

# Define and train the model.
tuned_model2 = tfdf.keras.GradientBoostedTreesModel(tuner=tuner2)
tuned_model2.fit(train_ds, verbose=2)


Use /tmp/tmpf8og66t6 as temporary training directory
Reading training dataset...




Training tensor examples:
Features: {'Pclass': <tf.Tensor 'data:0' shape=(None,) dtype=int64>, 'Age': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Fare': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'People': <tf.Tensor 'data_3:0' shape=(None,) dtype=int64>, 'Sex_female': <tf.Tensor 'data_4:0' shape=(None,) dtype=int64>, 'Sex_male': <tf.Tensor 'data_5:0' shape=(None,) dtype=int64>, 'Embarked_C': <tf.Tensor 'data_6:0' shape=(None,) dtype=int64>, 'Embarked_Q': <tf.Tensor 'data_7:0' shape=(None,) dtype=int64>, 'Embarked_S': <tf.Tensor 'data_8:0' shape=(None,) dtype=int64>}
Label: Tensor("data_9:0", shape=(None,), dtype=int64)
Weights: None
Normalized tensor features:
 {'Pclass': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast:0' shape=(None,) dtype=float32>), 'Age': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'Cast_1:0' shape=(None,) dtype=float32>), 'Fare': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'C

[INFO 24-02-22 06:45:45.7465 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 24-02-22 06:45:45.7465 UTC kernel.cc:772] Collect training examples
[INFO 24-02-22 06:45:45.7465 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 24-02-22 06:45:45.7467 UTC kernel.cc:391] Number of batches: 1
[INFO 24-02-22 06:45:45.7467 UTC kernel.cc:392] Number of examples: 713
[INFO 24-02-22 06:45:45.7468 UTC kernel.cc:792] Training dataset:
Number of records: 713
Number of columns: 10

Number of columns by type:
	NUMERICAL: 9 (90%)
	CATEGORICAL: 1 (10%)

Columns:

NUMERICAL: 9 (90%)
	0: "Age" NUMERICAL mean:28.8925 min:0.42 max:80 sd:13.0626
	1: "Embarked_C

Model trained in 0:00:45.245856
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x704c301cf910>

In [28]:
tuned_model2.compile(["accuracy"])
tuned_test_accuracy2 = tuned_model2.evaluate(test_ds, return_dict=True, verbose=0)["accuracy"]
print(f"Test accuracy with the TF-DF hyper-parameter tuner: {tuned_test_accuracy2:.4f}")


Test accuracy with the TF-DF hyper-parameter tuner: 0.7978


In [135]:
predictions_rfm = tuned_model.predict(test_ds)



In [136]:
t = pd.read_csv('titanic/test.csv')
answer = pd.DataFrame({"PassengerId" : t["PassengerId"], "Survived":np.round(predictions_rfm,0).astype('int').ravel()})
answer.to_csv('titanic/predictions.csv',index=False)

In [56]:
tuned_model.summary()

Model: "gradient_boosted_trees_model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (9):
	Age
	Embarked_C
	Embarked_Q
	Embarked_S
	Fare
	Pclass
	People
	Sex_female
	Sex_male

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.        "Age"  0.480781 ################
    2.       "Fare"  0.233748 ##
    3. "Embarked_C"  0.229072 ##
    4. "Embarked_Q"  0.212467 #
    5.     "Pclass"  0.204518 #
    6. "Embarked_S"  0.203581 #
    7. "Sex_female"  0.190493 
    8.   "Sex_male"  0.187266 
    9.     "People"  0.184621 

Variable Importance: NUM_AS_ROOT:
    1.        "Age" 40.000000 ################
    2.       "Fare"  9.000000 ##
    3. "Embarked_C" 