In [385]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,precision_score


In [386]:
TRAIN_DATA="data\\train.csv"
TEST_DATA="data\\test.csv"
PRED_DATA="data\\pred.csv"
TARGET="data\\target.csv"
SEED=4567
threshold=0.5

In [387]:
train_data=pd.read_csv(TRAIN_DATA)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [388]:
def preprocess(data):
    Titles =["Mr","Miss","Mrs","Master"]

    for title in Titles:
        t= title+". "
        data[title] = data["Name"].str.contains(t)

    cat_variables = ['Sex',
    'Pclass',
    'Embarked',
    ]
    
    # One hot encoding
    data = pd.get_dummies(data = data,
                            prefix = cat_variables,
                            columns = cat_variables)
    # Initialize the class
    #scaler_linear = StandardScaler()
    # Compute the mean and standard deviation of the training set then transform it
    #data[["Scaled_Fare","Scaled_Age"]] = scaler_linear.fit_transform(data[["Fare","Age"]])
    

    return data
train_data = preprocess(train_data)


In [389]:
non_features = ["PassengerId","Survived","Name","Ticket","Cabin","Fare","Age"]
features = [x for x in train_data.columns if x not in non_features]
#features=["Sex_female"]
n=len(features) # Number of features
X_train,X_cv,y_train,y_cv=train_test_split(train_data[features],train_data["Survived"],train_size=0.70,random_state = SEED)
y_train=np.asarray(y_train).astype(np.float32)
X_train=np.asarray(X_train).astype(np.float32)
y_cv=np.asarray(y_cv).astype(np.float32)
X_cv=np.asarray(X_cv).astype(np.float32)
print(f"Data set has {n} features")
print(f"X_train Shape {X_train.shape} Y_train Shape {y_train.shape}")
print(f"X_train type {X_train.dtype} Y_train Shape {y_train.dtype}")

Data set has 14 features
X_train Shape (623, 14) Y_train Shape (623,)
X_train type float32 Y_train Shape float32


In [390]:
print(f'train samples: {len(X_train)}')
print(f'validation samples: {len(X_cv)}')
print(f'target proportion train: {sum(y_train)/len(y_train):.4f}')
print(f'target proportion cv: {sum(y_cv)/len(y_cv):.4f}')

train samples: 623
validation samples: 268
target proportion train: 0.3692
target proportion cv: 0.4179


In [391]:
#Define Model
tf.random.set_seed(SEED)  # applied to achieve consistent results
model = Sequential(
    [
        tf.keras.Input(shape=(n,)),
        Dense(35, activation="relu", name="layer1"),
        Dense(17, activation="relu", name="layer2"),
        Dense(8, activation="relu", name="layer3"),
        Dense(12, activation="relu", name="layer4"),

        Dense(1, activation="sigmoid", name="layer5"),
    ]
)
W1, b1 = model.get_layer("layer1").get_weights()
W5, b5 = model.get_layer("layer5").get_weights()
#print(f"W1{W1.shape}:\n", W1, f"\nb1{b1.shape}:", b1)
print(f"W2{W5.shape}:\n", W5, f"\nb2{b5.shape}:", b5)


W2(12, 1):
 [[-0.05386007]
 [-0.575647  ]
 [-0.5163946 ]
 [-0.3949987 ]
 [-0.4050623 ]
 [-0.6610146 ]
 [-0.36833057]
 [ 0.47004247]
 [-0.03206962]
 [-0.18600571]
 [ 0.21406275]
 [ 0.6374413 ]] 
b2(1,): [0.]


In [392]:
model.compile(
loss=tf.keras.losses.BinaryCrossentropy(),#from_logits=True),
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
)
#tf.convert_to_tensor(X_train["SibSp"], dtype=tf.float32)

print(f"X_train Shape {X_train.shape} Y_train Shape {y_train.shape}")
print(f"X_train type {X_train.dtype} Y_train Shape {y_train.dtype}")

[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name,  l.dtype) for l in model.layers]
# Train the model
model.fit(
     X_train, y_train,
     epochs=200,
     verbose=2
)
print(f"Training {model.name}...  \n {model.summary()}")

yhat_train = model.predict(X_train) 
yhat_cv = model.predict(X_cv) 


X_train Shape (623, 14) Y_train Shape (623,)
X_train type float32 Y_train Shape float32
(None, 14) float32
(None, 1) float32
layer1 float32
layer2 float32
layer3 float32
layer4 float32
layer5 float32
Epoch 1/200
20/20 - 1s - 49ms/step - loss: 0.5436
Epoch 2/200
20/20 - 0s - 2ms/step - loss: 0.4394
Epoch 3/200
20/20 - 0s - 2ms/step - loss: 0.4135
Epoch 4/200
20/20 - 0s - 2ms/step - loss: 0.4032
Epoch 5/200
20/20 - 0s - 2ms/step - loss: 0.3965
Epoch 6/200
20/20 - 0s - 2ms/step - loss: 0.3935
Epoch 7/200
20/20 - 0s - 2ms/step - loss: 0.3915
Epoch 8/200
20/20 - 0s - 2ms/step - loss: 0.3894
Epoch 9/200
20/20 - 0s - 2ms/step - loss: 0.3871
Epoch 10/200
20/20 - 0s - 2ms/step - loss: 0.3878
Epoch 11/200
20/20 - 0s - 2ms/step - loss: 0.3881
Epoch 12/200
20/20 - 0s - 2ms/step - loss: 0.3877
Epoch 13/200
20/20 - 0s - 2ms/step - loss: 0.3860
Epoch 14/200
20/20 - 0s - 2ms/step - loss: 0.3825
Epoch 15/200
20/20 - 0s - 2ms/step - loss: 0.3809
Epoch 16/200
20/20 - 0s - 2ms/step - loss: 0.3822
Epoch 17

Training sequential_45...  
 None
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step


In [393]:
th_list=[0.3,0.4,0.5,0.6,0.7,0.8]
#th_list = [np.mean(yhat_train),0.5]
accuracy_list_train = []
accuracy_list_val = []
for t in th_list:
    print(f"Checking accuravy with threshold setting of {t}")
    predictions_train = np.where(yhat_train >= t, 1, 0) ## The predicted values for the test dataset
    predictions_val = np.where(yhat_cv >= t, 1, 0) ## The predicted values for the test dataset
    accuracy_train = accuracy_score(predictions_train,y_train)
    accuracy_val = accuracy_score(predictions_val,y_cv)
    accuracy_list_train.append(accuracy_train)
    accuracy_list_val.append(accuracy_val)
    print (f"Training Accuracy:{accuracy_train}")
    print (f"CV Accuracy:{accuracy_val}")
th_best=th_list[np.argmax(accuracy_list_val)]
print(th_best)

Checking accuravy with threshold setting of 0.3
Training Accuracy:0.8298555377207063
CV Accuracy:0.7723880597014925
Checking accuravy with threshold setting of 0.4
Training Accuracy:0.85553772070626
CV Accuracy:0.7835820895522388
Checking accuravy with threshold setting of 0.5
Training Accuracy:0.8603531300160514
CV Accuracy:0.7947761194029851
Checking accuravy with threshold setting of 0.6
Training Accuracy:0.8571428571428571
CV Accuracy:0.8022388059701493
Checking accuravy with threshold setting of 0.7
Training Accuracy:0.8539325842696629
CV Accuracy:0.8246268656716418
Checking accuravy with threshold setting of 0.8
Training Accuracy:0.8491171749598716
CV Accuracy:0.8208955223880597
0.7


In [380]:
print(yhat_cv)

[[1.0000000e+00]
 [1.3204116e-01]
 [6.0880643e-01]
 [5.0956696e-01]
 [1.3204116e-01]
 [1.1757057e-01]
 [9.9995035e-01]
 [1.1757057e-01]
 [1.3204116e-01]
 [9.0128101e-02]
 [8.8856500e-01]
 [1.3204116e-01]
 [1.3204116e-01]
 [6.2888721e-05]
 [1.0000000e+00]
 [7.3866160e-03]
 [1.3204116e-01]
 [1.0000000e+00]
 [1.3204116e-01]
 [1.1757057e-01]
 [2.6972246e-01]
 [9.9875742e-01]
 [9.9992180e-01]
 [1.3204116e-01]
 [9.0128101e-02]
 [1.4811343e-01]
 [1.3204116e-01]
 [4.5941602e-03]
 [1.4811343e-01]
 [1.0946274e-01]
 [9.9171072e-01]
 [9.9999774e-01]
 [1.3204116e-01]
 [9.9996948e-01]
 [1.3204116e-01]
 [4.8018295e-02]
 [4.3811932e-01]
 [3.4472268e-04]
 [8.3198124e-01]
 [1.0000000e+00]
 [1.3204116e-01]
 [5.8208448e-01]
 [1.0946274e-01]
 [6.4817220e-01]
 [9.9600554e-01]
 [4.3811932e-01]
 [9.1461605e-01]
 [1.0000000e+00]
 [1.0000000e+00]
 [4.6386698e-01]
 [1.9499141e-01]
 [3.3184245e-01]
 [9.9996191e-01]
 [1.3204116e-01]
 [3.3184245e-01]
 [9.9999875e-01]
 [9.0128101e-02]
 [7.2013486e-06]
 [8.5056919e-0

In [384]:

test_data=pd.read_csv(TEST_DATA)
X_test = preprocess(test_data)
X_test = np.asarray(X_test[features]).astype(np.float32)
yhat_test = model.predict(X_test)
predictions_train = np.where(yhat_test >= th_best, 1, 0) ## The predicted values for the test dataset
test_data["Survived_pred"]=predictions_train
test_data.head(20)
target_data=pd.read_csv(TARGET)
display(target_data.head())
tmp_acc = pd.merge(test_data, target_data, how="right", on='PassengerId')
tmp_acc.head()
acc_test=accuracy_score(tmp_acc["Survived_pred"],tmp_acc["Survived"])
print(f"Test Accuracy score: {acc_test:.4f}")
test_data.to_csv(PRED_DATA,columns=["PassengerId","Survived_pred"],index=False)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


Test Accuracy score: 0.8489
