# Logistic regression model for select the survivors of the titanic

In [1]:
import pandas as pd 

import numpy as np

import tensorflow as tf


In [2]:
dataset_path='../Media/titanicdatacleaned.xlsx'

dataset=pd.read_excel(dataset_path)

dataset.head(10)

Unnamed: 0,pclass,survived,name,sex,age,ticket,fare,cabin,embarked,boat
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,24160,211.3375,B5,S,2
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,113781,151.55,C22 C26,S,11
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,113781,151.55,C22 C26,S,1000
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,113781,151.55,C22 C26,S,1001
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,113781,151.55,C22 C26,S,1002
5,1,1,"Anderson, Mr. Harry",male,48.0,19952,26.55,E12,S,3
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,13502,77.9583,D7,S,10
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,112050,0.0,A36,S,1003
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,11769,51.4792,C101,S,D
9,1,0,"Astor, Col. John Jacob",male,47.0,PC 17757,227.525,C62 C64,C,1005


In [3]:
dataset = pd.get_dummies(dataset, columns=['sex'])


def fill_boolean_encoding(column):

    dataset[column] = dataset[column].astype(int)


#Second option
# def fill_boolean_encoding(column):
#     for i in range(dataset.shape[0]):
#         if dataset.loc[i, column] == True:
#             dataset.loc[i, column] = 1
#         else:
#             dataset.loc[i, column] = 0


fill_boolean_encoding('sex_female')

fill_boolean_encoding('sex_male')


In [4]:
dataset.drop(columns=['sex_male'], inplace=True)

dataset=dataset.rename(columns={'sex_female':'gender'})

In [5]:
dataset

Unnamed: 0,pclass,survived,name,age,ticket,fare,cabin,embarked,boat,gender
0,1,1,"Allen, Miss. Elisabeth Walton",29.00,24160,211.3375,B5,S,2,1
1,1,1,"Allison, Master. Hudson Trevor",0.92,113781,151.5500,C22 C26,S,11,0
2,1,0,"Allison, Miss. Helen Loraine",2.00,113781,151.5500,C22 C26,S,1000,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.00,113781,151.5500,C22 C26,S,1001,0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.00,113781,151.5500,C22 C26,S,1002,1
...,...,...,...,...,...,...,...,...,...,...
287,3,1,"Sandstrom, Miss. Marguerite Rut",4.00,PP 9549,16.7000,G6,S,13,1
288,3,0,"Soholt, Mr. Peter Andreas Lauritz Andersen",19.00,348124,7.6500,F G73,S,1749,0
289,3,0,"Strom, Miss. Telma Matilda",2.00,347054,10.4625,G6,S,1759,1
290,3,0,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",29.00,347054,10.4625,G6,S,1760,1


The gender column have 2 options , for the female 1 and the male 0 

In [6]:
dataset['embarked'].value_counts()

embarked_values={'S':1,"C":2,"Q":3}

embarked_values = {'S': 1, 'C': 2, 'Q': 3}

def fill_embarked(column):
    dataset[column] = dataset[column].map(embarked_values)

    dataset[column].fillna(1, inplace=True)
    
    dataset[column]=dataset[column].astype(int)

fill_embarked('embarked')

dataset

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(1, inplace=True)


Unnamed: 0,pclass,survived,name,age,ticket,fare,cabin,embarked,boat,gender
0,1,1,"Allen, Miss. Elisabeth Walton",29.00,24160,211.3375,B5,1,2,1
1,1,1,"Allison, Master. Hudson Trevor",0.92,113781,151.5500,C22 C26,1,11,0
2,1,0,"Allison, Miss. Helen Loraine",2.00,113781,151.5500,C22 C26,1,1000,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.00,113781,151.5500,C22 C26,1,1001,0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.00,113781,151.5500,C22 C26,1,1002,1
...,...,...,...,...,...,...,...,...,...,...
287,3,1,"Sandstrom, Miss. Marguerite Rut",4.00,PP 9549,16.7000,G6,1,13,1
288,3,0,"Soholt, Mr. Peter Andreas Lauritz Andersen",19.00,348124,7.6500,F G73,1,1749,0
289,3,0,"Strom, Miss. Telma Matilda",2.00,347054,10.4625,G6,1,1759,1
290,3,0,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",29.00,347054,10.4625,G6,1,1760,1


In [7]:
X = dataset[['age', 'fare', 'gender', 'embarked']].values

Y=dataset['survived'].values

In [8]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=42,shuffle=True,train_size=0.7)

num_features = x_train.shape[1]

num_labels = 1

In [9]:
x_train=tf.constant(x_train,dtype=tf.float32)

x_test=tf.constant(x_test,dtype=tf.float32)


y_train=tf.constant(y_train,dtype=tf.float32)

y_test=tf.constant(y_test,dtype=tf.float32)

In [10]:
weight = tf.Variable(tf.random.normal([num_features, num_labels]))
bias = tf.Variable(tf.zeros([num_labels]))

def logistic_regression(x):
    z = tf.matmul(x, weight) + bias
    return tf.nn.sigmoid(z)

# Ensure that y_train and y_test are (batch_size, 1)
y_train = tf.reshape(y_train, [-1, 1])
y_test = tf.reshape(y_test, [-1, 1])

num_epochs=500


# Optimization (RMSprop)

In [11]:

def accuracy(y_true, y_pred):
    predicted = tf.round(y_pred)
    acc = tf.reduce_mean(tf.cast(tf.equal(predicted, y_true), tf.float32))
    return acc

mean = tf.reduce_mean(x_train, axis=0)
std = tf.math.reduce_std(x_train, axis=0)
x_train_norm = (x_train - mean) / std
x_test_norm = (x_test - mean) / std

learning_rate = 0.001  
optimizer = tf.keras.optimizers.RMSprop(learning_rate, decay=1e-6, momentum=0.9)
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)

def run_optimization(x, y):
    with tf.GradientTape() as g:
        pred = logistic_regression(x)
        loss = loss_object(y, pred)
    
    gradients = g.gradient(loss, [weight, bias])
        # Add gradient clipping to prevent explosion  gradients
    optimizer.apply_gradients(zip(gradients, [weight, bias]))
    
    return loss, pred



# Train model

In [12]:
def train_model(x_train, x_test, y_train, y_test, num_epochs):
    # Normalize the data
    mean = tf.reduce_mean(x_train, axis=0)
    std = tf.math.reduce_std(x_train, axis=0)
    x_train_norm = (x_train - mean) / std
    x_test_norm = (x_test - mean) / std

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    for epoch in range(num_epochs):
        # Optimize the model
        train_loss, train_pred = run_optimization(x_train_norm, y_train)
        train_acc = accuracy(y_train, train_pred)

        val_pred = logistic_regression(x_test_norm)
        val_loss = loss_object(y_test, val_pred)
        val_acc = accuracy(y_test, val_pred)

        # 100 epochs analysis
        if epoch % 100 == 0:
            print(f"Weight norm: {tf.norm(weight).numpy():.4f}")
            print(f"Bias value: {bias.numpy()[0]:.4f}")

        train_losses.append(float(train_loss))
        val_losses.append(float(val_loss))
        train_accuracies.append(float(train_acc))
        val_accuracies.append(float(val_acc))

        if epoch % 10 == 0:  # Reducir frecuencia de impresión
            print(f"Epoch {epoch+1}/{num_epochs} - "
                  f"loss: {train_loss:.4f} - acc: {train_acc:.4f} - "
                  f"val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")

    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_accuracies': train_accuracies,
        'val_accuracies': val_accuracies
    }

resuls = train_model(x_train, x_test, y_train, y_test, num_epochs)

Weight norm: 1.4448
Bias value: 0.0032
Epoch 1/500 - loss: 1.0065 - acc: 0.4657 - val_loss: 0.8511 - val_acc: 0.5227
Epoch 11/500 - loss: 0.9300 - acc: 0.4657 - val_loss: 0.7968 - val_acc: 0.5114
Epoch 21/500 - loss: 0.8380 - acc: 0.4853 - val_loss: 0.7402 - val_acc: 0.5455
Epoch 31/500 - loss: 0.7613 - acc: 0.4902 - val_loss: 0.6959 - val_acc: 0.5455
Epoch 41/500 - loss: 0.6997 - acc: 0.5343 - val_loss: 0.6620 - val_acc: 0.5568
Epoch 51/500 - loss: 0.6499 - acc: 0.5833 - val_loss: 0.6351 - val_acc: 0.6136
Epoch 61/500 - loss: 0.6089 - acc: 0.6225 - val_loss: 0.6115 - val_acc: 0.6477
Epoch 71/500 - loss: 0.5740 - acc: 0.6569 - val_loss: 0.5872 - val_acc: 0.6705
Epoch 81/500 - loss: 0.5438 - acc: 0.6667 - val_loss: 0.5618 - val_acc: 0.6932
Epoch 91/500 - loss: 0.5182 - acc: 0.7059 - val_loss: 0.5401 - val_acc: 0.6818
Weight norm: 0.9741
Bias value: 0.7690
Epoch 101/500 - loss: 0.4967 - acc: 0.7500 - val_loss: 0.5248 - val_acc: 0.7045
Epoch 111/500 - loss: 0.4787 - acc: 0.7647 - val_loss

# Cross Validation

In [13]:
from sklearn.model_selection import cross_val_score, KFold

def cross_validate_model(X, Y, num_epochs, k_folds=5):
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nEntrenando fold {fold+1}/{k_folds}")
        
        # Prepare data
        x_train_fold = tf.gather(X, train_idx)
        y_train_fold = tf.gather(Y, train_idx)
        x_val_fold = tf.gather(X, val_idx)
        y_val_fold = tf.gather(Y, val_idx)
        
        # Reset the bias and weight
        global weight, bias
        weight.assign(tf.random.normal([num_features, num_labels]))
        bias.assign(tf.zeros([num_labels]))
        
        # Train the model
        results = train_model(x_train_fold, x_val_fold, 
                            y_train_fold, y_val_fold, 
                            num_epochs)
        
        fold_scores.append(results['val_accuracies'][-1])
    
    print("\nCross validation results")
    print(f"Accuracy promedio: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")
    
    return fold_scores

scores = cross_validate_model(x_train, y_train, num_epochs=500)


Entrenando fold 1/5
Weight norm: 1.1611
Bias value: 0.0032
Epoch 1/500 - loss: 1.0395 - acc: 0.3865 - val_loss: 1.1188 - val_acc: 0.2683
Epoch 11/500 - loss: 0.9673 - acc: 0.3926 - val_loss: 1.0007 - val_acc: 0.2927
Epoch 21/500 - loss: 0.8820 - acc: 0.4172 - val_loss: 0.8744 - val_acc: 0.4146
Epoch 31/500 - loss: 0.8100 - acc: 0.4663 - val_loss: 0.7731 - val_acc: 0.5366
Epoch 41/500 - loss: 0.7496 - acc: 0.4969 - val_loss: 0.6933 - val_acc: 0.5854
Epoch 51/500 - loss: 0.6993 - acc: 0.5644 - val_loss: 0.6265 - val_acc: 0.7073
Epoch 61/500 - loss: 0.6571 - acc: 0.6196 - val_loss: 0.5676 - val_acc: 0.7561
Epoch 71/500 - loss: 0.6215 - acc: 0.6196 - val_loss: 0.5200 - val_acc: 0.8049
Epoch 81/500 - loss: 0.5911 - acc: 0.6442 - val_loss: 0.4817 - val_acc: 0.8293
Epoch 91/500 - loss: 0.5648 - acc: 0.6380 - val_loss: 0.4515 - val_acc: 0.8049
Weight norm: 0.6021
Bias value: 0.5874
Epoch 101/500 - loss: 0.5426 - acc: 0.6810 - val_loss: 0.4251 - val_acc: 0.8537
Epoch 111/500 - loss: 0.5242 - a

# Save the model


In [None]:
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(1, activation='sigmoid', input_shape=(num_features,))
])

model.layers[0].set_weights([weight.numpy(), bias.numpy()])

model.save('titanic_model.h5')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
