<h1 style="text-align: center">Kaggle Titanic Dataset</h1>
<h3 style="text-align: center">Predicting passenger survival</h3>


In [2411]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import csv as csv
from IPython.display import clear_output
import math as math
import re

ALPHA = 0.01
TRAIN_PERCENTAGE = 0.85
PREDICTION_THRESHOLD = 0.50
EPOCHS = 20
DATA_MULTIPLICATION = 2

# ESPILON = 0.00005
# MAX_ITERATIONS = 100000
# REGULARIZATION = 0.75

TRAIN_DATA_FILE = "train.csv"
TEST_DATA_FILE = "test.csv"
DATA_OUTPUT_NAME = "answers.csv"

UNKNOWN_AGE_INSERT = 0 # This is the assumption for the passenger if the age is unknown.

## Fetching and Organizing Data

#### Raw Passenger Data
Passenger Data is returned as a list of strings, in the order of the following:
*Passenger Id, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked*

#### Parsed Passenger Data
Passenger Data needs to be parsed and converted into numbers for the machine learning algorithms to take over. The following will be the parsing scheme. This will also be the order as well

1. **Passenger Id**: This will stay the same, nor will be converted into an int.
2. **Survived**: 0 for died, 1 for survived
3. **Pclass**: 1 for upper class, 2 for middle class, 3 for lower class
4. **Sex**: 0 for Male, 1 for Female
5. **Age**: Float from the string, if it is an empty string then going to default to age of 30 (arbitrary).
6. **SibSp**: Sibling/Spouse on board, staying the same.
7. **Parch**: # of Parent/Children on board, staying the same.
8. **Fare**: Convert to a float

In [2412]:
def FetchPassengerData(fileName: str) -> list:
    """Gets the information and then returns a list of the passengers in the format shown below (as strings)

    Args:
        fileName (str): The csv file to open

    Returns:
        list: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked 
    """
    result = []
    with open(fileName, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            result.append(row)
    return result

def ParseSurvived(survived: str) -> int:
    try:
        return int(float(survived))
    except Exception as ex:
        print(str(ex))
        return 0


def ParseAge(age: str) -> int:
    try:
        isAdult = int(age) >= 18
        if isAdult:
            Adult = 1
            Child = 0
        else:
            Adult = 0
            Child = 1

        return float(age) / 100, Adult, Child
    except ValueError:
        return UNKNOWN_AGE_INSERT / 100, 0, 0

def DetermineTitle(name: str) -> int:
    Mr = re.findall("Mr\.", name)
    Miss = re.findall("Miss\.", name)
    Mrs = re.findall("Mrs\.", name)

    if len(Mr) > 0:
        return 3
    elif len(Mrs) > 0:
        return 2
    elif len(Miss) > 0:
        return 1
    else:
        return 0
    
def ParseFare(fare, pclass):
    try:
        if fare == '':
            if pclass == 1:
                return 70 / 100
            elif pclass == 2:
                return 25 / 100
            else:
                return 7.25 / 100
        else:
            return float(fare)
    except ValueError:
        print("Parseing fare went wrong.")
        return 7.25 / 100
    except Exception as ex:
        print(f"ParseFare error: {ex}")
        return 7.25 / 100

def ParseCabin(cabin):
    for a in cabin:
        if a == 'A':
            return 6
        elif a == 'B':
            return 5
        elif a == 'C':
            return 4
        elif a == 'D':
            return 3
        elif a == 'E':
            return 2
        elif a == 'F':
            return 1
        
    return 0

def ParseEmbarked(embarked):
    if embarked == 'C':
        return 1
    elif embarked == 'Q':
        return 2
    elif embarked == 'S':
        return 3
    else:
        return 0
    
def IsMother(Title, Adult, Parch):
    return int((Title == 2 or Title == 0) and (Adult == 1) and (Parch >= 1))
    

def ParsePassengerData(passenger: list) -> list:
    """Takes passenger data and converts all the types to be usable for the machine learning. See the paragraph above for the information.
     

    Args:
        passenger (list): PassengerId, Survived (optional), Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked

    Returns:
        list: PassengerId, Survived, Pclass, Sex, Age, SibSp, Parch, Fare
    """
    try:
        # Passenger surviving wasn't added. Add a '0' for survivied (mirroring test data)
        if len(passenger) < 12:
            passenger.insert(1, '0')
        
        if len(passenger) < 12:
            print("List is less than 12 elements still.")
            print(f"Passenger: {passenger}")
            raise Exception(ValueError)
        
        PassengerId = float(passenger[0])
        Survived = float(ParseSurvived(passenger[1]))
        Pclass = float(passenger[2])
        #  Name is irrelevant (passenger[3])
        Sex = float(passenger[4] != 'female')
        Age, Adult, Child = ParseAge(passenger[5])
        SibSp = float(passenger[6])
        Parch = float(passenger[7])
        # Ticket is irrelevant (passenger[8])
        Fare = ParseFare(passenger[9], Pclass)
        # Cabin will be relevant -- going to eventually make up a scheme and numbering system based general cabin location.
        Cabin = ParseCabin(passenger[10])
        # I doubt embarked will matter, might be worth adding later.
        Embarked = ParseEmbarked(passenger[11])

        # Engineered
        FamilySize = SibSp + Parch
        Title = DetermineTitle(name=passenger[3])
        Mother = IsMother(Title, Adult, Parch)
    except ValueError:
        print("Value Error")
        print(f"Passenger: {passenger}")
        raise Exception(ValueError)

    return np.array([PassengerId, Survived, 
            Sex,
            Pclass, 
            Title,  
            Age, 
            Adult,
            Child,
            SibSp,
            Parch,
            FamilySize, 
            Fare,
            Cabin,
            Embarked,
            Mother
            ])


In [2413]:
Data = [ParsePassengerData(passenger=passenger) for passenger in FetchPassengerData(TRAIN_DATA_FILE)[1:]]
Males = 0
Females = 0
for passenger in Data:
    if passenger[2] == 0:
        Males += 1
    else:
        Females += 1

print(f"Males: {Males}\nFemales: {Females}")

Males: 314
Females: 577


## Training Functions

In [2414]:
def GetTrainData(file):
    RawData = FetchPassengerData(file)
    AllData = []
    for passenger in RawData[1:]:
        AllData.append(ParsePassengerData(passenger))
    m = len(AllData)
    temp_x_total = [passenger[2:] for passenger in AllData]
    x_total = []
    for passenger in temp_x_total:
        temp_pass = []
        for attribute in passenger:
            temp_pass.append(attribute)
        for a in range(len(passenger)):
            for b in range(len(passenger) - a):
                temp_pass.append(passenger[a] * passenger[b])
        x_total.append(np.array(temp_pass))

    print(f"Training Examples: {len(x_total)}")
    print(f"Number of attributes: {len(x_total[0])}")
    y_total = [int(passenger[1]) for passenger in AllData]
    cutoff = int(m * TRAIN_PERCENTAGE)
    return (np.array(x_total[:cutoff]), np.array(y_total[:cutoff])), (np.array(x_total[cutoff:]), np.array(y_total[cutoff:]))

(x_train, y_train), (x_test, y_test) = GetTrainData(TRAIN_DATA_FILE)



Training Examples: 891
Number of attributes: 104


In [2415]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(units=256, activation='relu'),
  tf.keras.layers.Dense(units=64, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

In [2416]:
predictions = model(x_train).numpy()
predictions

array([[0.867941  ],
       [0.81182146],
       [0.6842649 ],
       [0.7963588 ],
       [0.782959  ],
       [0.7175006 ],
       [0.9935772 ],
       [0.98378664],
       [0.92675745],
       [0.5920888 ],
       [0.8279751 ],
       [0.4808288 ],
       [0.77127665],
       [0.9999542 ],
       [0.65778357],
       [0.6784017 ],
       [0.9853947 ],
       [0.85624963],
       [0.79683596],
       [0.46574062],
       [0.97771907],
       [0.67735523],
       [0.5246997 ],
       [0.90796584],
       [0.97900456],
       [0.999069  ],
       [0.59125245],
       [1.        ],
       [0.585362  ],
       [0.7510014 ],
       [0.55779976],
       [0.8149853 ],
       [0.5904128 ],
       [0.8269587 ],
       [0.99933636],
       [0.9974495 ],
       [0.5914162 ],
       [0.7719673 ],
       [0.80424875],
       [0.5958152 ],
       [0.8184905 ],
       [0.7867819 ],
       [0.6227093 ],
       [0.78616625],
       [0.53923726],
       [0.7537072 ],
       [0.94555104],
       [0.590

In [2417]:
tf.nn.softmax(predictions).numpy()

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [2418]:
loss_fn = tf.keras.losses.BinaryCrossentropy()

In [2419]:
loss_fn(y_train, predictions).numpy()

1.69808

In [2420]:
optimizer = tf.keras.optimizers.legacy.Adam()            
optimizer.learning_rate.assign(ALPHA)
model.compile(optimizer=optimizer,
              loss=loss_fn,
              metrics=['accuracy'])

In [2421]:
History = model.fit(x_train, y_train, epochs=EPOCHS)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [2422]:
model.evaluate(x_test,  y_test, verbose=2)

5/5 - 0s - loss: 0.3428 - accuracy: 0.8582 - 66ms/epoch - 13ms/step


[0.34275439381599426, 0.858208954334259]

In [2423]:
print(History.params)

{'verbose': 1, 'epochs': 20, 'steps': 24}


In [2424]:
def MakePredictions(model: tf.keras.models.Sequential, file):
    RawData = FetchPassengerData(file)
    Data = []
    for passenger in RawData[1:]:
        Data.append(ParsePassengerData(passenger))

    PredictionData = np.array([passenger[2:] for passenger in Data])

    temp_pred_total = [passenger[2:] for passenger in Data]
    pred_total = []
    for passenger in temp_pred_total:
        temp_pass = []
        for attribute in passenger:
            temp_pass.append(attribute)
        for a in range(len(passenger)):
            for b in range(len(passenger) - a):
                temp_pass.append(passenger[a] * passenger[b])
        pred_total.append(np.array(temp_pass))

    print(f"Number of Test Examples: {len(pred_total)}")
    print(f"Number of Attributes: {len(pred_total[0])}")

    rawPred = model.predict(np.array(pred_total))
    
    predictions = []
    for i in range(len(Data)):
        id = Data[i][0]
        pred = rawPred[i]
        survived = pred[0] > PREDICTION_THRESHOLD
        predictions.append((int(id), int(survived)))
    return predictions

predictions = MakePredictions(model=model, file=TEST_DATA_FILE)
print(predictions)

Number of Test Examples: 418
Number of Attributes: 104
[(892, 0), (893, 0), (894, 0), (895, 0), (896, 0), (897, 0), (898, 1), (899, 0), (900, 1), (901, 0), (902, 0), (903, 0), (904, 1), (905, 0), (906, 1), (907, 1), (908, 0), (909, 0), (910, 0), (911, 1), (912, 0), (913, 1), (914, 1), (915, 0), (916, 1), (917, 0), (918, 1), (919, 0), (920, 0), (921, 0), (922, 0), (923, 0), (924, 0), (925, 1), (926, 0), (927, 0), (928, 1), (929, 1), (930, 0), (931, 0), (932, 0), (933, 0), (934, 0), (935, 1), (936, 1), (937, 0), (938, 0), (939, 0), (940, 1), (941, 1), (942, 0), (943, 0), (944, 1), (945, 1), (946, 0), (947, 0), (948, 0), (949, 0), (950, 0), (951, 1), (952, 0), (953, 0), (954, 0), (955, 1), (956, 1), (957, 1), (958, 1), (959, 0), (960, 0), (961, 1), (962, 1), (963, 0), (964, 1), (965, 0), (966, 1), (967, 0), (968, 0), (969, 1), (970, 0), (971, 1), (972, 1), (973, 0), (974, 0), (975, 0), (976, 0), (977, 0), (978, 1), (979, 1), (980, 1), (981, 1), (982, 1), (983, 0), (984, 1), (985, 0), (986

In [2425]:
def WriteAnswersCSVFile(fname, PredictionList):

    with open(fname, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['PassengerId', 'Survived'])

        for pred in PredictionList:
            writer.writerow([str(pred[0]), str(pred[1])])

WriteAnswersCSVFile('submission.csv', predictions)