In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Treat the session as a Kaggle run when the KAGGLE_WORKING_DIR variable is set
# or the current working directory path contains "/kaggle".
is_kaggle = "KAGGLE_WORKING_DIR" in os.environ or "/kaggle" in os.getcwd()
print("Running on Kaggle:", is_kaggle)

# Directory prefix (ending in a slash) that the CSV file names are appended to.
data_path = "/kaggle/input/titanic/" if is_kaggle else os.getcwd() + "/"

Running on Kaggle: False


Based on fast.ai chapter 5 we'll now iterate on the numpy-titanic notebook by using pytorch and applying some best practices from that chapter

## Prepare Data set

In [2]:
# Load train.csv from the environment-specific data directory chosen above.
df = pd.read_csv(data_path + "train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Handling na values
Linear regression needs numerical values. Missing (n/a) values are not numerical, so we should check whether our data set contains any.

In [3]:
# Count the missing entries in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

We should avoid removing columns or rows. Even the absence of data can sometimes indicate a pattern.

There are many ways to substitute n/a values, the easiest of which is to replace them with the mode (the most commonly occurring value). This is a good starting point: the method of substitution usually doesn't have a large impact on our results, so the mode is enough to get an MVP up and running that we can iterate on.

In [4]:
# DataFrame.mode() can return several rows when values tie; row 0 holds the
# first mode of every column.
modes = df.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [5]:
# Fill each column's gaps with that column's mode. Rebinding `df` (rather than
# using inplace=True) makes the data flow explicit and avoids mutating the
# frame object that earlier cells displayed.
df = df.fillna(modes)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [6]:
def substitue_na_with_modes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values replaced by each column's mode.

    Args:
        df: Frame that may contain missing (NaN / None) entries.

    Returns:
        A new DataFrame; ``df`` itself is left unmodified. When no mode can be
        computed (no rows, or no non-null values at all) an unchanged copy is
        returned.
    """
    mode_rows = df.mode()
    if mode_rows.empty:
        # Nothing to fill with, and ``.iloc[0]`` would raise IndexError here.
        return df.copy()
    # mode() may return several rows on ties; row 0 is the first mode per column.
    return df.fillna(mode_rows.iloc[0])

### Converting Category Data to Binary Categorical Values


We can view our non-numeric or numeric data using the `describe` function.


In [7]:
# Summarise only the object-dtype (non-numeric) columns.
df.describe(include=object)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,891,891
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,691,646


Sex and Embarked only have 2, and 3 unique values respectively. It's safe to say these are categorical values.

We should also check if any of our numbers are categorical

In [8]:
# Summary statistics restricted to the numeric columns.
df.describe(include=(np.number))

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,28.56697,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.199572,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


We can see from its quartile values that Pclass is likely also categorical despite being numeric, as its only values are 1, 2 or 3. We can confirm this by looking at the [data dictionary](https://www.kaggle.com/competitions/titanic/data) for the Kaggle competition and by checking the column's unique values with pandas.


In [9]:
# List the distinct values that occur in Pclass.
df.Pclass.unique()

array([3, 1, 2])


Sex, passenger class and embarkation port are not measurable quantities, so we should convert them to Boolean indicator columns that coefficients can be applied to. In the previous notebook we did this manually; however, pandas can do this for us using `pd.get_dummies()`.

In [10]:
# One-hot encode the categorical columns: get_dummies replaces each listed
# column with one indicator column per distinct value.
# NOTE: the listed columns no longer exist afterwards, so re-running this cell
# on the already-encoded `df` fails.
categorical_feature_names = ['Sex', 'Embarked', 'Pclass']
df = pd.get_dummies(df, columns=categorical_feature_names)
df.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Pclass_1', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [13]:
# Preview only the indicator columns created by the one-hot encoding step.
dummy_column_names = ['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Pclass_1', 'Pclass_2', 'Pclass_3']
df[dummy_column_names].head()

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,False,True,False,False,True,False,False,True
1,True,False,True,False,False,True,False,False
2,True,False,False,False,True,False,False,True
3,True,False,False,False,True,True,False,False
4,False,True,False,False,True,False,False,True


### Converting numbers to fractional values
#### Age
Larger numbers would have too great an impact on our calculations, so we normalize them by dividing by their maximum value so that they fall between 0 and 1.

In [None]:
def convert_numeric_column_to_decimal(old_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Scale a numeric column by its maximum so the largest value becomes 1.

    Args:
        old_df: Source frame; it is not modified.
        column_name: Name of the numeric column to rescale.

    Returns:
        A copy of ``old_df`` in which ``column_name`` has been divided by its
        maximum. If the maximum is 0 or NaN the column is returned unchanged,
        because dividing would only produce NaN/inf values.
    """
    new_df = old_df.copy()
    max_numeric_value = new_df[column_name].max()
    if pd.isna(max_numeric_value) or max_numeric_value == 0:
        return new_df
    # Vectorised division replaces the per-element Python lambda.
    new_df[column_name] = new_df[column_name] / max_numeric_value
    return new_df
    
# NOTE(review): `training_dataframe` is not assigned in any cell shown above
# (the loaded frame is named `df`) — confirm where it comes from before a
# fresh Restart & Run All.
training_dataframe = convert_numeric_column_to_decimal(training_dataframe, "Age")
training_dataframe.head(10)

#### Fare
The `Fare` column has lots of small values with the occasional very large value. Uniform normalization by the max value isn't ideal for such a distribution, as the variation between the lower numbers would be lost. Instead we can first apply a log function (log10 here) to bring the numbers down to a reasonable range. We use `log10(x+1)` so that a fare of 0 maps to 0, because `log10(0)` is undefined (it tends to negative infinity).

In [None]:
import math
def convert_numeric_column_to_decimal_with_logarithm(old_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Log-compress a numeric column, then scale it by its maximum.

    Args:
        old_df: Source frame; it is not modified.
        column_name: Name of the numeric column to transform.

    Returns:
        A new frame in which ``column_name`` holds ``log10(x + 1)`` for positive
        values (0 otherwise), passed through
        ``convert_numeric_column_to_decimal``.
    """
    def _log_scale(value):
        # Anything that does not compare greater than 0 (including NaN) maps to 0.
        if value > 0:
            return math.log10(value + 1)
        return 0

    log_scaled_df = old_df.copy()
    log_scaled_df[column_name] = log_scaled_df[column_name].apply(_log_scale)
    return convert_numeric_column_to_decimal(log_scaled_df, column_name)

# Log-compress and then max-scale the Fare column; the returned frame replaces
# the previous `training_dataframe`.
training_dataframe = convert_numeric_column_to_decimal_with_logarithm(training_dataframe, "Fare")
training_dataframe.head(10)

## Linear Regression
### Add a constant value
A linear function needs a constant, this will be needed for the maths so we should add a column full of ones

In [None]:
# Column of ones: the coefficient learned for it acts as the constant term.
training_dataframe["Constant"] = 1
training_dataframe.head(10)

### Prepare initial linear co-efficient values
We want to set each of our parameter values to a random number between 0 and 1. The `Survived` column is not a parameter but our desired result/output, so we don't include it.

In [None]:
# Feature frame: every column except the "Survived" target.
input_df = training_dataframe.drop(columns="Survived")
# One uniform random starting coefficient in [0, 1) per feature column.
linear_parameters = np.random.rand(len(input_df.columns)).tolist()
linear_parameters

### Calculate the linear function of our parameters multiplied by our random Coefficients

In [None]:
def calculate_linear_result(inputs=None, parameters=None) -> np.ndarray:
    """Return the linear combination of every row with the coefficient vector.

    Args:
        inputs: Feature DataFrame, one row per sample. Defaults to the
            notebook-level ``input_df`` when omitted.
        parameters: One coefficient per feature column. Defaults to the
            notebook-level ``linear_parameters`` when omitted.

    Returns:
        1-D array with one value per row of ``inputs``.
    """
    if inputs is None:
        inputs = input_df
    if parameters is None:
        parameters = linear_parameters
    # A single matrix-vector product replaces the row-by-row Python ``apply``.
    return inputs.dot(parameters).to_numpy()

# Store the prediction made with the untrained coefficients next to the data
# so it can be inspected row by row.
training_dataframe["Initial Linear Result"] = input_df.apply(lambda row: row.dot(linear_parameters), axis=1)
training_dataframe.head(10)

### Gradient Descent

In [None]:
def optimize_weights(inputs: pd.DataFrame, target_variables: np.ndarray, parameters, learning_rate: float=0.01, epochs: int=1000) -> np.ndarray:
    """Fit linear coefficients by batch gradient descent on the mean squared error.

    Args:
        inputs: Feature frame, one row per sample and one column per coefficient.
        target_variables: Expected output for each row of ``inputs``.
        parameters: Starting coefficients (list or array). The caller's object
            is never modified.
        learning_rate: Step size applied to the gradient each epoch.
        epochs: Number of full passes over the data.

    Returns:
        The optimised coefficients as a new float array.
    """
    # Convert once, outside the loop; a float dtype keeps the maths numeric even
    # when the frame mixes bool and numeric columns.
    feature_matrix = inputs.to_numpy(dtype=float)
    targets = np.asarray(target_variables, dtype=float)
    # Private copy, so the in-place update below cannot mutate the argument.
    weights = np.array(parameters, dtype=float)
    sample_count = len(targets)

    for current_epoch in range(epochs):
        # Residuals of the current prediction
        errors = feature_matrix @ weights - targets

        if current_epoch % 100 == 0:  # Print every 100th value
            print((errors ** 2).mean())

        # d(MSE)/d(weights) = 2/n * X^T (Xw - y)
        gradient = feature_matrix.T @ errors * 2 / sample_count

        # Update parameters
        weights -= learning_rate * gradient

    # Report the error of the weights actually returned; computing it here also
    # keeps epochs=0 from reading a variable that was never assigned.
    mean_square_error = ((feature_matrix @ weights - targets) ** 2).mean()
    print(f"Optimized weights: {weights}")
    print(f"Final error: {mean_square_error}")
    return weights
    
# Train on the full feature frame; the returned coefficients replace the random
# starting values held in `linear_parameters`.
linear_parameters = optimize_weights(inputs=input_df, target_variables=training_dataframe["Survived"].to_numpy(), parameters=linear_parameters)

## Neural Nets
The calculation above was a linear regression as we only use one set of parameters.
Here we'll use two sets of parameters, apply a RELU (Rectified Linear Unit) function and add them together to give us a loss. A RELU function is non-linear and simply replaces every negative number with a 0.

The RELU is needed because adding together two linear functions just gives us another linear function, which doesn't give us any more resolution for our calculation. Combining each linear layer with a non-linear RELU lets each linear function keep its own contribution, increasing our algorithm's accuracy.

### Create Matrix of Relu Values

In [None]:
# Fix NumPy's global seed so the starting weights below are reproducible.
np.random.seed(42)
# Two rows of weights (one per linear unit), drawn uniformly from [-0.5, 0.5).
parameter_matrix = np.random.rand(2, input_df.shape[1]) - 0.5
# Targets reshaped to a single column so they broadcast against a two-column matrix.
known_survival_matrix = training_dataframe["Survived"].to_numpy().reshape(-1,1)
inputs = input_df.to_numpy()
inputs

### Relu Gradient Descent (non-linear)

In [None]:
# Gradient descent
for current_epoch in range(1000):
    # Predicted values
    predicted_value_matrix = np.dot(inputs, parameter_matrix.T)
    relu_value_matrix = np.maximum(predicted_value_matrix, 0)
    
    # Calculate error
    errors = relu_value_matrix - known_survival_matrix
    summed_errors = np.sum(errors, axis=1)
    if current_epoch % 100 == 0: #Print every 100th value
        print(summed_errors.mean())
    
    # Calculate gradient
    gradient = np.dot(inputs.T, summed_errors) * 2 / len(training_dataframe["Survived"].to_numpy())
    
    # Update parameters
    parameter_matrix -= 0.01 * gradient
    nn_params = parameter_matrix.sum(axis=0)

# Final parameters
print(f"Optimized weights: {nn_params}")

## Create Titanic survival predictions

Now we'll use the parameters we've calculated to try and make predictions about the survivors in our validation set.

In [None]:
# NOTE(review): `serving_df` is not assigned in any cell shown above — confirm
# it is loaded before this cell runs.
serving_df

In [None]:
def estimate_missing_ages(old_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``old_df`` whose missing ``Age`` values are set to the mean age.

    Args:
        old_df: Frame containing an ``Age`` column; it is not modified.

    Returns:
        A new frame with every missing ``Age`` replaced by the mean of the
        non-missing ages in ``old_df``.
    """
    new_df = old_df.copy()
    mean_age = old_df["Age"].mean()
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # selection ``new_df["Age"]`` acts on an intermediate object and, under
    # pandas Copy-on-Write, never reaches ``new_df``.
    new_df["Age"] = new_df["Age"].fillna(mean_age)
    return new_df

def estimate_missing_fares(old_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``old_df`` whose missing ``Fare`` values are set to 0.

    Args:
        old_df: Frame containing a ``Fare`` column; it is not modified.

    Returns:
        A new frame with every missing ``Fare`` replaced by 0.
    """
    new_df = old_df.copy()
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # selection ``new_df["Fare"]`` acts on an intermediate object and, under
    # pandas Copy-on-Write, never reaches ``new_df``.
    new_df["Fare"] = new_df["Fare"].fillna(0)
    return new_df
    
def prepare_data(old_df: pd.DataFrame) -> pd.DataFrame:
    """Run the feature-preparation pipeline on a passenger frame and return the result.

    The input is copied first, then passed through each helper below in order;
    a per-column NA report is printed after the Age/Fare fill steps and a
    ``Constant`` column of ones is added last.

    NOTE(review): ``remove_irrelevant_data``,
    ``convert_ticket_class_to_binary_values``,
    ``convert_embarkation_port_to_binary_values`` and
    ``convert_sex_to_binary_value`` are not defined in any cell visible here —
    confirm they exist before running.
    """
    new_df = old_df.copy()
    new_df = remove_irrelevant_data(new_df)
    new_df = estimate_missing_ages(new_df)
    new_df = estimate_missing_fares(new_df)
    print("Searching for NA values:")
    print(new_df.isna().any())
    new_df = convert_ticket_class_to_binary_values(new_df)
    new_df = convert_embarkation_port_to_binary_values(new_df)
    new_df = convert_sex_to_binary_value(new_df)
    # NOTE(review): both scaling helpers divide by the maximum of the frame being
    # prepared, so the scale can differ from the one used on the training data —
    # confirm this is acceptable.
    new_df = convert_numeric_column_to_decimal(new_df, "Age")
    new_df = convert_numeric_column_to_decimal_with_logarithm(new_df, "Fare")
    new_df["Constant"] = 1
    return new_df
    
serving_df = prepare_data(serving_df)
# Compare as lists: `Index == Index` raises ValueError ("Lengths must match to
# compare") when the column counts differ, which hides the real mismatch.
assert list(input_df.columns) == list(serving_df.columns), (
    f"Feature columns differ: {list(input_df.columns)} vs {list(serving_df.columns)}"
)
serving_df

In [None]:
def create_predictions(validation_df: pd.DataFrame, optimized_weights: np.array) -> np.array:
    """Score every row of ``validation_df`` against the weight vector.

    Args:
        validation_df: Feature frame, one row per sample.
        optimized_weights: One weight per column of ``validation_df``.

    Returns:
        The dot product of the frame's values with ``optimized_weights``.
    """
    feature_matrix = validation_df.to_numpy()
    raw_scores = np.dot(feature_matrix, optimized_weights)
    return raw_scores

# NOTE: this adds a column to `serving_df`, so re-running the cell passes one
# more column than there are weights into create_predictions.
serving_df["Survival Prediction"] = create_predictions(serving_df, nn_params)
serving_df

## Prepare Submission CSV

In [None]:
# Read test.csv to obtain the PassengerId column for the submission file.
original_validation_df = pd.read_csv(data_path + "test.csv")
submission_df = pd.DataFrame()
submission_df["PassengerId"] = original_validation_df["PassengerId"]
# Threshold the raw score: values below 0.5 become 0, everything else becomes 1.
submission_df["Survived"] = serving_df["Survival Prediction"].apply(lambda x: 0 if x < 0.5 else 1)
submission_df.to_csv("submission.csv", index=False)
submission_df