In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

is_kaggle = "KAGGLE_WORKING_DIR" in os.environ or "/kaggle" in os.getcwd()
print("Running on Kaggle:", is_kaggle)

Running on Kaggle: False


This is my attempt to reproduce the basic gradient descent used in fast.ai's ML for coder's course in Python. It's originally done in an excel spreadsheet. I don't have an excel license and the "Solver" functionality in excel is not available in numbers so I'm going to try and attempt it in Python with minimal use of typical ML frameworks

## Load Data set

In [7]:
if is_kaggle:
    data_path = "/kaggle/input/titanic/"
else:
    data_path = os.getcwd() + "/"
    
training_dataframe = pd.read_csv(data_path + "train.csv")
serving_df = pd.read_csv(data_path + "test.csv")

training_dataframe.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Prepare Data set
### Data Removal
First we'll remove the columns that won't be useful

In [8]:
def remove_irrelevant_data(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
    return new_df.drop(columns=columns_to_drop)
    
training_dataframe = remove_irrelevant_data(training_dataframe)
training_dataframe.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


There are also some rows with empty values we should remove

In [9]:
def remove_na_values(old_df: pd.DataFrame) -> pd.DataFrame:
    cleaned_df = old_df.copy()
    cleaned_df = cleaned_df.dropna()
    removed_row_count = old_df.shape[0] - cleaned_df.shape[0]
    print(f"{removed_row_count} entries were removed, {cleaned_df.shape[0]} entries remain")
    return cleaned_df

training_dataframe = remove_na_values(training_dataframe)

179 entries were removed, 712 entries remain


### Converting Category Data to Binary Categorical Values
Sex, the Passenger class and Embarking city are not measurable attributes so we should convert them to Boolean numbers that can be used as co-efficients

In [10]:
def convert_ticket_class_to_binary_values(original_df: pd.DataFrame) -> pd.DataFrame:
    new_df = original_df.copy()
    new_df["FirstClass"] = new_df["Pclass"].apply(lambda x: binary_equal_to_value(x,1))
    new_df["SecondClass"] = new_df["Pclass"].apply(lambda x: binary_equal_to_value(x,2))
    new_df.drop("Pclass", axis=1, inplace=True)
    return new_df
    

def binary_equal_to_value(number, compare_value):
    if (number == compare_value):
        return 1
    return 0

training_dataframe = convert_ticket_class_to_binary_values(training_dataframe)
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked,FirstClass,SecondClass
0,0,male,22.0,1,0,7.25,S,0,0
1,1,female,38.0,1,0,71.2833,C,1,0
2,1,female,26.0,0,0,7.925,S,0,0
3,1,female,35.0,1,0,53.1,S,1,0
4,0,male,35.0,0,0,8.05,S,0,0
6,0,male,54.0,0,0,51.8625,S,1,0
7,0,male,2.0,3,1,21.075,S,0,0
8,1,female,27.0,0,2,11.1333,S,0,0
9,1,female,14.0,1,0,30.0708,C,0,1
10,1,female,4.0,1,1,16.7,S,0,0


In [11]:
def convert_embarkation_port_to_binary_values(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    new_df["Cherbourg_Departure"] = old_df["Embarked"].apply(lambda x: binary_equal_to_value(x, 'C'))
    new_df["Queenstown_Departure"] = old_df["Embarked"].apply(lambda x: binary_equal_to_value(x, 'Q'))
    new_df.drop("Embarked", axis=1, inplace=True)
    return new_df
    

training_dataframe = convert_embarkation_port_to_binary_values(training_dataframe)
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure
0,0,male,22.0,1,0,7.25,0,0,0,0
1,1,female,38.0,1,0,71.2833,1,0,1,0
2,1,female,26.0,0,0,7.925,0,0,0,0
3,1,female,35.0,1,0,53.1,1,0,0,0
4,0,male,35.0,0,0,8.05,0,0,0,0
6,0,male,54.0,0,0,51.8625,1,0,0,0
7,0,male,2.0,3,1,21.075,0,0,0,0
8,1,female,27.0,0,2,11.1333,0,0,0,0
9,1,female,14.0,1,0,30.0708,0,1,1,0
10,1,female,4.0,1,1,16.7,0,0,0,0


In [12]:
def convert_sex_to_binary_value(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    new_df["Sex"] = old_df["Sex"].apply(lambda x: binary_equal_to_value(x, "male"))
    return new_df

training_dataframe = convert_sex_to_binary_value(training_dataframe)
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure
0,0,1,22.0,1,0,7.25,0,0,0,0
1,1,0,38.0,1,0,71.2833,1,0,1,0
2,1,0,26.0,0,0,7.925,0,0,0,0
3,1,0,35.0,1,0,53.1,1,0,0,0
4,0,1,35.0,0,0,8.05,0,0,0,0
6,0,1,54.0,0,0,51.8625,1,0,0,0
7,0,1,2.0,3,1,21.075,0,0,0,0
8,1,0,27.0,0,2,11.1333,0,0,0,0
9,1,0,14.0,1,0,30.0708,0,1,1,0
10,1,0,4.0,1,1,16.7,0,0,0,0


### Converting numbers to fractional values
#### Age
Larger numbers would have too great an impact on our calculations so we can normalize them by dividing them by their max value them so they're between 0 and 1

In [13]:
def convert_numeric_column_to_decimal(old_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    new_df = old_df.copy()
    max_numeric_value = old_df[column_name].max()
    new_df[column_name] = old_df[column_name].apply(lambda x: x/max_numeric_value)
    return new_df
    
training_dataframe = convert_numeric_column_to_decimal(training_dataframe, "Age")
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure
0,0,1,0.275,1,0,7.25,0,0,0,0
1,1,0,0.475,1,0,71.2833,1,0,1,0
2,1,0,0.325,0,0,7.925,0,0,0,0
3,1,0,0.4375,1,0,53.1,1,0,0,0
4,0,1,0.4375,0,0,8.05,0,0,0,0
6,0,1,0.675,0,0,51.8625,1,0,0,0
7,0,1,0.025,3,1,21.075,0,0,0,0
8,1,0,0.3375,0,2,11.1333,0,0,0,0
9,1,0,0.175,1,0,30.0708,0,1,1,0
10,1,0,0.05,1,1,16.7,0,0,0,0


#### Fare
The `Fare` column has lots of small values with the occasional very large value. Uniform normalization using the max value isn't ideal when we're dealing with lots of small values with occasional very large values as the variation between the lower numbers will be lost. To normalize the values we can use a log function (log10 here) to bring the numbers down to reasonable ranges. We must use `log10(x+1)` to avoid 0 values as `log10(0)` would give us infinity.

In [14]:
import math
def convert_numeric_column_to_decimal_with_logarithm(old_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    new_df = old_df.copy()
    new_df[column_name] = new_df[column_name].apply(lambda x: math.log10(x+1) if x > 0 else 0)
    new_df = convert_numeric_column_to_decimal(new_df, column_name)
    return new_df

training_dataframe = convert_numeric_column_to_decimal_with_logarithm(training_dataframe, "Fare")
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure
0,0,1,0.275,1,0,0.338125,0,0,0,0
1,1,0,0.475,1,0,0.685892,1,0,1,0
2,1,0,0.325,0,0,0.350727,0,0,0,0
3,1,0,0.4375,1,0,0.639463,1,0,0,0
4,0,1,0.4375,0,0,0.352955,0,0,0,0
6,0,1,0.675,0,0,0.635755,1,0,0,0
7,0,1,0.025,3,1,0.495832,0,0,0,0
8,1,0,0.3375,0,2,0.399934,0,0,0,0
9,1,0,0.175,1,0,0.550603,0,1,1,0
10,1,0,0.05,1,1,0.460439,0,0,0,0


## Linear Regression
### Add a constant value
A linear function needs a constant, this will be needed for the maths so we should add a column full of ones

In [15]:
training_dataframe["Constant"] = 1
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure,Constant
0,0,1,0.275,1,0,0.338125,0,0,0,0,1
1,1,0,0.475,1,0,0.685892,1,0,1,0,1
2,1,0,0.325,0,0,0.350727,0,0,0,0,1
3,1,0,0.4375,1,0,0.639463,1,0,0,0,1
4,0,1,0.4375,0,0,0.352955,0,0,0,0,1
6,0,1,0.675,0,0,0.635755,1,0,0,0,1
7,0,1,0.025,3,1,0.495832,0,0,0,0,1
8,1,0,0.3375,0,2,0.399934,0,0,0,0,1
9,1,0,0.175,1,0,0.550603,0,1,1,0,1
10,1,0,0.05,1,1,0.460439,0,0,0,0,1


### Prepare initial linear co-efficient values
We want to set each of our parameter values to a random number close to 1. The survived column is not a parameter but our desired result/output so we don't include this.

In [16]:
input_df = training_dataframe.drop("Survived", axis=1)
linear_parameters = np.random.rand(input_df.shape[1]).tolist()
linear_parameters

[0.3677861269962356,
 0.735390192923924,
 0.2011318137597773,
 0.3264526875060695,
 0.35235248410424114,
 0.5594845527563047,
 0.18148231219015898,
 0.7524423145267378,
 0.25434680048629243,
 0.9618360924075596]

### Calculate the linear function of our parameters multiplied by our random Coefficients

In [18]:
def calculate_linear_result() -> np.array:
    return input_df.apply(lambda row: row.dot(linear_parameters), axis=1).to_numpy()

training_dataframe["Initial Linear Result"] = input_df.apply(lambda row: row.dot(linear_parameters), axis=1)
training_dataframe.head(10)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure,Constant,Initial Linear Result
0,0,1,0.275,1,0,0.338125,0,0,0,0,1,1.852126
1,1,0,0.475,1,0,0.685892,1,0,1,0,1,3.065881
2,1,0,0.325,0,0,0.350727,0,0,0,0,1,1.324417
3,1,0,0.4375,1,0,0.639463,1,0,0,0,1,2.269502
4,0,1,0.4375,0,0,0.352955,0,0,0,0,1,1.77572
6,0,1,0.675,0,0,0.635755,1,0,0,0,1,2.609505
7,0,1,0.025,3,1,0.495832,0,0,0,0,1,2.452563
8,1,0,0.3375,0,2,0.399934,0,0,0,0,1,2.003853
9,1,0,0.175,1,0,0.550603,0,1,1,0,1,2.419592
10,1,0,0.05,1,1,0.460439,0,0,0,0,1,1.688427


### Gradient Descent

In [19]:
def optimize_weights(inputs: [pd.DataFrame], target_variables: np.array, parameters: [float], learning_rate: float=0.01, epochs: int=1000) -> [float]:
    for current_epoch in range(epochs):
        # Predicted values
        predicted_values = inputs.apply(lambda row: row.dot(parameters), axis=1).to_numpy()
    
        # Calculate error
        errors = predicted_values - target_variables
        mean_square_error = (errors ** 2).mean()
    
        if current_epoch % 100 == 0: #Print every 100th value
            print(mean_square_error)
    
        # Calculate gradient
        gradient = np.dot(inputs.to_numpy().T, errors) * 2 / len(target_variables)
    
        # Update parameters
        parameters -= learning_rate * gradient
    # Final parameters
    print(f"Optimized weights: {parameters}")
    print(f"Final error: {mean_square_error}")
    return parameters
    
linear_parameters = optimize_weights(inputs=input_df, target_variables=training_dataframe["Survived"].to_numpy(), parameters=linear_parameters)

3.8236063642479308
0.20650975214902054
0.18443333611239954
0.17542655914172445
0.1699023162785615
0.1660849899502831
0.1632379850163706
0.1609953497559946
0.15915781510294305
0.15760873961243244
Optimized weights: [-0.46225838  0.12778404 -0.01639597  0.00664009  0.08907964  0.25810681
  0.18612195  0.15692407  0.10872972  0.46137245]
Final error: 0.15628806283912391


## Neural Nets
The calculation above was a linear regression as we only use one set of parameters.
Here we'll use two sets of parameters, apply a RELU (Rectified Linear Unit) function and add them together to give us a loss. A RELU function is non-linear and simply replaces every negative number with a 0.

The RELU is needed as adding together two linear functions just gives us another linear function which doesn't give us any more resolution for our calculation. Combinging each linear layer with a non-linear RELU allows us to keep each linear functions utility increasing our algortihms accuracy.

### Create Matrix of Relu Values

In [21]:
np.random.seed(42)
parameter_matrix = np.random.rand(2, input_df.shape[1]) - 0.5
known_survival_matrix = training_dataframe["Survived"].to_numpy().reshape(-1,1)
inputs = input_df.to_numpy()
inputs

array([[1.    , 0.275 , 1.    , ..., 0.    , 0.    , 1.    ],
       [0.    , 0.475 , 1.    , ..., 1.    , 0.    , 1.    ],
       [0.    , 0.325 , 0.    , ..., 0.    , 0.    , 1.    ],
       ...,
       [0.    , 0.2375, 0.    , ..., 0.    , 0.    , 1.    ],
       [1.    , 0.325 , 0.    , ..., 1.    , 0.    , 1.    ],
       [1.    , 0.4   , 0.    , ..., 0.    , 1.    , 1.    ]])

### Relu Gradient Descent (non-linear)

In [22]:
# Gradient descent
for current_epoch in range(1000):
    # Predicted values
    predicted_value_matrix = np.dot(inputs, parameter_matrix.T)
    relu_value_matrix = np.maximum(predicted_value_matrix, 0)
    
    # Calculate error
    errors = relu_value_matrix - known_survival_matrix
    summed_errors = np.sum(errors, axis=1)
    if current_epoch % 100 == 0: #Print every 100th value
        print(summed_errors.mean())
    
    # Calculate gradient
    gradient = np.dot(inputs.T, summed_errors) * 2 / len(training_dataframe["Survived"].to_numpy())
    
    # Update parameters
    parameter_matrix -= 0.01 * gradient
    nn_params = parameter_matrix.sum(axis=0)

# Final parameters
print(f"Optimized weights: {nn_params}")

-0.5706512250660585
-0.04068048738753702
-0.01458382897724815
-0.00887203728882521
-0.006791860009895552
-0.005876224849483412
-0.005511044299689621
-0.005375633684792457
-0.005345585491604246
-0.005332003528486177
Optimized weights: [-1.3905638   0.09087355 -0.02496336 -0.17394496  0.23524152  0.75483801
  0.58052732  0.17011297 -0.14086271  0.99664091]


## Create Titanic survial predictions

Now we'll use the parameters we've calculated to try and make predictions about the survivors in our validation set.

In [23]:
serving_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [24]:
def estimate_missing_ages(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    mean_age = old_df["Age"].mean()
    new_df["Age"].fillna(value=mean_age, inplace=True)
    return new_df

def estimate_missing_fares(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    new_df["Fare"].fillna(value=0, inplace=True)
    return new_df
    
def prepare_data(old_df: pd.DataFrame) -> pd.DataFrame:
    new_df = old_df.copy()
    new_df = remove_irrelevant_data(new_df)
    new_df = estimate_missing_ages(new_df)
    new_df = estimate_missing_fares(new_df)
    print("Searching for NA values:")
    print(new_df.isna().any())
    new_df = convert_ticket_class_to_binary_values(new_df)
    new_df = convert_embarkation_port_to_binary_values(new_df)
    new_df = convert_sex_to_binary_value(new_df)
    new_df = convert_numeric_column_to_decimal(new_df, "Age")
    new_df = convert_numeric_column_to_decimal_with_logarithm(new_df, "Fare")
    new_df["Constant"] = 1
    return new_df
    
serving_df = prepare_data(serving_df)
assert (input_df.columns == serving_df.columns).all()
serving_df

Searching for NA values:
Pclass      False
Sex         False
Age         False
SibSp       False
Parch       False
Fare        False
Embarked    False
dtype: bool


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure,Constant
0,1,0.453947,0,0,0.348997,0,0,0,1,1
1,0,0.618421,1,0,0.333195,0,0,0,0,1
2,1,0.815789,0,0,0.379604,0,1,0,1,1
3,1,0.355263,0,0,0.363449,0,0,0,0,1
4,0,0.289474,1,1,0.414494,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,1,0.398324,0,0,0.352955,0,0,0,0,1
414,0,0.513158,0,0,0.753026,1,0,1,0,1
415,1,0.506579,0,0,0.338125,0,0,0,0,1
416,1,0.398324,0,0,0.352955,0,0,0,0,1


In [25]:
def create_predictions(validation_df: pd.DataFrame, optimized_weights: np.array) -> np.array:
    return np.dot(validation_df.to_numpy(), optimized_weights)

serving_df["Survival Prediction"] = create_predictions(serving_df, nn_params)
serving_df

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,FirstClass,SecondClass,Cherbourg_Departure,Queenstown_Departure,Constant,Survival Prediction
0,1,0.453947,0,0,0.348997,0,0,0,1,1,-0.411435
1,0,0.618421,1,0,0.333195,0,0,0,0,1,1.106257
2,1,0.815789,0,0,0.379604,0,1,0,1,1,0.209174
3,1,0.355263,0,0,0.363449,0,0,0,0,1,-0.276141
4,0,0.289474,1,1,0.414494,0,0,0,0,1,0.921544
...,...,...,...,...,...,...,...,...,...,...,...
413,1,0.398324,0,0,0.352955,0,0,0,0,1,-0.274696
414,0,0.513158,0,0,0.753026,1,0,1,0,1,2.145367
415,1,0.506579,0,0,0.338125,0,0,0,0,1,-0.268347
416,1,0.398324,0,0,0.352955,0,0,0,0,1,-0.274696


## Prepare Submission CSV

In [26]:
original_validation_df = pd.read_csv(data_path + "test.csv")
submission_df = pd.DataFrame()
submission_df["PassengerId"] = original_validation_df["PassengerId"]
submission_df["Survived"] = serving_df["Survival Prediction"].apply(lambda x: 0 if x < 0.5 else 1)
submission_df.to_csv("submission.csv", index=False)
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
