In [1]:
import pandas
import numpy
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dfTrain = pandas.read_csv("train.csv", index_col=0)
dfTrain.drop(columns={"Name", "Ticket", "Cabin"}, inplace=True)
dfTrain.dropna(inplace = True)
dfTrain

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,3,female,39.0,0,5,29.1250,Q
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
890,1,1,male,26.0,0,0,30.0000,C


In [3]:
dfRefinedData = dfTrain[["SibSp", "Parch"]].copy()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0
...,...,...
886,0,5
887,0,0
888,0,0
890,0,0


In [4]:
# Normalize the Age column
dfRefinedData.loc[:, "Norm_Age"] = dfTrain.loc[:, "Age"]/dfTrain.loc[:, "Age"].max()
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,0.2750
2,1,0,0.4750
3,0,0,0.3250
4,1,0,0.4375
5,0,0,0.4375
...,...,...,...
886,0,5,0.4875
887,0,0,0.3375
888,0,0,0.2375
890,0,0,0.3250


In [5]:
# Due to unbalanced and large distribution of "Fare" column, take the log base 10 instead of normalizing it
dfRefinedData["Log_Fare"] = numpy.log10(dfTrain["Fare"] + 1)
dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0.2750,0.916454
2,1,0,0.4750,1.859038
3,0,0,0.3250,0.950608
4,1,0,0.4375,1.733197
5,0,0,0.4375,0.956649
...,...,...,...,...
886,0,5,0.4875,1.478927
887,0,0,0.3375,1.146128
888,0,0,0.2375,1.491362
890,0,0,0.3250,1.491362


In [6]:
# Add 2 binary collumns to represent trinary "Pclass" column
dfRefinedData.loc[:, "Pclass_1"] = 0
dfRefinedData.loc[dfTrain['Pclass'] == 1, "Pclass_1"] = 1

dfRefinedData.loc[:, "Pclass_2"] = 0
dfRefinedData.loc[dfTrain['Pclass'] == 2, "Pclass_2"] = 1

# Add 2 binary columns to represent trinary "Embarked" column
dfRefinedData.loc[:, "Embark_S"] = 0
dfRefinedData.loc[dfTrain['Embarked'] == 'S', "Embark_S"] = 1

dfRefinedData.loc[:, "Embark_C"] = 0
dfRefinedData.loc[dfTrain['Embarked'] == 'C', "Embark_C"] = 1

dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Pclass_1,Pclass_2,Embark_S,Embark_C
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0.2750,0.916454,0,0,1,0
2,1,0,0.4750,1.859038,1,0,0,1
3,0,0,0.3250,0.950608,0,0,1,0
4,1,0,0.4375,1.733197,1,0,1,0
5,0,0,0.4375,0.956649,0,0,1,0
...,...,...,...,...,...,...,...,...
886,0,5,0.4875,1.478927,0,0,0,0
887,0,0,0.3375,1.146128,0,1,1,0
888,0,0,0.2375,1.491362,1,0,1,0
890,0,0,0.3250,1.491362,1,0,0,1


In [7]:
# Transform string filled "Sex" column into a boolean column
dfRefinedData.loc[:, "Male"] = 0
dfRefinedData.loc[dfTrain['Sex'] == 'male', "Male"] = 1

dfRefinedData

Unnamed: 0_level_0,SibSp,Parch,Norm_Age,Log_Fare,Pclass_1,Pclass_2,Embark_S,Embark_C,Male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0,0.2750,0.916454,0,0,1,0,1
2,1,0,0.4750,1.859038,1,0,0,1,0
3,0,0,0.3250,0.950608,0,0,1,0,0
4,1,0,0.4375,1.733197,1,0,1,0,0
5,0,0,0.4375,0.956649,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
886,0,5,0.4875,1.478927,0,0,0,0,0
887,0,0,0.3375,1.146128,0,1,1,0,1
888,0,0,0.2375,1.491362,1,0,1,0,0
890,0,0,0.3250,1.491362,1,0,0,1,1


In [8]:
# Add ones collumn for constants
dfRefinedData.loc[:, 'Ones'] = 1

In [9]:
actualResults = dfTrain.loc[:, 'Survived']
actualResults

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
886    0
887    0
888    1
890    1
891    0
Name: Survived, Length: 712, dtype: int64

In [10]:
random.seed(a = 12071941)


# TODO get multiple lines of params
dfParams = pandas.Series(index = dfRefinedData.columns, dtype = 'float64')
for column in dfRefinedData.columns:
    dfParams.at[column] = random.random() - 0.5

dfParams

SibSp       0.138894
Parch      -0.402977
Norm_Age    0.259165
Log_Fare    0.343216
Pclass_1    0.429688
Pclass_2    0.415268
Embark_S    0.200195
Embark_C   -0.152522
Male        0.193573
Ones       -0.360328
dtype: float64

In [11]:
#TODO find the linear sum, then relu, then add together for prediction
#     need to change to welcome multiple linear params
def predict(params):
    test = dfRefinedData * params
    relu = test <= 0
    test.mask(relu, 0, inplace = True)
    return test.sum(axis = 1)

predict(dfParams)

PassengerId
1      0.918474
2      1.329737
3      0.610687
4      1.477023
5      0.835489
         ...   
886    0.633934
887    1.289873
888    1.203294
890    1.219349
891    0.620551
Length: 712, dtype: float64

In [12]:
((predict(dfParams) - actualResults) ** 2).mean()

def loss(prediction):
    return ((prediction - actualResults) ** 2).mean()

NameError: name 'mse' is not defined