In [325]:
import pandas as pd
import numpy as np

In [326]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [327]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [328]:
train_data[train_data.Age.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [407]:
def initial_data(full_data, test_data=False, train_rate=0.8, answer_mode=False):
    full_data = full_data.loc[:, ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]
    if not answer_mode:
        full_data = full_data[~full_data.Age.isna()]
    full_data.Sex = (full_data.Sex == 'male').astype('int')
    if not answer_mode:
        full_data = full_data.reindex(np.random.permutation(full_data.index))
    print(full_data)
    
    X = full_data.iloc[:, 1:].values
    mean_X = X.mean(axis=0)
    std_X = X.std(axis=0)
    X = np.hstack(((X - mean_X) / std_X, np.ones((X.shape[0], 1))))
    Y = full_data.iloc[:, 0].values
    Y = Y.reshape(Y.shape[0], 1)
    
    if not test_data:
        return X, Y
    
    train_X = X[:int(X.shape[0] * rate)]
    test_X = X[int(X.shape[0] * rate):]
    train_Y = Y[:int(Y.shape[0] * rate)]
    test_Y = Y[int(Y.shape[0] * rate):]
    return train_X, train_Y, test_X, test_Y

In [392]:
def sigmoid(X, theta):
    return 1 / ( 1 + np.exp(-np.dot(X, theta)))

In [368]:
def forward_prop(X, Y, theta):
    Y_hat = sigmoid(X, theta)
    m = X.shape[0]
    J = -1/m * np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat))
    return Y_hat, J

In [369]:
def backward_prop(X, Y, theta, Y_hat, learning_rate):
    m = X.shape[0]
    gradient = 1/m * np.dot(X.T, Y_hat - Y)
    return theta - learning_rate * gradient

In [370]:
def logistic_regression(X, Y, num_iteration=2000, learning_rate=0.1):
    # initialize
    theta = np.zeros((X.shape[1], 1))
    
    for num_iter in range(1, num_iteration + 1):
        Y_hat, J = forward_prop(X, Y, theta)
        theta = backward_prop(X, Y, theta, Y_hat, learning_rate)
        if num_iter % 100 == 0:
            print(num_iter, J)
    
    return theta

In [374]:
def accuracy(X, Y, theta):
    return (sigmoid(X, theta).round().astype('int') == Y).astype('int').mean()

# Initialize Input

In [410]:
train_X, train_Y = initial_data(train_data)

     Survived  Pclass  Sex   Age  SibSp  Parch
195         1       1    0  58.0      0      0
217         0       2    1  42.0      1      0
84          1       2    0  17.0      0      0
889         1       1    1  26.0      0      0
618         1       2    0   4.0      2      1
..        ...     ...  ...   ...    ...    ...
523         1       1    0  44.0      0      1
541         0       3    0   9.0      4      2
571         1       1    0  53.0      2      0
329         1       1    0  16.0      0      1
380         1       1    0  42.0      0      0

[714 rows x 6 columns]
[[-1.47636364 -1.31743394  1.94959054 -0.55170307 -0.50589515  1.        ]
 [-0.28256564  0.75905134  0.8473829   0.52457013 -0.50589515  1.        ]
 [-0.28256564 -1.31743394 -0.87481653 -0.55170307 -0.50589515  1.        ]
 ...
 [-1.47636364 -1.31743394  1.60515065  1.60084334 -0.50589515  1.        ]
 [-1.47636364 -1.31743394 -0.9437045  -0.55170307  0.66686178  1.        ]
 [-1.47636364 -1.31743394  0.847

# Train the model

In [411]:
theta = logistic_regression(train_X, train_Y)

100 0.4611195640312356
200 0.4489401396332154
300 0.44662916567850425
400 0.4460452094576543
500 0.445882031664768
600 0.4458343546893532
700 0.4458201129375755
800 0.4458158092494033
900 0.44581450064257416
1000 0.44581410140024125
1100 0.44581397937211237
1200 0.44581394203686087
1300 0.4458139306076224
1400 0.44581392710779305
1500 0.4458139260359067
1600 0.44581392570759204
1700 0.4458139256070254
1800 0.44581392557621974
1900 0.4458139255667833
2000 0.44581392556389265


# Predict Test Data

In [412]:
test_X, _ = initial_data(test_data, answer_mode=True)
print(test_X)
print(theta)
sigmoid(test_X, theta)

     Survived  Pclass  Sex   Age  SibSp  Parch
0         NaN       3    1  34.5      0      0
1         NaN       3    0  47.0      1      0
2         NaN       2    1  62.0      0      0
3         NaN       3    1  27.0      0      0
4         NaN       3    0  22.0      1      1
..        ...     ...  ...   ...    ...    ...
413       NaN       3    1   NaN      0      0
414       NaN       1    0  39.0      0      0
415       NaN       3    1  38.5      0      0
416       NaN       3    1   NaN      0      0
417       NaN       3    1   NaN      1      1

[418 rows x 6 columns]
[[ 0.87348191  0.75592895         nan -0.49947002 -0.4002477   1.        ]
 [ 0.87348191 -1.32287566         nan  0.61699237 -0.4002477   1.        ]
 [-0.31581919  0.75592895         nan -0.49947002 -0.4002477   1.        ]
 ...
 [ 0.87348191  0.75592895         nan -0.49947002 -0.4002477   1.        ]
 [ 0.87348191  0.75592895         nan -0.49947002 -0.4002477   1.        ]
 [ 0.87348191  0.75592895       

array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
      