In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [47]:
current_dir = %pwd
train_dataset = pd.read_csv('../input/titanic/train.csv')[['PassengerId','Sex' ,'Pclass','Age','SibSp','Fare','Embarked','Parch', 'Survived']]
test_dataset = pd.read_csv('../input/titanic/test.csv')[['PassengerId','Sex' ,'Pclass','Age','SibSp','Fare','Embarked','Parch']]
train_dataset.head()

Unnamed: 0,PassengerId,Sex,Pclass,Age,SibSp,Fare,Embarked,Parch,Survived
0,1,male,3,22.0,1,7.25,S,0,0
1,2,female,1,38.0,1,71.2833,C,0,1
2,3,female,3,26.0,0,7.925,S,0,1
3,4,female,1,35.0,1,53.1,S,0,1
4,5,male,3,35.0,0,8.05,S,0,0


# Preprocessing

In [48]:
# Encode Sex numerically: male -> 1, female -> 0.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on train_dataset['Sex']. That form mutates an
# intermediate Series: pandas warns about it, and under copy-on-write it
# leaves the DataFrame untouched.
#
# astype(int) guarantees an integer column and fails loudly if a value other
# than 'male'/'female' (or a missing value) is present. Re-running the cell is
# harmless because already-encoded 0/1 values match neither key.
sex_codes = {'male': 1, 'female': 0}
train_dataset['Sex'] = train_dataset['Sex'].replace(sex_codes).astype(int)
test_dataset['Sex'] = test_dataset['Sex'].replace(sex_codes).astype(int)

In [49]:
# Fill missing embarkation ports with 'S'.
# NOTE(review): 'S' is presumably the most frequent port in the training data — confirm.
#
# fillna() returns a new Series that is assigned back to the frame. The
# previous replace(np.nan, 'S', inplace=True) on a selected column mutates an
# intermediate object and does not update the frame under copy-on-write.
train_dataset['Embarked'] = train_dataset['Embarked'].fillna('S')
test_dataset['Embarked'] = test_dataset['Embarked'].fillna('S')

In [50]:
def change_to_number(x):
    """Translate an embarkation port letter into its integer code.

    'C' -> 0, 'Q' -> 1, 'S' -> 2. Any other value yields None.
    """
    port_codes = {'C': 0, 'Q': 1, 'S': 2}
    return port_codes.get(x)

In [51]:
# Replace the port letters with their integer codes.
#
# Plain column assignment swaps in the new Series together with its numeric
# dtype. Writing through .loc[:, 'Embarked'] sets the values into the existing
# object-dtype column on recent pandas, so the column (and every array built
# from it) would stay dtype=object.
train_dataset['Embarked'] = train_dataset['Embarked'].apply(change_to_number)
test_dataset['Embarked'] = test_dataset['Embarked'].apply(change_to_number)

In [52]:
# Impute missing numeric values column by column.
#
# The previous frame-wide fillna(<mean age>) wrote the mean age into every
# column that had a NaN, so a missing Fare would silently become ~30.
# Each column now gets its own statistic. Both statistics are computed on the
# training data only and reused for the test data, so the two sets are
# preprocessed identically and nothing is derived from the test set.
age_fill = train_dataset['Age'].mean()
fare_fill = train_dataset['Fare'].median()
for dataset in (train_dataset, test_dataset):
    dataset['Age'] = dataset['Age'].fillna(age_fill)
    dataset['Fare'] = dataset['Fare'].fillna(fare_fill)

In [53]:
# Build the design matrices and the label vector.
#
# The feature list is written once so train and test cannot drift apart.
# dtype=float forces a numeric matrix: if any column is still object-dtype,
# np.array(...) would return an object array, which np.exp in the
# hand-written model below cannot process.
model_features = ['Sex', 'Pclass', 'Age', 'SibSp', 'Fare', 'Embarked', 'Parch']
x_train = train_dataset[model_features].to_numpy(dtype=float)
y_train = train_dataset['Survived'].to_numpy()
x_test = test_dataset[model_features].to_numpy(dtype=float)

# Sklearn

In [54]:
# Baseline classifier from scikit-learn, used as a reference for the
# hand-written implementation further down.
# max_iter=500 sets the solver's iteration cap.
# NOTE(review): presumably chosen to avoid a convergence warning on the unscaled features — confirm.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter = 500)

In [55]:
# Fit the baseline model on the training features and labels.
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
# Predict labels for the training rows themselves.
# y_pred is not referenced again in the cells shown here.
y_pred = model.predict(x_train)

In [57]:
# Score the model on the same data it was fitted on: this is a training-set
# score, not a held-out estimate.
accuracy = model.score(x_train, y_train)

In [58]:
# Display the training-set score of the baseline model.
accuracy

0.8002244668911336

In [59]:
# Predict labels for the test features with the fitted baseline model.
test_predicted = model.predict(x_test)

In [60]:
# Pair every test passenger id with the baseline model's predicted label.
predicted_data = pd.Series(test_predicted)
df = pd.DataFrame(
    {
        'PassengerId': test_dataset['PassengerId'],
        'Survived': predicted_data,
    }
)
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [61]:
# Use PassengerId as the index so that it becomes the first column of the CSV.
df = df.set_index('PassengerId')
df.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [62]:
# Write the baseline predictions to out.csv in the working directory.
output_path = current_dir + r'/out.csv'
df.to_csv(output_path)

# Self-implemented logistic regression

In [None]:
# Prepend a column of ones so that thetas[0] acts as the intercept term.
# The row count is taken from x_train itself instead of the hard-coded 891,
# so the cell keeps working if the training data changes size.
# NOTE: running this cell twice adds a second bias column; rebuild x_train first.
bias = np.ones((x_train.shape[0], 1), dtype=int)
x_train = np.concatenate((bias, x_train), axis=1)

In [None]:
# Turn the label vector into a column so it lines up with h(x_train, thetas).
# -1 lets NumPy infer the row count instead of hard-coding 891.
y_train = y_train.reshape(-1, 1)

In [None]:
# Sanity check: rows x columns of the design matrix after the bias column was added.
x_train.shape

In [None]:
# Sanity check: the labels should now be a single column.
y_train.shape

In [None]:
# Random starting point for the parameters, one row per column of x_train.
# A seeded generator makes the initialisation, and therefore the whole
# training run, reproducible across kernel restarts.
rng = np.random.default_rng(42)
thetas = rng.standard_normal((x_train.shape[1], 1))
thetas

In [None]:
def h(x, thetas):
    """Logistic hypothesis: the sigmoid of the linear score ``x @ thetas``.

    Parameters
    ----------
    x : array of features, one row per sample.
    thetas : array of parameters, compatible with ``x`` for ``x @ thetas``.

    Returns
    -------
    Array of values in [0, 1] with the shape of ``x @ thetas``.

    The sigmoid is evaluated in a numerically stable form. The direct
    ``1 / (1 + exp(-z))`` overflows in ``exp`` for strongly negative scores,
    which unscaled features such as Fare produce easily.
    """
    z = x @ thetas
    # exp() only ever sees a non-positive argument, so it cannot overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z). Both are the same function.
    return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

In [None]:
def loss(h, y):
    """Mean binary cross-entropy between predictions ``h`` and labels ``y``.

    Both arguments are used as column arrays with one row per sample. The
    result is the product of a row with a column, i.e. a 1x1 array for
    (m, 1) inputs.

    A small epsilon is added inside each logarithm so that a prediction of
    exactly 0 or 1 does not produce log(0).
    """
    epsilon = 1e-5
    sample_count = h.shape[0]
    log_positive = np.log(np.abs(h) + epsilon)
    log_negative = np.log(np.abs(1 - h) + epsilon)
    total = y.T @ log_positive + (1 - y).T @ log_negative
    return (-1 / sample_count) * total

In [None]:
# Loss of the untrained parameters, as a reference point before training.
prediction = h(x_train,thetas)
loss(prediction,y_train)

# Training

In [None]:
# Batch gradient descent on the cross-entropy loss.
# thetas is updated in place, so re-running this cell continues training from
# the current parameters rather than starting over.
losses = []
lr = 1e-3
epoch = 200
sample_count = y_train.shape[0]
for _ in range(epoch):
    my_h = h(x_train, thetas)
    # Record the loss of the predictions made before this step's update.
    epoch_loss = loss(my_h, y_train)
    losses.append(epoch_loss)
    gradient = x_train.T @ (my_h - y_train)
    thetas -= lr * (gradient / sample_count)

In [None]:
# Plot the loss recorded at every epoch.
# Each entry of `losses` is a 1x1 array, so the list is flattened to a plain
# 1-D vector before plotting. The figure is labeled so it can be read on its own.
fig, ax = plt.subplots()
ax.scatter(np.arange(epoch), np.ravel(losses))
ax.set(xlabel='Epoch', ylabel='Cross-entropy loss', title='Training loss per epoch');

In [None]:
# Peek at the preprocessed test set.
# The frame is named test_dataset; `test_data` is never defined and raised a NameError.
test_dataset.head()

# Validating on Train Dataset

In [None]:
# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
predicted_train = (h(x_train, thetas) > 0.5).astype(int)

In [None]:
# Count the training rows whose predicted label matches the true label.
matches = predicted_train == y_train
correct_count = matches.sum()
correct_count

In [None]:
# Training accuracy of the hand-written model, as a percentage.
train_accuracy = correct_count / len(x_train) * 100
train_accuracy

# Results on the Test Dataset

In [None]:
# Prepend the bias column of ones to the test features, as was done for x_train.
# Running this cell twice would add a second bias column.
bias_b = np.ones((len(x_test), 1), dtype=int)
x_test = np.hstack((bias_b, x_test))

In [None]:
# Predicted probabilities for the test rows, thresholded at 0.5 into 0/1 labels.
test_probabilities = h(x_test, thetas)
predicted = (test_probabilities > 0.5).astype(int)

In [None]:
# Flatten the (n, 1) column of labels into a 1-D Series.
flat_labels = predicted.reshape(-1)
predicted_data = pd.Series(flat_labels)

In [None]:
# Pair every test passenger id with the hand-written model's predicted label.
df = pd.DataFrame(
    {
        'PassengerId': test_dataset['PassengerId'],
        'Survived': predicted_data,
    }
)
df.head()

In [None]:
# Use PassengerId as the index so that it becomes the first column of the CSV.
df = df.set_index('PassengerId')
df.head()

In [None]:
# Write the hand-written model's predictions to out.csv in the working directory.
# NOTE: this is the same path the scikit-learn predictions were written to
# earlier in the notebook, so that file is overwritten here.
output_path = current_dir + r'/out.csv'
df.to_csv(output_path)