In [1]:
import pandas as pd

# Load the Kaggle Titanic training set, then keep only the outcome column
# (Survived) and the ticket-class column (Pclass) for the analysis below.
titanic = pd.read_csv(
    'https://storage.googleapis.com/kaggle_datasets/Titanic-Machine-Learning-from-Disaster/train.csv'
)
titanic_data = titanic[['Survived', 'Pclass']]

In [2]:
# Contingency table: rows are Survived values, columns are Pclass values.
survived_col, pclass_col = titanic_data['Survived'], titanic_data['Pclass']
cross_table = pd.crosstab(survived_col, pclass_col)
cross_table

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,80,97,372
1,136,87,119


<img src="./dataset/posterior_probability_titanic.png">

In [3]:
# Summarise the contingency table once, then read every count off the summaries.
class_totals = cross_table.sum()     # column sums: passengers per Pclass
survived_row = cross_table.iloc[1]   # second row of the table (Survived == 1)

passenger_num = class_totals.sum()
survived_num = survived_row.sum()
pclass1_num, pclass2_num, pclass3_num = (class_totals.loc[c] for c in (1, 2, 3))

# Joint counts: survivors inside each ticket class.
survived_pclass1_num, survived_pclass2_num, survived_pclass3_num = (
    survived_row.loc[c] for c in (1, 2, 3)
)

# Marginals P(Pclass = k).
p_pclass1 = pclass1_num / passenger_num
p_pclass2 = pclass2_num / passenger_num
p_pclass3 = pclass3_num / passenger_num

# Marginal P(Survived).
p_survived = survived_num / passenger_num

# Likelihoods P(Pclass = k | Survived).
p_pclass1_given_survived = survived_pclass1_num / survived_num
p_pclass2_given_survived = survived_pclass2_num / survived_num
p_pclass3_given_survived = survived_pclass3_num / survived_num

In [4]:
# Bayes' theorem:
#   P(Survived | Pclass = k) = P(Pclass = k | Survived) * P(Survived) / P(Pclass = k)
p_survived_given_pclass1 = (p_pclass1_given_survived * p_survived) / p_pclass1
p_survived_given_pclass2 = (p_pclass2_given_survived * p_survived) / p_pclass2
p_survived_given_pclass3 = (p_pclass3_given_survived * p_survived) / p_pclass3

# One posterior per line, in class order 1, 2, 3.
print(
    p_survived_given_pclass1,
    p_survived_given_pclass2,
    p_survived_given_pclass3,
    sep='\n',
)

0.6296296296296295
0.47282608695652173
0.24236252545824846


In [5]:
from sklearn.model_selection import train_test_split

# Reload the raw data and build a single-feature design matrix from Pclass.
titanic = pd.read_csv(
    'https://storage.googleapis.com/kaggle_datasets/Titanic-Machine-Learning-from-Disaster/train.csv'
)
x = titanic['Pclass'].to_numpy().reshape(-1, 1)  # reshape to (number of samples, number of features)
y = titanic['Survived'].to_numpy()

# Hold out 33% of the rows for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Put the training split back into a frame so it can be cross-tabulated.
train_df = pd.DataFrame({
    'Survived': y_train.tolist(),
    'Pclass': x_train.ravel(),
})

cross_table_train = pd.crosstab(index=train_df['Survived'], columns=train_df['Pclass'])
cross_table_train

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,55,64,255
1,80,61,81


In [6]:
# Same computation as for the full data set, now restricted to the training split.
class_totals = cross_table_train.sum()     # column sums: passengers per Pclass
survived_row = cross_table_train.iloc[1]   # second row of the table (Survived == 1)

passenger_num = class_totals.sum()
survived_num = survived_row.sum()
pclass1_num, pclass2_num, pclass3_num = (class_totals.loc[c] for c in (1, 2, 3))
pclass1_survived, pclass2_survived, pclass3_survived = (
    survived_row.loc[c] for c in (1, 2, 3)
)

# Marginals P(Pclass = k).
p_pclass1 = pclass1_num / passenger_num
p_pclass2 = pclass2_num / passenger_num
p_pclass3 = pclass3_num / passenger_num

# Marginal P(Survived).
p_survived = survived_num / passenger_num

# Likelihoods P(Pclass = k | Survived).
p_pclass1_given_survived = pclass1_survived / survived_num
p_pclass2_given_survived = pclass2_survived / survived_num
p_pclass3_given_survived = pclass3_survived / survived_num

# Posteriors via Bayes' theorem: P(Survived | Pclass = k).
p_survived_given_pclass1 = (p_pclass1_given_survived * p_survived) / p_pclass1
p_survived_given_pclass2 = (p_pclass2_given_survived * p_survived) / p_pclass2
p_survived_given_pclass3 = (p_pclass3_given_survived * p_survived) / p_pclass3

In [7]:
# Show the three posteriors, one per line, in class order 1, 2, 3.
print(
    p_survived_given_pclass1,
    p_survived_given_pclass2,
    p_survived_given_pclass3,
    sep='\n',
)

0.5925925925925926
0.4880000000000001
0.24107142857142858


In [8]:
import numpy as np

def get_y_predict(x_validation, p_survived_given_pclass1, p_survived_given_pclass2, p_survived_given_pclass3, random_state=None):
    """Sample a 0/1 survival prediction for every row from its class posterior.

    Each row is an independent Bernoulli draw: the prediction is 1 with
    probability P(Survived | Pclass) of that row's ticket class, else 0.

    Parameters
    ----------
    x_validation : 2-D array whose first column holds the Pclass value (1, 2 or 3).
    p_survived_given_pclass1, p_survived_given_pclass2, p_survived_given_pclass3 :
        Survival probability used for rows of class 1, 2 and 3 respectively.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState`` so results are
        reproducible. ``None`` (the default) draws from NumPy's global
        random state.

    Returns
    -------
    numpy.ndarray of int with one 0/1 entry per row of ``x_validation``.

    Raises
    ------
    ValueError
        If a row's class is not 1, 2 or 3. Without this check such a row
        would either fail on an unassigned variable or silently reuse the
        prediction drawn for the previous row.
    """
    survival_prob_by_class = {
        1: p_survived_given_pclass1,
        2: p_survived_given_pclass2,
        3: p_survived_given_pclass3,
    }
    # The np.random module and a RandomState instance both expose ``choice``.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    y_predict = np.zeros(x_validation.shape[0], dtype=int)
    for i in range(y_predict.size):
        pclass = x_validation[i, 0]
        if pclass not in survival_prob_by_class:
            raise ValueError(
                "Unsupported Pclass value {!r} at row {}; expected 1, 2 or 3".format(pclass, i)
            )
        p_survived = survival_prob_by_class[pclass]
        y_predict[i] = rng.choice([0, 1], p=[1 - p_survived, p_survived])

    return y_predict

In [9]:
# Draw one sampled prediction per test passenger from the class posteriors.
y_predict = get_y_predict(
    x_test,
    p_survived_given_pclass1,
    p_survived_given_pclass2,
    p_survived_given_pclass3,
)
# First ten predictions on one line, first ten true labels on the next.
print(y_predict[:10], y_test[:10], sep='\n')

[1 1 0 1 0 1 0 0 0 1]
[1 0 0 1 1 1 1 0 1 1]


In [10]:
# Accuracy = share of test rows whose prediction equals the true label.
matches = y_predict == y_test
n_tp_tn = matches.sum()
acc = n_tp_tn / y_predict.size
print('Accuracy score: {:.2f}%'.format(100 * acc))

Accuracy score: 55.59%


<img src="./dataset/confusion_matrix.jpeg">

In [11]:
def get_tp(predicted=None, actual=None):
    """Count true positives: rows predicted 1 whose true label is also 1.

    Parameters
    ----------
    predicted : array-like of 0/1 predictions, optional.
        Defaults to the notebook-level ``y_predict``.
    actual : array-like of 0/1 true labels, optional.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    predicted = np.asarray(y_predict if predicted is None else predicted)
    actual = np.asarray(y_test if actual is None else actual)
    if predicted.shape != actual.shape:
        raise ValueError("predicted and actual must have the same shape")
    return int(np.sum((predicted == 1) & (actual == 1)))

def get_tn(predicted=None, actual=None):
    """Count true negatives: rows predicted 0 whose true label is also 0.

    Parameters
    ----------
    predicted : array-like of 0/1 predictions, optional.
        Defaults to the notebook-level ``y_predict``.
    actual : array-like of 0/1 true labels, optional.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    predicted = np.asarray(y_predict if predicted is None else predicted)
    actual = np.asarray(y_test if actual is None else actual)
    if predicted.shape != actual.shape:
        raise ValueError("predicted and actual must have the same shape")
    return int(np.sum((predicted == 0) & (actual == 0)))


def get_fp(predicted=None, actual=None):
    """Count false positives: rows predicted 1 whose true label is not 1.

    Parameters
    ----------
    predicted : array-like of 0/1 predictions, optional.
        Defaults to the notebook-level ``y_predict``.
    actual : array-like of 0/1 true labels, optional.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    predicted = np.asarray(y_predict if predicted is None else predicted)
    actual = np.asarray(y_test if actual is None else actual)
    if predicted.shape != actual.shape:
        raise ValueError("predicted and actual must have the same shape")
    return int(np.sum((predicted == 1) & (actual != 1)))

def get_fn(predicted=None, actual=None):
    """Count false negatives: rows predicted 0 whose true label is not 0.

    Parameters
    ----------
    predicted : array-like of 0/1 predictions, optional.
        Defaults to the notebook-level ``y_predict``.
    actual : array-like of 0/1 true labels, optional.
        Defaults to the notebook-level ``y_test``.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    predicted = np.asarray(y_predict if predicted is None else predicted)
    actual = np.asarray(y_test if actual is None else actual)
    if predicted.shape != actual.shape:
        raise ValueError("predicted and actual must have the same shape")
    return int(np.sum((predicted == 0) & (actual != 0)))

# Fill in the four confusion-matrix cells for the current predictions.
tn, tp, fp, fn = get_tn(), get_tp(), get_fp(), get_fn()

<img src="./dataset/confusion_matrix2.jpeg">

In [12]:
# Evaluation metrics derived from the confusion-matrix counts.
# The predictions are random draws, so a denominator can be zero (for example,
# no row was predicted 1). In that case the metric is reported as 0.0 instead
# of raising ZeroDivisionError.
n_samples = len(y_predict)
accuracy = (tp+tn)/n_samples if n_samples else 0.0
precision = tp/(tp+fp) if (tp+fp) else 0.0
recall = tp/(tp+fn) if (tp+fn) else 0.0
# F1 is the harmonic mean of precision and recall; it is 0 when either is 0.
f1_score = 2/((1/precision)+(1/recall)) if (precision and recall) else 0.0

In [13]:
# Bare expression: the notebook renders this value as the cell's output.
accuracy

0.5559322033898305

In [14]:
# Report the metrics one per line: accuracy, precision, recall, F1.
print(accuracy, precision, recall, f1_score, sep='\n')

0.5559322033898305
0.45925925925925926
0.5166666666666667
0.4862745098039216
