In [106]:
import pandas as pd
import numpy as np
import os

In [107]:
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path,'train.csv')
test_file_path = os.path.join(processed_data_path,'test.csv')

In [108]:
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')

In [109]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_Z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [179]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            418 non-null    int64  
 1   Age                 418 non-null    float64
 2   Fare                418 non-null    float64
 3   FamilySize          418 non-null    int64  
 4   IsMother            418 non-null    int64  
 5   IsMale              418 non-null    int64  
 6   Deck_A              418 non-null    int64  
 7   Deck_B              418 non-null    int64  
 8   Deck_C              418 non-null    int64  
 9   Deck_D              418 non-null    int64  
 10  Deck_E              418 non-null    int64  
 11  Deck_F              418 non-null    int64  
 12  Deck_G              418 non-null    int64  
 13  Deck_Z              418 non-null    int64  
 14  Pclass_1            418 non-null    int64  
 15  Pclass_2            418 non-null    int64  
 16  Pclas

### Data Preperation

In [111]:
X = train_df.loc[:,'Age':].to_numpy().astype('float')
y = train_df['Survived'].ravel()

In [112]:
X.shape, y.shape

((891, 32), (891,))

In [228]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [229]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [230]:
print('mean survival in train : {0:.3f}'.format(np.mean(y_train)))
print('mean survival in test : {0:.3f}'.format(np.mean(y_test)))

mean survival in train : 0.383
mean survival in test : 0.385


Check Scikit learn version

In [183]:
import sklearn

In [184]:
sklearn.__version__

'0.24.2'

## Baseline model

In [185]:
from sklearn.dummy import  DummyClassifier

In [186]:
model_dummy = DummyClassifier(strategy='most_frequent',random_state=0)

In [187]:
model_dummy.fit(X_train, y_train)

DummyClassifier(random_state=0, strategy='most_frequent')

In [188]:
model_dummy.score(X_test, y_test) #Accuracy

0.6145251396648045

In [189]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [190]:
#Accuracy for baseline model
accuracy_score(y_test, model_dummy.predict(X_test))

0.6145251396648045

In [191]:
#Confusion matrix

confusion_matrix(y_test, model_dummy.predict(X_test))

array([[110,   0],
       [ 69,   0]], dtype=int64)

In [192]:
#precision Score
precision_score(y_test, model_dummy.predict(X_test))

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [193]:
#recall

recall_score(y_test, model_dummy.predict(X_test))

0.0

## First Kaggle Submission

In [223]:
test_X = test_df.to_numpy().astype('float')

In [224]:
predictions = model_dummy.predict(test_X)

In [198]:
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived':predictions})

In [199]:
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [200]:
submission_data_path = os.path.join(os.path.pardir,'data','external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')


In [204]:
df_submission.to_csv(submission_file_path, index=False)

## Logistic Regression Model

In [231]:
from sklearn.linear_model import  LogisticRegression
model_log_reg = LogisticRegression(solver='lbfgs', max_iter=5000)

In [232]:
model_log_reg.fit(X_train, y_train)

LogisticRegression(max_iter=5000)

In [238]:
test_X = test_df.loc[:, 'Age':].to_numpy().astype('float')

In [239]:
predictions = model_log_reg.predict(test_X)

In [240]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [242]:
model_log_reg.score(X_test,y_test)

0.8324022346368715

In [243]:
print(accuracy_score(y_test, model_log_reg.predict(X_test)))
print(confusion_matrix(y_test, model_log_reg.predict(X_test)))
print(precision_score(y_test, model_log_reg.predict(X_test)))
print(recall_score(y_test, model_log_reg.predict(X_test)))

0.8324022346368715
[[95 15]
 [15 54]]
0.782608695652174
0.782608695652174


In [244]:
model_log_reg.coef_

array([[-0.03274954,  0.00420884, -0.5272492 ,  0.64132787, -1.11663318,
        -0.01579198, -0.30562751, -0.52876864,  0.40171065,  0.97408762,
         0.28381144, -0.28733536, -0.51912336,  0.59087039,  0.12931061,
        -0.71721813,  0.23337057,  1.10273341,  0.15273357, -1.55607139,
         0.73349191, -0.15727937, -0.50601583, -0.12255718, -0.03890269,
         0.00402445,  0.16039829,  0.12078982,  0.11697678, -0.23480373,
        -0.18014387,  0.18310674]])

## Second Kaggle Submission

In [245]:
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived':predictions})
# #submission file
submission_data_path = os.path.join(os.path.pardir,'data','external')
submission_file_path = os.path.join(submission_data_path, '02_logreg.csv')
#write to the file
df_submission.to_csv(submission_file_path, index=False) 

In [163]:
get_submission_file(model_log_reg,'02_logisticregression.csv')