<a href="https://colab.research.google.com/github/BobbyLeeSH/tf_ml/blob/main/kaggle/titanic/kaggle_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic - First Kaggle Competition
## First, Import Libraries and Read Data



In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.info()
print('-'*30)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 no

It can be found that Age, Cabin and Embarked have some null value.
* should they be dropped? or fill in default value?
* Also, convert all data types either into float64 or categorical data
---
I think PassengerId, Name and Ticket are irrelevant, hence, I'm going to drop them in train data. In test data, we need PassengerId for prediction, so leave it.


In [4]:
train = train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test = test.drop(['Name','Ticket'], axis=1)

In [5]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### Now we need to have a look at each columns


1. Pclass
2. Sex
3. Age
4. SibSp
5. Parch
6. Fare
7. Cabin
8. Embarked

# 1. Pclass
Pclass is integer value but it is actually an Ordinal variable. Hence, will be converted into categorical variable using one-hot-encoding.

In [6]:
train['Pclass'].value_counts()
test['Pclass'].value_counts()

pclass_train_dummies = pd.get_dummies(train['Pclass'])
pclass_test_dummies = pd.get_dummies(test['Pclass'])

train.drop(['Pclass'], axis=1, inplace=True)
test.drop(['Pclass'], axis=1, inplace=True)

In [7]:
pclass_train_dummies.columns = ['First Class','Second Class','Third Class']
pclass_test_dummies.columns = ['First Class','Second Class','Third Class']

train = train.join(pclass_train_dummies)
test = test.join(pclass_test_dummies)

In [8]:
train.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,First Class,Second Class,Third Class
0,0,male,22.0,1,0,7.25,,S,0,0,1
1,1,female,38.0,1,0,71.2833,C85,C,1,0,0
2,1,female,26.0,0,0,7.925,,S,0,0,1
3,1,female,35.0,1,0,53.1,C123,S,1,0,0
4,0,male,35.0,0,0,8.05,,S,0,0,1


# 2. Sex
sex is a nominal variable and it should be in one-hot-encoding format

In [9]:
train['Sex'].value_counts()
test['Sex'].value_counts()

sex_train_dummies = pd.get_dummies(train['Sex'])
sex_test_dummies = pd.get_dummies(test['Sex'])

train.drop(['Sex'], axis=1, inplace=True)
test.drop(['Sex'], axis=1, inplace=True)

In [10]:
sex_train_dummies.columns = ['Female','Male']
sex_test_dummies.columns = ['Female','Male']

train = train.join(sex_train_dummies)
test = test.join(sex_test_dummies)

train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,Embarked,First Class,Second Class,Third Class,Female,Male
0,0,22.0,1,0,7.25,,S,0,0,1,0,1
1,1,38.0,1,0,71.2833,C85,C,1,0,0,1,0
2,1,26.0,0,0,7.925,,S,0,0,1,1,0
3,1,35.0,1,0,53.1,C123,S,1,0,0,1,0
4,0,35.0,0,0,8.05,,S,0,0,1,0,1


# 3. Age
It is continuous variable and one-hot-encoding is not required.
However, there are some NaN value and they need to be filled.
1. random
2. mean
3. median
4. drop
---
In this case, I'll try using mean age of train data to fill the NaN values in age for both train and test data.

In [11]:
train["Age"].fillna(train["Age"].mean() , inplace=True)
test["Age"].fillna(train["Age"].mean() , inplace=True)

train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Cabin,Embarked,First Class,Second Class,Third Class,Female,Male
0,0,22.0,1,0,7.25,,S,0,0,1,0,1
1,1,38.0,1,0,71.2833,C85,C,1,0,0,1,0
2,1,26.0,0,0,7.925,,S,0,0,1,1,0
3,1,35.0,1,0,53.1,C123,S,1,0,0,1,0
4,0,35.0,0,0,8.05,,S,0,0,1,0,1


# 4. SibSp & 5. Parch
Nothing much to change. so leave them.


# 6. Fare
There is one row missing Fare value in test Data. I'm just going to fill it with 0.

In [12]:
test["Fare"].fillna(0, inplace=True)

# 7. Cabin
There are many NaN value and I do think Cabin is quite irrelavent. Hence, I'm going to drop them.

In [13]:
train = train.drop(['Cabin'], axis=1)
test = test.drop(['Cabin'], axis=1)

train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Embarked,First Class,Second Class,Third Class,Female,Male
0,0,22.0,1,0,7.25,S,0,0,1,0,1
1,1,38.0,1,0,71.2833,C,1,0,0,1,0
2,1,26.0,0,0,7.925,S,0,0,1,1,0
3,1,35.0,1,0,53.1,S,1,0,0,1,0
4,0,35.0,0,0,8.05,S,0,0,1,0,1


# 8. Embarked
Embarked has some NaN Values but most of them are 'S' and they do not seem very relavent. Hence, I'm going to fill them with 'S'. Then,  one-hot-encoding will be carried out.

In [14]:
train["Embarked"].fillna('S', inplace=True)
test["Embarked"].fillna('S', inplace=True)

In [15]:
embarked_train_dummies = pd.get_dummies(train['Embarked'])
embarked_test_dummies = pd.get_dummies(test['Embarked'])

embarked_train_dummies.columns = ['S', 'C', 'Q']
embarked_test_dummies.columns = ['S', 'C', 'Q']

train.drop(['Embarked'], axis=1, inplace=True)
test.drop(['Embarked'], axis=1, inplace=True)

train = train.join(embarked_train_dummies)
test = test.join(embarked_test_dummies)

train.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,First Class,Second Class,Third Class,Female,Male,S,C,Q
0,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


The data is now ready for fit. I am going to check if there are any NaN values or if any data type is invalid. If all data is in right place, I'll just check with the correlation. Then, will divide the data into Feature and Label, and start fitting the model.

In [16]:
train.info()
print('-'*30)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Survived      891 non-null    int64  
 1   Age           891 non-null    float64
 2   SibSp         891 non-null    int64  
 3   Parch         891 non-null    int64  
 4   Fare          891 non-null    float64
 5   First Class   891 non-null    uint8  
 6   Second Class  891 non-null    uint8  
 7   Third Class   891 non-null    uint8  
 8   Female        891 non-null    uint8  
 9   Male          891 non-null    uint8  
 10  S             891 non-null    uint8  
 11  C             891 non-null    uint8  
 12  Q             891 non-null    uint8  
dtypes: float64(2), int64(3), uint8(8)
memory usage: 41.9 KB
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  -----

It is shown that there is no NaN values. Let's check the correlations.

In [17]:
train.corr()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,First Class,Second Class,Third Class,Female,Male,S,C,Q
Survived,1.0,-0.069809,-0.035322,0.081629,0.257307,0.285904,0.093349,-0.322308,0.543351,-0.543351,0.16824,0.00365,-0.149683
Age,-0.069809,1.0,-0.232625,-0.179191,0.091566,0.319916,0.006589,-0.281004,-0.084153,0.084153,0.032024,-0.013855,-0.019336
SibSp,-0.035322,-0.232625,1.0,0.414838,0.159651,-0.054582,-0.055932,0.092548,0.114631,-0.114631,-0.059528,-0.026354,0.068734
Parch,0.081629,-0.179191,0.414838,1.0,0.216225,-0.017633,-0.000734,0.01579,0.245489,-0.245489,-0.011069,-0.081228,0.060814
Fare,0.257307,0.091566,0.159651,0.216225,1.0,0.591711,-0.118557,-0.413333,0.182333,-0.182333,0.269335,-0.117216,-0.162184
First Class,0.285904,0.319916,-0.054582,-0.017633,0.591711,1.0,-0.288585,-0.626738,0.098013,-0.098013,0.296423,-0.155342,-0.161921
Second Class,0.093349,0.006589,-0.055932,-0.000734,-0.118557,-0.288585,1.0,-0.56521,0.064746,-0.064746,-0.125416,-0.127301,0.18998
Third Class,-0.322308,-0.281004,0.092548,0.01579,-0.413333,-0.626738,-0.56521,1.0,-0.137143,0.137143,-0.153329,0.237449,-0.015104
Female,0.543351,-0.084153,0.114631,0.245489,0.182333,0.098013,0.064746,-0.137143,1.0,-1.0,0.082853,0.074115,-0.119224
Male,-0.543351,0.084153,-0.114631,-0.245489,-0.182333,-0.098013,-0.064746,0.137143,-1.0,1.0,-0.082853,-0.074115,0.119224


Let's now proceed with dividing the data into Features and Labels

In [33]:
Y_train = train[["Survived"]]
X_train = train.drop("Survived",axis=1)
X_test  = test.drop("PassengerId",axis=1).copy()

In [34]:
print(X_train.shape)
print(Y_train.shape)
X_train.head()
Y_train.head()

(891, 12)
(891, 1)


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [35]:
X = tf.keras.layers.Input(shape=[12])
Y = tf.keras.layers.Dense(1)(X)
model = tf.keras.models.Model(X, Y)
model.compile(loss='mse')

In [36]:
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 13        
Total params: 13
Trainable params: 13
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.fit(X_train, Y_train, epochs=1000, verbose=0)
model.fit(X_train, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ffb9631a5f8>

In [38]:
X_test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,First Class,Second Class,Third Class,Female,Male,S,C,Q
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [39]:
print(X_test.shape)

(418, 12)


In [40]:
predicted = model.predict(X_test)

In [45]:
predicted.shape

(418, 1)

In [84]:
pid = test['PassengerId']
predicted = np.around(predicted)
predicted[:10]
evaluation=pid.to_frame()
evaluation["Survived"]=predicted
evaluation['Survived'] = evaluation['Survived'].astype(int) 
evaluation.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [86]:
from google.colab import files
evaluation.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>