In [19]:
%pylab 
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from IPython.display import display

# Fix the seed of NumPy's global random generator so that code drawing from it
# produces the same numbers on every fresh run of the notebook.
SEED = 0
np.random.seed(SEED)

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


# Data & Preprocessing


| Variable | Definition | Key |
|:---------|:-----------|:----|
| survival | 생존 | 0 = No, 1 = Yes  | 
| pclass | 티켓 등급 | 1 = 1st, 2 = 2nd, 3 = 3rd | 
| sex  | 성별 |  | 
| Age | 나이 |  | 
| sibsp | 타이타닉에 함께 타고 있는 형제, 자매 또는 배우자 (인원수) |  | 
| parch | 타이타닉에 함께 타고 있는 부모 또는 자식 (인원수) |  | 
| ticket | 티켓 넘버 |  | 
| fare | 티켓 가격	 |  | 
| cabin  | 객실 번호 |  | 
| embarked | 출항지 | C = Cherbourg, Q = Queenstown, S = Southampton | 

In [2]:
# Load the training CSV and build the modelling frame in a single chain.
dataset = (
    pd.read_csv('./train.csv')
    # These four columns are not used as features.
    .drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    # Missing ages are replaced with 0.
    .assign(Age=lambda frame: frame['Age'].fillna(0))
    # One-hot encode the categorical columns.
    .pipe(pd.get_dummies, columns=['Pclass', 'Sex', 'Embarked'])
)

# Preview the first rows, then summarise dtypes and non-null counts.
display(dataset.head())
dataset.info()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
Survived      891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Pclass_1      891 non-null uint8
Pclass_2      891 non-null uint8
Pclass_3      891 non-null uint8
Sex_female    891 non-null uint8
Sex_male      891 non-null uint8
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(3), uint8(8)
memory usage: 41.8 KB


## Check Missing Values

In [3]:
# Flag every row that still holds at least one missing value and show the
# first few of them (an empty table means no such rows exist).
has_missing = dataset.isnull().any(axis=1)
dataset[has_missing].head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S


## Scaling

In [4]:
# Rescale the listed numeric columns with MinMaxScaler; no other column is touched.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so held-out rows influence the scaling ranges. Consider fitting on
# the training rows only.
SCALING_COLUMNS = ['Age', 'SibSp', 'Parch', 'Fare']

scaler = MinMaxScaler()
numeric_part = dataset[SCALING_COLUMNS]
dataset[SCALING_COLUMNS] = scaler.fit(numeric_part).transform(numeric_part)
dataset.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,0.275,0.125,0.0,0.014151,0,0,1,0,1,0,0,1
1,1,0.475,0.125,0.0,0.139136,1,0,0,1,0,1,0,0
2,1,0.325,0.0,0.0,0.015469,0,0,1,1,0,0,0,1
3,1,0.4375,0.125,0.0,0.103644,1,0,0,1,0,0,0,1
4,0,0.4375,0.0,0.0,0.015713,0,0,1,0,1,0,0,1


## Split Train and Test dataset

In [5]:
# Separate X (every column except the label) and Y (the 'Survived' label).
# `.values` is used instead of `DataFrame.as_matrix()`, which is deprecated and
# no longer exists in pandas >= 1.0; both return the underlying NumPy array.
data_x = dataset.loc[:, dataset.columns != 'Survived'].values
data_y = dataset['Survived'].values

# Hold out 30% of the rows for evaluation. Passing random_state pins the
# shuffle, so re-running this cell on its own yields the same partition instead
# of depending on how much of the global NumPy RNG has already been consumed.
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=0.3, random_state=0)

# Report the shape of each split.
print('train_x:', train_x.shape)
print('train_y:', train_y.shape)
print('test_x:', test_x.shape)
print('test_y:', test_y.shape)

train_x: (623, 12)
train_y: (623,)
test_x: (268, 12)
test_y: (268,)


# Scikit-learn

## Train

In [16]:
# Fit an L2-penalised logistic regression on the training split.
lr = LogisticRegression(penalty='l2', max_iter=1000)
lr.fit(train_x, train_y)
print(lr)  # fit() returns the estimator itself, so this prints the same repr

# Score the fitted model on the same rows it was trained on.
train_accuracy = lr.score(train_x, train_y)
print('accuracy:', train_accuracy)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
accuracy: 0.794542536116


## Evaluate

In [17]:
# Score the fitted model on the held-out test split.
test_accuracy = lr.score(test_x, test_y)
print('accuracy:', test_accuracy)

accuracy: 0.805970149254


# TensorFlow

In [25]:
# Placeholder for a batch of feature rows: float32, any batch size, 12 columns.
inputs = tf.placeholder(tf.float32, shape=(None, 12))
# Placeholder for the matching labels: int16, one column per row.
# NOTE(review): train_y / test_y are printed earlier as 1-D arrays, so they
# would presumably need reshaping to (-1, 1) before being fed here. Confirm
# against the feed code.
outputs = tf.placeholder(tf.int16, shape=(None, 1))