# TensorFlow: Logistic Regression

In [1]:
import tensorflow as tf
import numpy as np

import pandas as pd
from pandas import DataFrame as DF, Series

**Data Source:** https://www.kaggle.com/c/titanic/data


**Variable Definition Key**

**survival**	Survival	0 = No, 1 = Yes<br>
**pclass**	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd<br>
**sex**	Sex	<br>
**Age**	Age in years	<br>
**sibsp**	# of siblings / spouses aboard the Titanic	<br>
**parch**	# of parents / children aboard the Titanic	<br>
**ticket**	Ticket number	<br>
**fare**	Passenger fare	<br>
**cabin**	Cabin number	<br>
**embarked**	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

**Variable Notes**

pclass: A proxy for socio-economic status (SES)<br>
1st = Upper<br>
2nd = Middle<br>
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...<br>
Sibling = brother, sister, stepbrother, stepsister<br>
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...<br>
Parent = mother, father<br>
Child = daughter, son, stepdaughter, stepson<br>
Some children travelled only with a nanny, therefore parch=0 for them.<br>

In [2]:
import requests
data_url = 'https://storage.googleapis.com/kaggle-competitions-data/kaggle/3136/train.csv?GoogleAccessId=competitions-data@kaggle-161607.iam.gserviceaccount.com&Expires=1511997025&Signature=PEIvkHcr9xiKriHwS%2Fk2TzHDxBnlnDOLGP2sap%2FKeRObtkx8CRZPM45vEoVPxnT4q4faBGp4CHLsyS6zU309K%2F4RFq0e41HoqAOpj8vdSi0Uh6GqQGFgMfhvfEoxtTOhOKjcIi9Z51%2FswGttevmyUcjS6t2oePguBFRd5W7bn27u1dWBvMIB9GiGMmY0W0iopb7sPLAvur308QrP%2F97nl6i05NKljB1Myb02dGi3t14wvEPPew%2FD3mPjLsJsi8XEO209R8%2Fg1oWE3dyj5F9mB0DH7e8NKu%2F8EhW2mXijADPtMCn2mSoJLH%2F%2By7gc%2FtlMzN0KolYpg8plUvEI6EV8rg%3D%3D'

# get string of comma separated values
r = requests.get(data_url)
# create dataframe using read_csv on IO object
from io import StringIO
data = pd.read_csv(StringIO(r.content.decode('utf-8')))

In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [5]:
# save both to csv
data.to_csv('data.csv', index=False)

In [6]:
del data
import gc
gc.collect()

217

In [7]:
# read data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [9]:
data.shape

(891, 9)

In [10]:
data.fillna({'Age': -1,
             'Cabin': 'Unk',
             'Embarked': 'Unk',
             'Fare': -1},
            inplace=True);

### Very Basic - Hand Coded Logistic Regression

In [11]:
# convert sex binary
data.loc[:, 'Sex'] = (data.Sex == 'female').astype(int)

# train/test split
Xtr = data.loc[:, ['Pclass','Sex','Age','SibSp','Parch','Fare']].sample(frac=0.75)
Xts = data[~data.index.isin(Xtr.index)].loc[:, ['Pclass','Sex','Age','SibSp','Parch','Fare']]

# one-hot-encode Ytr and Yts (quick method)
Ytr = pd.get_dummies(data[data.index.isin(Xtr.index)].Survived).values
Yts = pd.get_dummies(data[~data.index.isin(Xtr.index)].Survived).values

In [None]:
import tensorflow as tf

# data format is as usual:
# Xtr and test_X have shape (num_instances, num_features)
# Ytr and test_Y have shape (num_instances, num_classes)
num_features = Xtr.shape[1]
num_classes = 2

# shape=[None, num_features] tells the model to accept different numbers of datapoints
X = tf.placeholder('float', [None, num_features])
Y = tf.placeholder('float', [None, num_classes])

# W - weights array
W = tf.Variable(tf.zeros([num_features, num_classes]))
# B - bias array
B = tf.Variable(tf.zeros([num_classes]))

# define the logistic model
# y=wx+b as argument of softmax
yhat = tf.nn.softmax(tf.matmul(X, W) + B)

# define a loss function
loss_fn = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=yhat, labels=Y))

# define optimizer and minimize on loss_fn
opt = tf.train.AdamOptimizer(0.01).minimize(loss_fn)

# create session
sess = tf.Session()

# init vars
init = tf.initialize_all_variables()
sess.run(init)

num_epochs = 10
# loop over num_epochs and run optimization step on
# full data each time
for i in range(num_epochs):
    sess.run(opt, feed_dict={X: Xtr, Y: Ytr})

# accuracy function
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(yhat, 1), tf.argmax(Y, 1)), 'float'))
# get the test accuracy
accuracy_value = sess.run(accuracy, feed_dict={X: Xts, Y: Yts})

In [13]:
accuracy_value

0.60089684

## Logistic Regression With Batching

### Input Function

In [14]:
# read data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [19]:
# define columns and default values
_csv_column_defaults = [[0],[-1],['Unk'],[-1.],[0],[0],[-1.],['Unk'],['Unk']]
_csv_columns = data.columns.tolist()

# define input function
def input_fn(csv_file, feature_names, batch_size=16, n_epochs=10, shuffle=False):
    def decode_csv(line):
        parsed_line = tf.decode_csv(line, _csv_column_defaults)
        features_dict = dict(zip(feature_names, parsed_line))
#         features_dict['Age'] = tf.to_int32(features_dict['Age'])
        labels = features_dict.pop('Survived') # removes this from dict
        return features_dict, labels
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=100*1024) # buffer 100KB

    dataset = (tf.data.TextLineDataset(csv_file) # Read text file
           .skip(1) # Skip header row
           .map(decode_csv, num_parallel_calls=3)) # Transform each elem by applying decode_csv fn

    dataset = dataset.batch(batch_size)  # create a batch of size `batch_size`
    dataset = dataset.repeat(n_epochs)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    
    return batch_features, batch_labels

### Handling Categorical Features

Using `tf.feature_column` is a way to map data to a model, as opposed to using feed dictionaries. It can be efficient and help with certain preprocessing tasks.

#### Base Categorical Features

In [20]:
# pclass = tf.feature_column.categorical_column_with_identity(
#     'Pclass', num_buckets=3)

sex = tf.feature_column.categorical_column_with_vocabulary_list(
    'Sex', vocabulary_list=['female','male','Unk'])

embarked = tf.feature_column.categorical_column_with_vocabulary_list(
    'Embarked', vocabulary_list=['S','C','Q','Unk'])

#### Base Continuous Features

In [21]:
age = tf.feature_column.numeric_column('Age')

# age_buckets = tf.feature_column.bucketized_column(
#     age, boundaries=[5.,10,18,25,35,45,55,65])

sib = tf.feature_column.numeric_column('SibSp')

parch = tf.feature_column.numeric_column('Parch')

fare = tf.feature_column.numeric_column('Fare')

### Define Model

In [22]:
columns = [age, sib, parch, fare, sex, embarked]

model_dir = 'lr_model'
model = tf.estimator.LinearClassifier(model_dir=model_dir,
                                      feature_columns=columns,
                                      optimizer=tf.train.AdamOptimizer())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_tf_random_seed': None, '_master': '', '_keep_checkpoint_max': 5, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_is_chief': True, '_task_id': 0, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_task_type': 'worker', '_num_worker_replicas': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1247d5c50>, '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_session_config': None, '_model_dir': 'lr_model'}


### Train Model

In [23]:
model.train(input_fn=lambda: input_fn('data.csv', _csv_columns))

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from lr_model/model.ckpt-1120
INFO:tensorflow:Saving checkpoints for 1121 into lr_model/model.ckpt.
INFO:tensorflow:step = 1121, loss = 6.64619
INFO:tensorflow:global_step/sec: 248.133
INFO:tensorflow:step = 1221, loss = 7.51727 (0.404 sec)
INFO:tensorflow:global_step/sec: 348.066
INFO:tensorflow:step = 1321, loss = 8.30055 (0.287 sec)
INFO:tensorflow:global_step/sec: 360.014
INFO:tensorflow:step = 1421, loss = 5.60117 (0.279 sec)
INFO:tensorflow:global_step/sec: 288.247
INFO:tensorflow:step = 1521, loss = 7.38559 (0.353 sec)
INFO:tensorflow:global_step/sec: 256.359
INFO:tensorflow:step = 1621, loss = 5.44131 (0.383 sec)
INFO:tensorflow:Saving checkpoints for 1680 into lr_model/model.ckpt.
INFO:tensorflow:Loss for final step: 6.01248.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1247d5ba8>

In [24]:
results = model.evaluate(input_fn=lambda: input_fn('data.csv', _csv_columns, n_epochs=1))

INFO:tensorflow:Starting evaluation at 2017-11-27-05:28:01
INFO:tensorflow:Restoring parameters from lr_model/model.ckpt-1680
INFO:tensorflow:Finished evaluation at 2017-11-27-05:28:03
INFO:tensorflow:Saving dict for global step 1680: accuracy = 0.79349, accuracy_baseline = 0.616162, auc = 0.828753, auc_precision_recall = 0.770835, average_loss = 0.492578, global_step = 1680, label/mean = 0.383838, loss = 7.83726, prediction/mean = 0.409476


In [25]:
results

{'accuracy': 0.79349047,
 'accuracy_baseline': 0.61616158,
 'auc': 0.82875299,
 'auc_precision_recall': 0.77083457,
 'average_loss': 0.49257779,
 'global_step': 1680,
 'label/mean': 0.38383839,
 'loss': 7.8372645,
 'prediction/mean': 0.40947571}