In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from sklearn.model_selection import train_test_split

### Import and clean the data

In [2]:
# Load the labelled training data and drop the columns this notebook does not model.
dftrain = pd.read_csv('../input/titanic/train.csv')
dftrain = dftrain.drop(columns=['Cabin','Embarked','Ticket','Name'])

# Imputation values are computed once, from the training data only, and applied
# column by column. A blanket `fillna(<Age mean>)` would also write the Age mean
# into any *other* column that has missing values (e.g. a missing Fare), which is
# not a sensible value for that column.
fill_values = {
    'Age': dftrain['Age'].mean(),
    'Fare': dftrain['Fare'].median(),
}
dftrain = dftrain.fillna(fill_values)

# The unlabelled test frame gets the same column drops and the same
# training-derived fill values, so no statistics are taken from the test data.
dfvalid = pd.read_csv('../input/titanic/test.csv')
dfvalid = dfvalid.drop(columns=['Cabin','Embarked','Ticket','Name'])
dfvalid = dfvalid.fillna(fill_values)

# head(-1) returns every row except the last one; pandas abbreviates the printout.
print(dftrain.head(-1))
print(dfvalid.head(-1))

     PassengerId  Survived  Pclass     Sex        Age  SibSp  Parch     Fare
0              1         0       3    male  22.000000      1      0   7.2500
1              2         1       1  female  38.000000      1      0  71.2833
2              3         1       3  female  26.000000      0      0   7.9250
3              4         1       1  female  35.000000      1      0  53.1000
4              5         0       3    male  35.000000      0      0   8.0500
..           ...       ...     ...     ...        ...    ...    ...      ...
885          886         0       3  female  39.000000      0      5  29.1250
886          887         0       2    male  27.000000      0      0  13.0000
887          888         1       1  female  19.000000      0      0  30.0000
888          889         0       3  female  29.699118      1      2  23.4500
889          890         1       1    male  26.000000      0      0  30.0000

[890 rows x 8 columns]
     PassengerId  Pclass     Sex        Age  SibSp  

### Split Data

In [3]:
# Detach the label column. `pop` removes 'Survived' from dftrain in place, so
# dftrain holds only feature columns from here on (and this cell cannot be
# re-run without first re-running the loading cell).
y = dftrain.pop('Survived')

# Fixed 80/20 train/validation split; random_state pins the shuffle.
X_dftrain_train, X_dftrain_valid, y_train, y_valid = train_test_split(
    dftrain,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

print(dftrain.head(-1))

     PassengerId  Pclass     Sex        Age  SibSp  Parch     Fare
0              1       3    male  22.000000      1      0   7.2500
1              2       1  female  38.000000      1      0  71.2833
2              3       3  female  26.000000      0      0   7.9250
3              4       1  female  35.000000      1      0  53.1000
4              5       3    male  35.000000      0      0   8.0500
..           ...     ...     ...        ...    ...    ...      ...
885          886       3  female  39.000000      0      5  29.1250
886          887       2    male  27.000000      0      0  13.0000
887          888       1  female  19.000000      0      0  30.0000
888          889       3  female  29.699118      1      2  23.4500
889          890       1    male  26.000000      0      0  30.0000

[890 rows x 7 columns]


### Define feature columns

In [4]:
# Show the dtype of each remaining column before the feature columns are defined.
dftrain.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

In [5]:
# Columns encoded through a vocabulary of their distinct values.
CATEGORICAL_COLUMNS = [
    'Sex',
    'SibSp',
    'Parch',
    'Pclass',
]

# Columns fed to the model as plain floating-point values.
NUMERIC_COLUMNS = [
    'Age',
    'Fare',
]

In [6]:
# Categorical features first: each gets a vocabulary made of the distinct values
# observed in dftrain for that column.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, dftrain[column_name].unique()
    )
    for column_name in CATEGORICAL_COLUMNS
]

# Numeric features follow, in the order listed in NUMERIC_COLUMNS, as float32.
feature_columns += [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
]

print(feature_columns)

[VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='SibSp', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='Parch', vocabulary_list=(0, 1, 2, 5, 3, 4, 6), dtype=tf.int64, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='Pclass', vocabulary_list=(3, 1, 2), dtype=tf.int64, default_value=-1, num_oov_buckets=0), NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


### Input function

In [7]:
def make_input_fn(data_df,label_df, num_epocs=2000,shuffle=True, batch_size= 32, shuffle_buffer_size=2000):
  """Build a zero-argument input function that yields a batched tf.data.Dataset.

  Args:
    data_df: Feature table; `dict(data_df)` is used to map each column name
      to its values.
    label_df: Labels, sliced in step with the rows of `data_df`.
    num_epocs: How many times the batched dataset is repeated. The spelling
      is kept as-is because callers pass it by keyword.
    shuffle: When true, elements are shuffled before batching.
    batch_size: Number of elements per batch.
    shuffle_buffer_size: Buffer size handed to `Dataset.shuffle`. Defaults to
      2000, the value that was previously hard-coded; pass a value at least as
      large as the dataset if a full shuffle is wanted on bigger inputs.

  Returns:
    A callable taking no arguments that constructs and returns the dataset.
  """
  def input_function():
    # Pair the column-name -> values mapping with the labels, one element per row.
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(shuffle_buffer_size)
    # Batch first, then repeat, so every epoch replays the same batching.
    return ds.batch(batch_size).repeat(num_epocs)
  return input_function
  
# Training input: relies on make_input_fn's defaults for epochs, shuffling and batch size.
train_input_fn = make_input_fn(data_df=X_dftrain_train, label_df=y_train)

# Validation input: one unshuffled pass over the held-out split.
eval_input_fn = make_input_fn(
    data_df=X_dftrain_valid,
    label_df=y_valid,
    num_epocs=1,
    shuffle=False,
)

### Creating the linear classifier model

In [8]:
# Linear classifier estimator built on the feature columns assembled earlier in the notebook.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

### Training the model

In [9]:
# Fit on the training split, then score the held-out validation split.
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

# Print the full metrics dict first, then the accuracy on its own line.
for report in (result, result['accuracy']):
    print(report)


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

{'accuracy': 0.81005585, 'accuracy_baseline': 0.61452514, 'auc': 0.8720026, 'auc_precision_recall': 0.8545147, 'average_loss': 0.4144761, 'label/mean': 0.38547486, 'loss': 0.4107461, 'precision': 0.7692308, 'prediction/mean': 0.41060105, 'recall': 0.7246377, 'global_step': 46000}
0.81005585


In [10]:
# Input function over the unlabelled test frame: a single pass (num_epochs=1) with
# shuffle=False, so predictions come back in dfvalid's row order. No labels are supplied.
test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(x=dfvalid, num_epochs=1, shuffle=False)

### Make predictions

In [11]:
# Materialise every prediction dict produced for the test frame.
pred_dicts = [*linear_est.predict(input_fn=test_input_fn)]

# Take the first entry of 'class_ids' from each prediction dict. Note that despite
# its name, `probs` therefore holds predicted class ids, not probabilities; the
# name is kept because a later cell reads it.
probs = pd.Series([entry['class_ids'][0] for entry in pred_dicts])
print(probs)

0      0
1      0
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Length: 418, dtype: int64


### Save results

In [12]:
# Build the submission table: a copy of the PassengerId column with the
# predictions attached as 'Survived' (aligned on the row index).
output = dfvalid[['PassengerId']].assign(Survived=probs)

# Write the submission file without the index column, then preview it.
output.to_csv('output_submission.csv', index=False)
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
