In [1]:
!pip install -q kaggle --upgrade

In [2]:
%tensorflow_version 2.x
import tensorflow.compat.v2.feature_column as fc
import tensorflow.compat.v1.saved_model as saved_model
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder

from __future__ import absolute_import, division, print_function, unicode_literals
from IPython.display import clear_output
from six.moves import urllib
from copy import deepcopy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Single seed shared by every library with a global RNG so reruns are comparable.
SEED = 197
np.random.seed(SEED)
tf.random.set_seed(SEED)
print(tf.__version__)

2.5.0


# Loading data

In [4]:
import io
import json
import os
from google.colab import files

# Ask for the Kaggle API token only when no kaggle.json file is present in
# the current working directory.
if not os.path.isfile("kaggle.json"):
  uploaded = files.upload()
# Wipe whatever this cell has printed so far (e.g. the upload widget).
clear_output()

In [5]:
%%bash
# Copy the uploaded API token to ~/.kaggle/kaggle.json unless it is already there.
# "~" is not expanded inside double quotes, so the path is spelled with $HOME.
if test -f "$HOME/.kaggle/kaggle.json"; then
    echo "kaggle.json exists."
else
    echo "kaggle.json does not exists"
    # Chain the steps so the success message is printed only if every step worked.
    mkdir -p "$HOME/.kaggle/" &&
    cp kaggle.json "$HOME/.kaggle/kaggle.json" &&
    chmod 600 "$HOME/.kaggle/kaggle.json" &&
    echo "Successfully placed in directory"
fi


kaggle.json does not exists
Successfully placed in directory


In [6]:
%%bash
# Download the Titanic competition files into ./data unless it already has content.
# stderr of ls is silenced so a missing ./data directory on the first run is
# treated as "empty" without printing an error.
if [ "$(ls -A data 2>/dev/null)" ]; then
    echo "kaggle competitions data already exists"
else
    echo "kaggle competitions data does not exists"
    # NOTE(review): "-cq" bundles two short options; confirm the installed kaggle CLI
    # parses it as intended and that it delivers loose *.csv files (not an archive).
    kaggle competitions download -cq titanic
    mkdir -p data/
    # Report success only when the CSV files were actually moved.
    mv *.csv data/ && echo "Successfully placed in directory"
fi
echo
echo "Data: "
ls data

kaggle competitions data already exists

Data: 
gender_submission.csv
test.csv
train.csv


In [7]:
# Read both CSV files from ./data into dataframes, then preview the first
# rows of the training frame as the cell output.
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Analysis

**Linear analysis**: for data points that are linearly correlated in $\mathbb{R}^{n}$.

# Building model

## Helper class

In [8]:
class PreProcessHelper():
  """Dataframe preprocessing utilities: dtype handling, scaling, splitting
  and TensorFlow feature-column construction.

  The helper keeps no state of its own; every method works on the frame it
  is given.
  """

  def __init__(self):
    pass

  def create_categorical_type(self, dataframe, dtypes):
    """Build a ``CategoricalDtype`` for every column declared as ``'object'``.

    Args:
      dataframe: frame whose columns are inspected.
      dtypes: iterable of ``(column, dtype_string)`` pairs.

    Returns:
      dict mapping column name -> ``pd.api.types.CategoricalDtype`` whose
      categories are the column's distinct non-null values, in order of
      first appearance.
    """
    categorical_types = {}
    for column, dtype in dtypes:
      if dtype == 'object':
        # CategoricalDtype refuses null categories, so missing values are
        # dropped here; they stay missing once the column is cast.
        categories = list(dataframe[column].dropna().unique())
        categorical_types[column] = pd.api.types.CategoricalDtype(categories=categories)
    return categorical_types

  def removeUnused(self, dataframe, unused):
    """Drop the ``unused`` columns from ``dataframe`` in place and return it.

    Nothing is dropped when ``unused`` is empty/falsy.
    """
    if(unused):
      dataframe.drop(unused, axis = 1, inplace=True)
    return dataframe

  def preprocess(self, dataframe, _CATEGORICAL_TYPES, model_type):
    """Cast columns to model-friendly dtypes (modifies ``dataframe``).

    Args:
      dataframe: frame to convert; its columns are reassigned.
      _CATEGORICAL_TYPES: dict column -> ``CategoricalDtype`` used for the
        object columns (must contain every object column).
      model_type: when ``'neural'`` the categorical columns are replaced by
        their integer category codes (missing values become -1).

    Returns:
      The same ``dataframe`` object.
    """
    # int64 columns become float32.
    num_cols = dataframe.select_dtypes(['int64']).columns
    dataframe[num_cols] = dataframe[num_cols].astype('float32')

    # object columns become pandas categoricals with the supplied dtypes.
    cat_cols = dataframe.select_dtypes(['object']).columns
    dataframe[cat_cols] = dataframe[cat_cols].apply(lambda x: x.astype(_CATEGORICAL_TYPES[x.name]))

    if(model_type == 'neural'):
      dataframe[cat_cols] = dataframe[cat_cols].apply(lambda x: x.cat.codes)

    return dataframe

  def standardize_data(self, dataframe, dtypes, response, standardize):
    """Center and scale the float32 feature columns.

    Args:
      dataframe: frame holding features and the response column(s).
      dtypes: iterable of ``(column, dtype_string)`` pairs naming the
        candidate columns.
      response: label (or list of labels) of the response column(s); these
        are never scaled.
      standardize: when falsy the input frame is returned untouched.

    Returns:
      A new frame with the feature columns first and the response last, or
      ``dataframe`` itself when ``standardize`` is falsy.
    """
    if not standardize:
      return dataframe

    dataframe_x = dataframe.drop(response, axis = 1)
    dataframe_y = dataframe[response]
    for column, declared in dtypes:
      # The dtype list also names the response (and possibly columns that
      # are no longer present); only existing feature columns are scaled.
      if column not in dataframe_x.columns:
        continue
      # The declared dtype may have been recorded before the frame was cast
      # to float32, so the column's current dtype is consulted as well.
      if declared != 'float32' and str(dataframe_x[column].dtype) != 'float32':
        continue
      centered = dataframe_x[column] - dataframe_x[column].mean()
      spread = centered.std()
      # A constant (or single-row) column has zero/undefined spread; it is
      # only centered, since dividing would fill it with NaN.
      if pd.notna(spread) and spread != 0:
        centered = centered / spread
      dataframe_x[column] = centered
    return pd.concat([dataframe_x, dataframe_y], axis = 1)

  def split_data(self, dataframe, response):
    """Split a frame indexed by ``'train'``/``'eval'`` keys into X/y parts.

    Args:
      dataframe: frame whose outer index level holds the keys ``'train'``
        and ``'eval'`` (e.g. built with ``pd.concat(..., keys=[...])``).
      response: label of the response column.

    Returns:
      Tuple ``(train_x, train_y, eval_x, eval_y)``.
    """
    train_part = dataframe.xs('train')
    eval_part = dataframe.xs('eval')
    train_x, train_y = train_part.drop(response, axis = 1), train_part[response]
    eval_x, eval_y = eval_part.drop(response, axis = 1), eval_part[response]
    return train_x, train_y, eval_x, eval_y

  def get_feature_columns(self, train_x, _CATEGORICAL_TYPES):
    """Create TensorFlow feature columns for ``train_x``.

    Every key of ``_CATEGORICAL_TYPES`` (which must be a column of
    ``train_x``) yields a vocabulary-list categorical column built from the
    values observed in ``train_x``; every float32 column yields a numeric
    column.

    Returns:
      list of feature columns, categorical ones first.
    """
    feature_column = []
    num_cols = list(train_x.select_dtypes(['float32']).columns)
    for key in _CATEGORICAL_TYPES:
      vocabulary = train_x[key].unique()
      feature_column.append(tf.feature_column.categorical_column_with_vocabulary_list(key, vocabulary))
    for col in num_cols:
      feature_column.append(tf.feature_column.numeric_column(col, dtype=tf.dtypes.float32))
    return feature_column

## Model class

In [9]:
class SurvivalClassifier_NN(PreProcessHelper):
  """Prepares train/eval dataframes for a neural-network survival classifier.

  Construction copies both input frames, stacks them under the index keys
  ``'train'`` and ``'eval'``, drops the unused columns, converts dtypes,
  optionally standardizes, and finally splits the result back into
  ``train_x``, ``train_y``, ``eval_x`` and ``eval_y``.

  Args:
    df_train: training dataframe (copied, never modified).
    df_eval: evaluation dataframe (copied, never modified).
    response: label of the response column.
    unused: column labels to drop before preprocessing; ``None`` (default)
      means nothing is dropped.
    standardize: forwarded to ``standardize_data``.
    model_type: forwarded to ``preprocess``.
  """

  def __init__(self, df_train, df_eval, response, unused = None, standardize = True, model_type = 'neural'):
    print("Neural-Net Survival Classifier")
    self.df_train = deepcopy(df_train)
    self.df_eval = deepcopy(df_eval)
    # A None sentinel (instead of a shared ``[]`` default) gives every
    # instance its own empty list.
    self.unused = [] if unused is None else unused
    self.response = response
    self.standardize = standardize
    self.model_type = model_type

    # Stack both splits so they go through identical preprocessing; the keys
    # are what split_data() uses to separate them again.
    self.dataframe = pd.concat([self.df_train, self.df_eval], keys=['train', 'eval'])
    # Remove unused columns
    self.dataframe = self.removeUnused(self.dataframe, self.unused)

    # (column, dtype-name) pairs.
    # NOTE(review): this snapshot is taken before preprocess() runs, so it
    # records the dtypes as loaded, not the converted ones.
    self.dtypes = list(zip(self.dataframe.dtypes.index, map(str, self.dataframe.dtypes)))

    self._CATEGORICAL_TYPES = self.create_categorical_type(self.dataframe, self.dtypes)

    self.dataframe = self.preprocess(self.dataframe, self._CATEGORICAL_TYPES, self.model_type)
    self.dataframe = self.standardize_data(self.dataframe, self.dtypes, self.response, self.standardize)

    self.train_x, self.train_y, self.eval_x, self.eval_y = self.split_data(self.dataframe, self.response)


    
    




In [10]:
# modelClass = SurvivalClassifier_NN(df_train=train_df, df_eval=test_df, response='Survived')

In [11]:
# train_df.dtypes