In [210]:
import string
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler

In [211]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [212]:
def print_dataframe_information(df):
    print(df.shape)
    print(df.columns.values)
    print(df.isnull().sum())
    return

In [213]:
print_dataframe_information(train_data)

(891, 12)
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [214]:
print_dataframe_information(test_data)

(418, 11)
['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked']
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [215]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    return np.nan

In [216]:
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

In [217]:
train_data['Title'] = train_data['Name'].map(lambda x: substrings_in_string(x, title_list))
test_data['Title'] = test_data['Name'].map(lambda x: substrings_in_string(x, title_list))

In [218]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [219]:
train_data['Title'] = train_data.apply(replace_titles, axis=1)
test_data['Title'] = test_data.apply(replace_titles, axis=1)

In [220]:
print_dataframe_information(train_data)

(891, 13)
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked' 'Title']
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64


In [221]:
print_dataframe_information(test_data)

(418, 12)
['PassengerId' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare'
 'Cabin' 'Embarked' 'Title']
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Title            0
dtype: int64


In [222]:
train_data['Family_Size']=train_data['SibSp']+train_data['Parch']
test_data['Family_Size']=test_data['SibSp']+test_data['Parch']

In [223]:
print_dataframe_information(train_data)

(891, 14)
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked' 'Title' 'Family_Size']
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
Family_Size      0
dtype: int64


In [224]:
X_train = train_data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'Family_Size']]
y = train_data[['Survived']]
X_test = test_data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'Family_Size']]

In [225]:
print_dataframe_information(X_train)

(891, 8)
['Pclass' 'Sex' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Title' 'Family_Size']
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           0
Embarked       2
Title          0
Family_Size    0
dtype: int64


In [226]:
print_dataframe_information(y)

(891, 1)
['Survived']
Survived    0
dtype: int64


In [227]:
print_dataframe_information(X_test)

(418, 8)
['Pclass' 'Sex' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Title' 'Family_Size']
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           1
Embarked       0
Title          0
Family_Size    0
dtype: int64


In [228]:
X_train.loc[:, 'Embarked'] = X_train.loc[:, 'Embarked'].fillna(pd.Series(np.random.choice(['S', 'C', 'Q'], size=len(X_train.index))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [229]:
print_dataframe_information(X_train)

(891, 8)
['Pclass' 'Sex' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Title' 'Family_Size']
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
Family_Size    0
dtype: int64


In [230]:
X_test = X_test.fillna(X_test.mean())

In [231]:
print_dataframe_information(X_test)

(418, 8)
['Pclass' 'Sex' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Title' 'Family_Size']
Pclass         0
Sex            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
Family_Size    0
dtype: int64


In [232]:
X_train = pd.get_dummies(X_train, columns=['Pclass', 'Sex', 'Embarked', 'Title'])
X_test = pd.get_dummies(X_test, columns=['Pclass', 'Sex', 'Embarked', 'Title'])

In [233]:
print_dataframe_information(X_train)

(891, 16)
['SibSp' 'Parch' 'Fare' 'Family_Size' 'Pclass_1' 'Pclass_2' 'Pclass_3'
 'Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S'
 'Title_Master' 'Title_Miss' 'Title_Mr' 'Title_Mrs']
SibSp           0
Parch           0
Fare            0
Family_Size     0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
dtype: int64


In [234]:
print_dataframe_information(X_test)

(418, 16)
['SibSp' 'Parch' 'Fare' 'Family_Size' 'Pclass_1' 'Pclass_2' 'Pclass_3'
 'Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S'
 'Title_Master' 'Title_Miss' 'Title_Mr' 'Title_Mrs']
SibSp           0
Parch           0
Fare            0
Family_Size     0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Title_Master    0
Title_Miss      0
Title_Mr        0
Title_Mrs       0
dtype: int64


In [235]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(pd.DataFrame(X_train))
print(X_train.shape)

(891, 16)


In [236]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_test = scaler.fit_transform(pd.DataFrame(X_test))
print(X_test.shape)

(418, 16)


In [263]:
model = Sequential()
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [264]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [265]:
model.fit(X_train, y, epochs=100, batch_size=16)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x7f9362c58b10>

In [266]:
loss, accuracy = model.evaluate(X_train, y)
print(loss, accuracy)

0.37565982082543015 0.8406285047531128


In [267]:
predictions = model.predict(X_test)
predictions = np.where(predictions > 0.5, 1, 0)

In [268]:
print(predictions.shape)

(418, 1)


In [269]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions[:, 0]})

In [270]:
print(output.shape)

(418, 2)


In [271]:
output.to_csv('predictions.csv', index=False)