In [22]:
import pandas as pd
import numpy as np
import nltk as nltk
import matplotlib.pyplot as plt
%matplotlib inline

import zipfile
import pickle
from os.path import join as path, dirname

try:
    # Import display explicitly instead of relying on the name being injected
    # into the notebook namespace, so pprint also works when this code runs
    # under plain Python with IPython installed.
    from IPython.display import HTML, display

    def pprint(df):
        """Render ``df`` as an HTML table without row/column truncation.

        ``df`` may be anything ``pd.DataFrame`` accepts (e.g. a Series or a
        DataFrame); it is converted to a DataFrame before rendering.
        """
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            display(HTML(pd.DataFrame(df).to_html()))
except ImportError:
    # Only a missing IPython should trigger the plain-text fallback; any other
    # error is a real problem and must not be hidden.
    def pprint(df):
        """Plain-text fallback: print ``df`` when IPython is unavailable."""
        print(df)

from sklearn.model_selection import train_test_split

In [23]:
def _load_and_preview(title, csv_name):
    """Read ``data/<csv_name>`` indexed by PassengerId and show a short preview.

    Prints ``title``, then the frame's shape, then its first rows via
    ``pprint``. Returns the loaded DataFrame.
    """
    print(title)
    frame = pd.read_csv(path('data', csv_name), index_col='PassengerId')
    print(frame.shape)
    pprint(frame.head())
    return frame


df_train = _load_and_preview('Training data:', 'train.csv')

df_test = _load_and_preview('Test data:', 'test.csv')

Training data:
(891, 11)


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Test data:
(418, 10)


Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [40]:
def count_passengers(df):
    """Return the number of rows in ``df`` (one row per passenger)."""
    n_rows = len(df)
    return n_rows

def get_states(df, colName):
    """Return the distinct values present in column ``colName`` of ``df``."""
    column = df[colName]
    distinct_values = column.unique()
    return distinct_values

def count_states(df, colName):
    """Return a Series giving how many rows hold each distinct value of ``colName``."""
    column = df[colName]
    occurrences = column.value_counts()
    return occurrences

def count_survived_column(df, colName):
    """Sum the ``Survived`` column for every distinct value of ``colName``.

    Parameters
    ----------
    df : DataFrame containing a ``Survived`` column and the column ``colName``.
    colName : label of the column to group by.

    Returns
    -------
    DataFrame indexed by the distinct values of ``colName`` with a single
    ``Survived`` column holding the per-group sum.
    """
    # Use the native groupby sum rather than .agg(np.sum): passing the NumPy
    # callable is deprecated in recent pandas and emits a FutureWarning.
    return df[["Survived", colName]].groupby(colName).sum()

def count_survived(df):
    """Return the number of rows of ``df`` whose ``Survived`` value is 1.

    Returns 0 when no row matches, instead of raising ``KeyError`` as a
    ``value_counts()[1]`` lookup would when the label 1 is absent.
    """
    # Summing the boolean mask counts the matching rows and is well defined
    # even when there are none.
    return (df["Survived"] == 1).sum()

def get_surviveds(df):
    """Return only the rows of ``df`` whose ``Survived`` value equals 1."""
    survived_mask = df["Survived"].eq(1)
    return df[survived_mask]

In [35]:
def get_counts(df, colExcept):
    """Build one survivor-count table per column of ``df`` not in ``colExcept``.

    Each table is produced by ``count_survived_column`` and is tagged with a
    ``name`` attribute holding the column it was grouped by, so callers can
    match a table to its column.

    Returns a list of those tables, in the order of ``df.columns``.
    """
    tables = []
    for column in df.columns:
        if column in colExcept:
            continue
        table = count_survived_column(df, column)
        # Plain Python attribute on the DataFrame object, not a data column.
        table.name = column
        tables.append(table)
    return tables

In [37]:
def prepare_df(df):
    """Return a copy of ``df`` whose missing ``Age`` values are replaced by -1.

    The input frame is left unmodified.
    """
    df_copy = df.copy()
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on the df_copy["Age"] selection: that chained in-place form triggers a
    # FutureWarning in pandas 2.x and does not update df_copy at all once
    # Copy-on-Write is enabled.
    df_copy["Age"] = df_copy["Age"].fillna(-1)
    return df_copy
# Rebind df_train to its prepared copy (see prepare_df for the Age handling).
df_train = prepare_df(df_train)

In [38]:
def get_probability(r, df):
    """Score passenger row ``r`` against the ``Survived == 1`` class of ``df``.

    The score is the add-one-smoothed prior of ``Survived == 1`` multiplied by
    one add-one-smoothed factor per feature of ``r`` that has a count table.
    The columns Survived, Name, Ticket, Cabin and Fare are excluded.

    Parameters
    ----------
    r : Series holding one passenger (column label -> value).
    df : DataFrame with a ``Survived`` column used to build the counts.

    Returns
    -------
    float : unnormalised score, meant to be compared with the score obtained
    from the label-inverted frame.
    """
    excluded = ["Survived", "Name", "Ticket", "Cabin", "Fare"]
    # Map column name -> per-state survivor counts for direct lookup.
    survived_by_column = {
        table.name: table["Survived"] for table in get_counts(df, excluded)
    }
    n_surv = count_survived(df)
    prob = (n_surv + 1.) / (count_passengers(df) + len(get_states(df, "Survived")))

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, value in r.items():
        state_counts = survived_by_column.get(col)
        if state_counts is None:
            continue  # column excluded from the model or absent from df

        # Missing values are looked up under the -1 sentinel (the value
        # prepare_df substitutes for a missing Age). pd.isna is used because
        # np.isnan raises TypeError on string values.
        if pd.isna(value):
            value = -1

        # A state never seen in df contributes a zero count; this also covers
        # columns that have no -1 state. Membership is tested on the index so
        # an integer key is never interpreted as a position.
        hits = state_counts.loc[value] if value in state_counts.index else 0
        prob *= (hits + 1.) / (n_surv + len(state_counts))
    return prob

In [28]:
def invert_survived(df):
    """Return a copy of ``df`` with the ``Survived`` labels 0 and 1 swapped.

    Rows holding 0 become 1 and rows holding 1 become 0. A value of 2 also
    ends up as 1; any other value is left as is. The input frame is not
    modified.
    """
    flipped = df.copy()
    labels = flipped["Survived"]
    # Compute every mask up front so the two assignments cannot interfere.
    becomes_one = (labels == 0) | (labels == 2)
    becomes_zero = labels == 1
    flipped.loc[becomes_one, "Survived"] = 1
    flipped.loc[becomes_zero, "Survived"] = 0
    return flipped

In [46]:
# Hold out 20% of the labelled passengers to estimate accuracy.
X_full = df_train.copy()
y = df_train["Survived"]
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0)

# The training split keeps "Survived" because the counts are built from it;
# the validation rows must not carry the label. Rebinding to the result of
# drop() avoids an in-place edit of a frame derived from another one.
X_valid_full = X_valid_full.drop(columns=['Survived'])
X_train_full_inv = invert_survived(X_train_full)

# Predict 1 unless the score against the label-inverted frame is strictly
# higher than the score against the original frame.
predictions = [
    0 if get_probability(r, X_train_full) < get_probability(r, X_train_full_inv) else 1
    for index, r in X_valid_full.iterrows()
]
results = pd.Series(predictions, index=X_valid_full.index, name="Survived")

# encerts = number of correct predictions; total = number of validation rows.
# The element-wise comparison replaces a loop over Series.iteritems(), which
# was removed in pandas 2.0.
encerts = int(results.eq(y_valid).sum())
total = float(len(results))
print(encerts/total)

0.8379888268156425


In [43]:
# Score every test passenger against the full training frame and against its
# label-inverted copy, then write the predicted labels to submission.csv.
df_train_inv = invert_survived(df_train)

predictions = []
for _, passenger in df_test.iterrows():
    score_original = get_probability(passenger, df_train)
    score_inverted = get_probability(passenger, df_train_inv)
    predictions.append(0 if score_original < score_inverted else 1)

# Build the Series directly on the test index (which carries the PassengerId
# name set when the CSV was read with index_col='PassengerId').
results = pd.Series(predictions, index=df_test.index, name="Survived")
results.to_frame().to_csv("submission.csv")