In [200]:
# imports
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")

In [201]:
# Load the training and test sets from CSV files under the relative 'data/' directory.
# Both frames are rebound by later cells as new columns are added.
dataset = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [202]:
# Extract the first character of the cabin code; unknown cabins become 'N'
def cabin_letter(dataset):
    """Add a ``CabinLetter`` column holding the first character of ``Cabin``.

    Rows whose ``Cabin`` is missing receive the placeholder ``'N'``.
    The frame passed in is modified in place and the same object is returned.
    """
    first_char = dataset['Cabin'].str.slice(0, 1)
    dataset['CabinLetter'] = first_char.fillna('N')
    return dataset

In [203]:
# Add the CabinLetter column (first character of the cabin code, 'N' when unknown)
# to both the training and the test frame.
dataset = cabin_letter(dataset)
test = cabin_letter(test)

In [204]:
# Preview the first rows of the test frame to check the new CabinLetter column.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinLetter
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,N
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,N
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,N
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,N
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,N


In [205]:
# Missing ports of embarkation get the placeholder category 'N'
dataset.loc[dataset['Embarked'].isnull(), 'Embarked'] = 'N'
test.loc[test['Embarked'].isnull(), 'Embarked'] = 'N'

In [206]:
# Replace missing fares in the test set with the median of the known test fares.
# Series.median() skips NaN by default, so no explicit not-null filter is needed.
medFare = test['Fare'].median()
# Assign the filled column back to the frame. Calling fillna(inplace=True) on the
# attribute-accessed Series is a chained in-place operation: it triggers a warning
# in recent pandas and leaves `test` unchanged when copy-on-write is enabled.
test['Fare'] = test['Fare'].fillna(medFare)

In [207]:
# Preview the first rows of the training frame after the fill steps above.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinLetter
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,N
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,N
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,N


In [208]:
# Scratch expression: builds an empty NumPy array; the result is only displayed, not assigned.
np.array([])

array([], dtype=float64)

In [209]:
from sklearn.preprocessing import LabelEncoder 

# cols_to_dummies :: array-like -> array-like -> encoder -> (np.Array,np.Array)
def cols_to_dummies(col_train,col_test,enc):
    """Fit ``enc`` on the training column and encode both columns with it.

    Parameters
    ----------
    col_train, col_test : one-dimensional array-like (Series, ndarray or list)
        Values of the same column in the training and the test set.
    enc : object with ``fit(X)`` and ``transform(X)`` methods
        The encoder; it is fitted on the training values only.

    Returns
    -------
    tuple
        ``(enc.transform(train), enc.transform(test))`` where each input has been
        reshaped to a single-column 2-D array.
    """
    # Convert to ndarray first: pandas Series no longer provides .reshape, so the
    # previous direct call failed whenever a Series was passed in.
    col_train = np.asarray(col_train).reshape(-1,1)
    col_test = np.asarray(col_test).reshape(-1,1)
    enc.fit(col_train)
    return (enc.transform(col_train),enc.transform(col_test))

# cols_to_label :: Series -> Series -> (np.Array,np.Array)
def cols_to_label(col_train,col_test):
    """Encode both columns as integer labels using a LabelEncoder fitted on the training column.

    Returns a tuple ``(train_labels, test_labels)``.
    """
    encoder = LabelEncoder()
    encoder.fit(col_train)
    train_labels = encoder.transform(col_train)
    test_labels = encoder.transform(col_test)
    return (train_labels, test_labels)

def prepare_df(array,c):
    """Wrap ``array`` in a DataFrame whose column labels are prefixed with ``c`` and an underscore.

    The default column labels of the new frame (0, 1, ...) become
    ``'<c>_0'``, ``'<c>_1'``, ...
    """
    frame = pd.DataFrame(array)
    renamed = []
    for position in frame.columns:
        renamed.append(c + '_' + str(position))
    frame.columns = renamed
    return frame

# to_dummies :: DF -> DF -> [String] -> (DF,DF)
def to_dummies(train,test,columns_to_dummies):
    """One-hot encode the given columns of the training and test frames.

    For every name in ``columns_to_dummies`` an encoder is fitted on the training
    values and applied to both frames. The indicator columns are named
    ``'<column>_<position>'`` and joined onto the corresponding frame; the
    original column is kept.

    Parameters
    ----------
    train, test : DataFrame
        Both frames must contain every column listed in ``columns_to_dummies``.
    columns_to_dummies : list of str
        Names of the columns to encode.

    Returns
    -------
    tuple of DataFrame
        ``(train, test)`` extended with the indicator columns.
    """
    # Imported locally: the surrounding cell only imports LabelEncoder, so the
    # name OneHotEncoder was otherwise undefined on a fresh kernel.
    from sklearn.preprocessing import OneHotEncoder

    for c in columns_to_dummies:

        # Newer scikit-learn renamed `sparse` to `sparse_output`; fall back to the
        # old keyword when the new one is not accepted.
        try:
            enc = OneHotEncoder(sparse_output=False)
        except TypeError:
            enc = OneHotEncoder(sparse=False)

        if train[c].dtype == np.int64 or train[c].dtype == np.float64:
            # Numeric columns are encoded directly; pass plain ndarrays so the
            # reshape inside cols_to_dummies does not depend on Series.reshape.
            values_train, values_test = train[c].values, test[c].values
        else:
            # Non-numeric columns are first mapped to integer labels.
            values_train, values_test = cols_to_label(train[c],test[c])

        dummies_train, dummies_test = cols_to_dummies(values_train,values_test,enc)
        dummies_train = prepare_df(dummies_train,c)
        dummies_test = prepare_df(dummies_test,c)
        # The encoded rows are in the same order as the source rows; reuse the
        # source index so the join stays aligned for non-default indexes too.
        dummies_train.index = train.index
        dummies_test.index = test.index
        train = train.join(dummies_train)
        test = test.join(dummies_test)

    return (train,test)

In [210]:
# One-hot encode the categorical columns; to_dummies joins the indicator columns
# (named '<column>_<position>') onto both frames and returns the extended frames.
dataset, test = to_dummies(dataset,test,['Embarked','CabinLetter','Pclass','Sex'])

In [211]:
# Split each frame into the rows with a known age and the rows whose age is missing
dataset_age_not_null = dataset.loc[dataset['Age'].notnull()]
dataset_age_null = dataset.loc[dataset['Age'].isnull()]
test_age_not_null = test.loc[test['Age'].notnull()]
test_age_null = test.loc[test['Age'].isnull()]

In [212]:
# loading libraries
from sklearn.linear_model import LinearRegression

In [213]:
# Build a fitted linear regression used to predict age
def create_predict_age(X_train,y_train,dimensions):
    """Fit a ``LinearRegression`` on the given features and target and return it.

    ``dimensions`` is accepted but not used by this function.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [214]:
# Preview the training rows with a known age, including the one-hot indicator columns.
dataset_age_not_null.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CabinLetter,Embarked_0,Embarked_1,Embarked_2,Embarked_3,CabinLetter_0,CabinLetter_1,CabinLetter_2,CabinLetter_3,CabinLetter_4,CabinLetter_5,CabinLetter_6,CabinLetter_7,CabinLetter_8,Pclass_0,Pclass_1,Pclass_2,Sex_0,Sex_1
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,N,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,N,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,N,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [215]:
# List the column names of the training frame; the indicator names are reused in the feature list below.
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'CabinLetter', 'Embarked_0', 'Embarked_1', 'Embarked_2', 'Embarked_3', 'CabinLetter_0', 'CabinLetter_1', 'CabinLetter_2', 'CabinLetter_3', 'CabinLetter_4', 'CabinLetter_5', 'CabinLetter_6', 'CabinLetter_7', 'CabinLetter_8', 'Pclass_0', 'Pclass_1', 'Pclass_2', 'Sex_0', 'Sex_1'], dtype='object')

In [216]:
# Fit the age regression on the rows whose Age is known.
# Features: the one-hot indicator columns plus the numeric Parch and Fare columns.
colsX = (['Pclass_%d' % k for k in range(3)]
         + ['Sex_%d' % k for k in range(2)]
         + ['Parch']
         + ['Embarked_%d' % k for k in range(4)]
         + ['CabinLetter_%d' % k for k in range(9)]
         + ['Fare'])
colsY = ['Age']
# The third argument is the `dimensions` parameter, which create_predict_age does not use.
age_predictor = create_predict_age(dataset_age_not_null[colsX], dataset_age_not_null[colsY], 32)

In [217]:
# Predict an age for every row of the train and test frames whose Age is missing
train_missing_age_values = age_predictor.predict(dataset_age_null.loc[:, colsX])
test_missing_age_values = age_predictor.predict(test_age_null.loc[:, colsX])

In [218]:
# Fn to add a column with all the ages
def fill_missing_ages(data,ages):
    """Return a copy of ``data`` with the estimated and final age columns added.

    Parameters
    ----------
    data : DataFrame
        Must contain an ``Age`` column; missing ages are NaN.
    ages : array-like
        One estimated age per row of ``data`` whose ``Age`` is missing, in row
        order. May be one-dimensional or a single-column 2-D array, and may be
        empty when no age is missing.

    Returns
    -------
    DataFrame
        ``data`` plus two columns: ``estimated_ages`` (the estimate on rows where
        ``Age`` is missing, NaN elsewhere) and ``final_age`` (``Age`` where known,
        otherwise the estimate).

    Raises
    ------
    ValueError
        If the number of estimates differs from the number of missing ages.
    """
    # Index labels of the rows that need an estimate.
    missing_index = data.index[data.Age.isnull()]
    # Flatten so both (n,) and (n, 1) prediction arrays are accepted; an empty
    # input yields an empty Series instead of failing on the column rename.
    estimated = pd.Series(np.asarray(ages, dtype=float).ravel(),
                          index=missing_index,
                          name="estimated_ages")
    data = data.join(estimated)
    # Index-aligned fill; replaces the element-wise np.vectorize lambda, which
    # cannot be called on size-0 input.
    return data.assign(final_age = data.Age.fillna(data.estimated_ages))

In [219]:
# Add the 'estimated_ages' and 'final_age' columns to both frames, using the
# ages predicted for the rows whose Age is missing.
dataset = fill_missing_ages(dataset,train_missing_age_values)
test = fill_missing_ages(test,test_missing_age_values)

In [221]:
# Creating age class
def create_age_class(data):
    """Return a copy of ``data`` with an ``age10`` indicator column.

    ``age10`` is 1 when the passenger's age is below 10 and 0 otherwise.
    The imputed ``final_age`` column is used when the frame has one, so that
    passengers whose recorded ``Age`` is missing are classified from their
    estimated age; otherwise the raw ``Age`` column is used. A missing age
    compares as not below 10 and therefore yields 0.
    """
    age_column = 'final_age' if 'final_age' in data.columns else 'Age'
    # Vectorised comparison; unlike np.vectorize it also works on an empty frame.
    below_ten = (data[age_column] < 10).astype(int)
    return data.assign(age10 = below_ten)
    

In [223]:
# Add the 'age10' indicator column to both the training and the test frame.
dataset = create_age_class(dataset)
test = create_age_class(test)


In [None]:
# Build candidate model inputs: the test frame and the training features without
# the 'final_age' column, plus the 'Survived' target.
# NOTE(review): none of these three names is referenced by the later cells visible
# here, and the logistic-regression cell uses 'final_age' as a feature — confirm
# whether dropping it is intended or whether this cell is a leftover.
test_final = test.drop('final_age',axis=1)
dataset_final_y = dataset['Survived']
dataset_final_x = dataset.drop(['final_age','Survived'],axis = 1)


In [229]:
# loading libraries
from sklearn.linear_model import LogisticRegression


In [230]:
# Logistic regression on the encoded features plus the imputed age
features = colsX + ['final_age']
clf = LogisticRegression()
# Use fit(): LogisticRegression has no fit_transform in current scikit-learn, and
# the transformed output was discarded anyway — only the fitted model is needed.
clf.fit(dataset[features], dataset['Survived'])

# Predict survival for the test set and label the single result column.
results = pd.DataFrame(clf.predict(test[features]))
results.columns = ['Survived']

# Passenger ids are re-read from the raw test file; both frames carry the default
# positional index, so the join pairs each id with its prediction row by row.
test_passengerId = pd.read_csv('data/test.csv')['PassengerId']
pd.DataFrame(test_passengerId).join(results).to_csv("data/res.csv",index=False)


