In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression

# read training data
# raw string: keeps the backslashes of the Windows path literal instead of
# relying on '\D' and '\K' happening to be unrecognised escape sequences
# NOTE(review): absolute local path - consider a configurable data directory
titanic_train = pd.read_csv(r'D:\Data\Kaggle_Titanic_Train.csv')

# extract title from Name
# the text before the first comma is stored as LastName; the title is the
# text between that comma and the first '.', with leading spaces removed
df_titles = titanic_train
df_titles['NameComma'] = df_titles['Name'].map(lambda full_name: full_name.split(',')[1])
df_titles['LastName'] = df_titles['Name'].map(lambda full_name: full_name.split(',')[0])
df_titles['Title'] = df_titles['NameComma'].map(lambda after_comma: after_comma.split('.')[0].lstrip())

# fill 2 missing values with most common 'S'
# assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: under pandas Copy-on-Write that in-place call
# updates a temporary object and leaves df_titles unchanged
df_titles['Embarked'] = df_titles['Embarked'].fillna('S')

# create dummy variables for title, embarked, pclass (features with more than two categories)
# reindex (rather than plain column selection) keeps the feature set fixed:
# an expected category that is absent from the data becomes an all-zero
# column instead of raising a KeyError
title_dummies = pd.get_dummies(df_titles['Title'])
title_dummies = title_dummies.reindex(columns=['Master','Miss','Mr','Mrs'], fill_value=0)
# name the Pclass dummies from their values ('Pclass1', 'Pclass2', ...) rather
# than by position, so a label can never be attached to the wrong class
pclass_dummies = pd.get_dummies(df_titles['Pclass'], prefix='Pclass', prefix_sep='')
pclass_dummies = pclass_dummies.reindex(columns=['Pclass1','Pclass3'], fill_value=0)
embarked_dummies = pd.get_dummies(df_titles['Embarked'])
embarked_dummies = embarked_dummies.reindex(columns=['C','S'], fill_value=0)
dummies = pd.concat([title_dummies,pclass_dummies,embarked_dummies], axis = 1)

# binary for sex, cabin 
def gender(df):
    """Encode one row's 'Sex' entry as a flag: 1 for 'female', 0 for any other value."""
    is_female = df['Sex'] == 'female'
    return 1 if is_female else 0
def cabin(df):
    """Flag whether one row has a cabin entry: 0 when 'Cabin' is null, 1 otherwise."""
    has_cabin = not pd.isnull(df['Cabin'])
    return 1 if has_cabin else 0
# replace Sex and Cabin with their 0/1 encodings, computed row by row
df_titles['Sex'] = df_titles.apply(gender, axis='columns')
df_titles['Cabin'] = df_titles.apply(cabin, axis='columns')

# merge dummies with original data (side by side, aligned on the row index)
df_with_dummies = pd.concat([df_titles, dummies], axis='columns')

# drop unnecessary vars
prep_data = df_with_dummies.drop(
    columns=['Embarked','Pclass','Name','Ticket','NameComma','LastName','Title'])

# prep for age model: keep only rows without any missing value (this is what
# removes the passengers with a missing age), then drop the columns that are
# not used as predictors
age_predict_ready = df_with_dummies.dropna().drop(
    columns=['PassengerId','Embarked','Pclass','Survived','Name','Ticket',
             'NameComma','LastName','Title'])

# develop age model: Age is the target, every remaining column is a feature
X_age = age_predict_ready.drop(columns=['Age'])
Y_age = np.ravel(age_predict_ready['Age'])

age_lin_model = LinearRegression()
age_lin_model.fit(X_age, Y_age)
age_lin_model.score(X_age, Y_age)

# create df with missing ages
missing_age = df_with_dummies
missing_age = missing_age[missing_age['Age'].isnull()]
missing_age = missing_age.drop(['Age','Embarked','Pclass','Survived','Name','Ticket',
                                            'NameComma','LastName','Title'], axis=1)

# model input: same columns as above minus the identifier
missing_age_ready = missing_age.drop(['PassengerId'],axis=1)

# apply age model to missing ages
missing_age_pred = pd.DataFrame(age_lin_model.predict(missing_age_ready),
                                columns=['AgePred'])

# renumber the rows 0..n-1 so they line up with the prediction frame in concat
# reset_index adapts to however many ages are missing; a hard-coded
# range(177) fails as soon as the number of missing ages differs
missing_age = missing_age.reset_index(drop=True)

passid_agepred = pd.concat([missing_age,missing_age_pred],axis=1)[['PassengerId','AgePred']]

# merge predictions with original data
agepred_titanic_train = pd.merge(prep_data,passid_agepred,on='PassengerId',how='outer')
# replace missing data with predictions
def combine_age(df):
    """Choose one row's age: the 'Age' entry when present, else the 'AgePred' entry."""
    recorded_age = df['Age']
    if pd.isnull(recorded_age):
        return df['AgePred']
    return recorded_age

# overwrite Age row by row with the value chosen by combine_age
agepred_titanic_train['Age'] = agepred_titanic_train.apply(combine_age, axis='columns')

# prep for log model: the identifier and the helper prediction column are not features
agepred_titanic_train = agepred_titanic_train.drop(columns=['PassengerId','AgePred'])

# create log model for survival: Survived is the target, the rest are features
X_surv = agepred_titanic_train.drop(columns=['Survived'])
Y_surv = np.ravel(agepred_titanic_train['Survived'])

from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator itself
log_model = LogisticRegression().fit(X_surv, Y_surv)
log_model.score(X_surv, Y_surv)


IOError: File D:\Data\Kaggle_Titanic_Train.csv does not exist

In [288]:
# read test data
# raw string: keeps the backslashes of the Windows path literal instead of
# relying on '\D' and '\K' happening to be unrecognised escape sequences
titanic_test = pd.read_csv(r'D:\Data\Kaggle_Titanic_Test.csv')

# replace missing fare with the mean of the known fares
# TODO: change to median to accommodate skewed data
tt = titanic_test
# assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: under pandas Copy-on-Write that in-place call
# updates a temporary object and leaves tt unchanged
# (Series.mean() skips missing values by default, so no dropna() is needed)
tt['Fare'] = tt['Fare'].fillna(tt['Fare'].mean())

# extract title from name
# the text before the first comma is stored as LastName; the title is the
# text between that comma and the first '.', with leading spaces removed
tt['NameComma'] = tt['Name'].map(lambda full_name: full_name.split(',')[1])
tt['LastName'] = tt['Name'].map(lambda full_name: full_name.split(',')[0])
tt['Title'] = tt['NameComma'].map(lambda after_comma: after_comma.split('.')[0].lstrip())

# create dummy vars for title, embarked, pclass
# reindex (rather than plain column selection) keeps the feature set fixed:
# an expected category that is absent from this data becomes an all-zero
# column instead of raising a KeyError
title_dummies2 = pd.get_dummies(tt['Title'])
title_dummies2 = title_dummies2.reindex(columns=['Master','Miss','Mr','Mrs'], fill_value=0)
# name the Pclass dummies from their values ('Pclass1', 'Pclass2', ...) rather
# than by position, so a label can never be attached to the wrong class
pclass_dummies2 = pd.get_dummies(tt['Pclass'], prefix='Pclass', prefix_sep='')
pclass_dummies2 = pclass_dummies2.reindex(columns=['Pclass1','Pclass3'], fill_value=0)
embarked_dummies2 = pd.get_dummies(tt['Embarked'])
embarked_dummies2 = embarked_dummies2.reindex(columns=['C','S'], fill_value=0)
dummies2 = pd.concat([title_dummies2,pclass_dummies2,embarked_dummies2], axis = 1)

# binary for sex and cabin
def gender(df):
    """Encode one row's 'Sex' entry as a flag: 1 for 'female', 0 for any other value."""
    is_female = df['Sex'] == 'female'
    return 1 if is_female else 0
def cabin(df):
    """Flag whether one row has a cabin entry: 0 when 'Cabin' is null, 1 otherwise."""
    has_cabin = not pd.isnull(df['Cabin'])
    return 1 if has_cabin else 0
# replace Sex and Cabin with their 0/1 encodings, computed row by row
tt['Sex'] = tt.apply(gender, axis='columns')
tt['Cabin'] = tt.apply(cabin, axis='columns')

# combine dummies with original data (side by side, aligned on the row index)
df_with_dummies2 = pd.concat([tt, dummies2], axis='columns')

# drop unnecessary vars
prep_data2 = df_with_dummies2.drop(
    columns=['Embarked','Pclass','Name','Ticket','NameComma','LastName','Title'])
# separate missing age df and prep for age model
missing_age2 = df_with_dummies2
missing_age2 = missing_age2[missing_age2['Age'].isnull()]
missing_age2 = missing_age2.drop(['Age','Embarked','Pclass','Name','Ticket',
                                            'NameComma','LastName','Title'], axis=1)
# renumber the rows 0..n-1 so they line up with the prediction frame in concat
# reset_index adapts to however many ages are missing; a hard-coded
# range(86) fails as soon as the number of missing ages differs
missing_age2 = missing_age2.reset_index(drop=True)

# model input: same columns minus the identifier
missing_age_ready2 = missing_age2.drop(['PassengerId'],axis=1)

# apply age model
missing_age_pred2 = pd.DataFrame(age_lin_model.predict(missing_age_ready2),
                                 columns=['AgePred2'])

# combine age predictions with original data
passid_agepred2 = pd.concat([missing_age2,missing_age_pred2],axis=1)[['PassengerId','AgePred2']]

agepred_titanic_test = pd.merge(prep_data2,passid_agepred2,on='PassengerId',how='outer')

# replace missing age with age prediction
def combine_age(df):
    """Choose one row's age: the 'Age' entry when present, else the 'AgePred2' entry."""
    recorded_age = df['Age']
    if pd.isnull(recorded_age):
        return df['AgePred2']
    return recorded_age

# overwrite Age row by row with the value chosen by combine_age
agepred_titanic_test['Age'] = agepred_titanic_test.apply(combine_age,axis=1)
agepred_titanic_test_ready = agepred_titanic_test.drop(['PassengerId','AgePred2'],axis=1)

# apply log model to test data. prep for submission
surv_pred = pd.DataFrame(log_model.predict(agepred_titanic_test_ready))
surv_pred.columns = ['Survived']
surv_pred['Survived'] = surv_pred['Survived'].astype(int)

# build the submission frame from named columns instead of handing a bare
# zip iterator to the DataFrame constructor; .values pairs the two columns
# by position, exactly as zip did
results = pd.DataFrame({'PassengerId': agepred_titanic_test['PassengerId'].values,
                        'Survived': surv_pred['Survived'].values},
                       columns=['PassengerId','Survived'])

# raw string: keeps the backslashes of the Windows path literal
results.to_csv(r'D:\Data\Kaggle_Titanic_Prediction_LinAge.csv', index=False)
