In [1]:
# Standard Imports
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Performance
from time import time

In [3]:
# Machine Learning
from sklearn.linear_model import LogisticRegression

In [4]:
# Helper
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import accuracy_score

In [5]:
# Load the Data
train = pd.read_csv('train_Titanic.csv')
test = pd.read_csv('test_Titanic.csv')

# Clean data

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB


In [8]:
def setGender(df):
    
    df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    df = df.drop('Sex', axis = 1)
    return df

def setPort(df):
   
    df['Port'] = df['Embarked'].dropna().map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    df = df.drop('Embarked', axis = 1)
    return df

def setAge(df):
    
    median_ages = np.zeros((2,3)) 
    for i in range(0, 2):
        for j in range(0, 3):
            median_ages[i,j] = df[(df['Gender'] == i) & \
                                  (df['Pclass'] == j + 1)]['Age'].dropna().median()
            
    # copy of age
    df['AgeFill'] = df['Age']
    
    # whether the Age was originally missing
    df['AgeIsNull'] = pd.isnull(df.Age).astype(int)
    
    # fill in missing data
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[ (df.Age.isnull()) & (df.Gender == i) \
                          & (df.Pclass == j+1), 'AgeFill'] = median_ages[i,j]
    
    df = df.drop(['Age', 'AgeIsNull'], axis = 1)
    
    return df


In [9]:
train = setGender(train)
train = setPort(train)
train = setAge(train)
train = train.drop(['Name','Ticket','Cabin'], axis = 1)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Gender         891 non-null int32
Port           889 non-null float64
AgeFill        891 non-null float64
dtypes: float64(3), int32(1), int64(5)
memory usage: 66.1 KB


In [10]:
train = train.dropna()

In [11]:
test = setGender(test)
test = setPort(test)
test = setAge(test)
test = test.drop(['Name','Ticket','Cabin'], axis = 1)
test = test.dropna()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417 entries, 0 to 417
Data columns (total 8 columns):
PassengerId    417 non-null int64
Pclass         417 non-null int64
SibSp          417 non-null int64
Parch          417 non-null int64
Fare           417 non-null float64
Gender         417 non-null int32
Port           417 non-null int32
AgeFill        417 non-null float64
dtypes: float64(2), int32(2), int64(4)
memory usage: 26.1 KB


# Logistic regression

*penalty* = 'l2' (specify the norm used in penalization)

*C* = 1.0 (inverse of regularization strength; smaller values specify stronger regularization)

*random_state* = None (seed of the pseudo random number generator to use when shuffling the data)

In [12]:
# Predictors
predictors = ["Pclass", "Gender", "AgeFill", "Port", "Fare"]

In [13]:
# Initialize our algorithm
alg = LogisticRegression()

# Train the algorithm using all the training data
start = time()
alg.fit(train[predictors], train["Survived"])
print("LogisticReg Training finished in %.2f s" % (time() - start))

LogisticReg Training finished in 0.06 s


In [14]:
# Basic Evaluation on Training Set
start = time()
train_pred = cross_val_predict(alg, train[predictors], train["Survived"], cv=2)
print("LogisticReg Evaluation finished in %.2f s" % (time() - start))

LogisticReg Evaluation finished in 0.02 s


In [15]:
# Display Accuracy
print("Accuracy: ", accuracy_score(train["Survived"], train_pred))

Accuracy:  0.784026996625


In [None]:
# Make predictions using the test set.
predictions = alg.predict(test[predictors])

In [None]:
# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })

# Any files you save will be available in the output tab below
submission.to_csv('submission.csv', index=False)