Grab imports needed

In [1]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier


Define functions (makes for easier automation later)

In [2]:
def getData(csv):
    """Read a CSV into a pandas DataFrame, using its first row as the column names.

    `csv` is handed straight to `pd.read_csv`. Note that inside this function
    the parameter name hides the imported `csv` module.
    """
    return pd.read_csv(csv, header=0)


def addGender(df):
    """Add an integer 'Gender' column derived from 'Sex' (female -> 0, male -> 1).

    The frame is modified in place and also returned. Any 'Sex' value other
    than 'female' or 'male' maps to NaN, which makes the int cast raise.
    """
    sex_codes = {'female': 0, 'male': 1}
    encoded = df['Sex'].map(sex_codes)
    df['Gender'] = encoded.astype(int)
    return df
    

def addMedianAges(df):
    """Add 'AgeFill' (Age with gaps filled) and 'AgeIsNull' (1 where Age was missing).

    A missing Age is replaced by the median of the known ages among rows that
    share the same 'Gender' and 'Pclass'. The medians are computed per group
    actually present in the data, so nothing is hard-coded to two genders and
    three classes. A group with no known ages at all keeps NaN in 'AgeFill'.

    The frame is modified in place and also returned; 'Age' itself is untouched.
    """
    # One median per (Gender, Pclass) group, broadcast back onto every row of that
    # group so it lines up with df's index. median() ignores the NaN ages.
    group_medians = df.groupby(['Gender', 'Pclass'])['Age'].transform('median')

    # Keep the real age where there is one; otherwise fall back to the group median.
    df['AgeFill'] = df['Age'].fillna(group_medians)

    # Records which rows were back filled
    df['AgeIsNull'] = df['Age'].isnull().astype(int)

    return df
        

def addFamilySize(df):
    """Add a 'FamilySize' column equal to 'SibSp' + 'Parch'.

    The sum does not add one for the passenger, so a row with both counts at
    zero gets a FamilySize of 0. The frame is modified in place and returned.
    """
    sibsp_count = df['SibSp']
    parch_count = df['Parch']
    df['FamilySize'] = sibsp_count + parch_count
    return df


def addAgeByClass(df):
    """Add an 'Age*Class' column holding the product of 'AgeFill' and 'Pclass'.

    Needs the 'AgeFill' column to exist already. The frame is modified in
    place and also returned.
    """
    filled_age = df['AgeFill']
    passenger_class = df['Pclass']
    df['Age*Class'] = filled_age * passenger_class
    return df

def createCleanData(csv):
    """Load a CSV and run it through every feature-building step, in order.

    Steps: getData, addGender, addMedianAges, addFamilySize, addAgeByClass.
    The order matters: addMedianAges reads 'Gender' and addAgeByClass reads
    'AgeFill', each of which is created by an earlier step.
    """
    frame = getData(csv)
    for step in (addGender, addMedianAges, addFamilySize, addAgeByClass):
        frame = step(frame)
    return frame
    
def dropNonIntegerValues(df):
    """Return a copy of the frame without the columns the model does not use.

    Despite the name, the dropped set also contains numeric columns
    ('Fare', 'Age', 'PassengerId'). The input frame itself is left unchanged.
    """
    unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Fare', 'Age', 'PassengerId']
    return df.drop(unused_columns, axis=1)

Load the training and test data from their CSV files and run each through the functions above, which add the derived columns and fill in missing ages (in the new AgeFill column).

In [3]:
# Build the feature-augmented frames for both data sets with the same pipeline.
test = createCleanData(csv="test.csv")
train = createCleanData(csv="train.csv")

In [4]:
# Preview the first five rows of the augmented test frame.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,AgeFill,AgeIsNull,FamilySize,Age*Class
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,34.5,0,0,103.5
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,47.0,0,1,141.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,62.0,0,0,124.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,27.0,0,0,81.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,22.0,0,2,66.0


In [5]:
# Preview the first five rows of the augmented training frame.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,AgeFill,AgeIsNull,FamilySize,Age*Class
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1,22,0,1,66
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,0,38,0,1,38
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0,26,0,0,78
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,0,35,0,1,35
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,1,35,0,0,105


Drops the columns the model will not use. The results are still DataFrames at this point; they are converted to NumPy arrays in a later cell.

In [6]:
# Model-ready copies; `test` and `train` keep all their columns.
test_data = dropNonIntegerValues(df=test)
train_data = dropNonIntegerValues(df=train)

Grabs the passenger IDs so they can be paired with the predictions later.

In [7]:
# PassengerId was dropped from test_data, so take it from the full test frame.
ids = test.PassengerId.values

In [8]:
# Preview the training features; Survived is the first column.
train_data.head(n=5)

Unnamed: 0,Survived,Pclass,SibSp,Parch,Gender,AgeFill,AgeIsNull,FamilySize,Age*Class
0,0,3,1,0,1,22,0,1,66
1,1,1,1,0,0,38,0,1,38
2,1,3,0,0,0,26,0,0,78
3,1,1,1,0,0,35,0,1,35
4,0,3,0,0,1,35,0,0,105


In [9]:
# Preview the test features; same columns as train_data minus Survived.
test_data.head(n=5)

Unnamed: 0,Pclass,SibSp,Parch,Gender,AgeFill,AgeIsNull,FamilySize,Age*Class
0,3,0,0,1,34.5,0,0,103.5
1,3,1,0,0,47.0,0,1,141.0
2,2,0,0,1,62.0,0,0,124.0
3,3,0,0,1,27.0,0,0,81.0
4,3,1,1,0,22.0,0,2,66.0


In [10]:
# Convert the feature frames to plain NumPy arrays for scikit-learn.
# np.asarray yields the frame's values and passes an existing ndarray through
# unchanged, so re-running this cell is harmless (calling .values a second time
# would raise AttributeError because an ndarray has no such attribute).
test_data = np.asarray(test_data)
train_data = np.asarray(train_data)

Trains the model

In [19]:
# print(...) with a single string behaves the same on Python 2 and Python 3.
print('Training...')
# random_state pins the forest's random sampling so a re-run rebuilds the same model.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
# Column 0 of train_data is Survived (the label); every other column is a feature.
forest.fit(train_data[:, 1:], train_data[:, 0])
print('Complete')

Training...
Complete


Runs the model over the test data set

In [20]:
# print(...) with a single string behaves the same on Python 2 and Python 3.
print('Predicting...')
# One predicted Survived label per row of test_data, in the same row order as `ids`.
output = forest.predict(test_data)
print('Complete')

Predicting...
Complete


Writes the predictions to a CSV file with two columns: PassengerId and Survived.

In [21]:
# Build the two-column submission table. The predictions are floats (the labels
# were sliced out of a float array), so cast them to int to write 0/1 rather
# than 0.0/1.0. `columns=` fixes the column order explicitly.
submission = pd.DataFrame(
    {"PassengerId": ids, "Survived": output.astype(int)},
    columns=["PassengerId", "Survived"],
)
# to_csv opens and closes the file itself, so no handle is left open if a write
# fails, and it avoids the Python 2-only "wb" mode that csv.writer needed.
# index=False keeps the row index out of the file.
submission.to_csv("myfirstforest.csv", index=False)
print('Done.')

Done.


In [22]:
# Display the array of predicted Survived labels returned by forest.predict(test_data).
output

array([ 0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,
        1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0

In [23]:
# Wrap the predictions in a single-column DataFrame for display.
# NOTE(review): this rebinds `test`, replacing the cleaned test frame loaded earlier;
# re-running a cell that reads test['PassengerId'] after this one will fail.
test = pd.DataFrame(data=output)

In [24]:
# Preview the first five predictions (at this point `test` holds the predictions frame).
test.head(n=5)


Unnamed: 0,0
0,0
1,0
2,1
3,1
4,1
