In [None]:
#Titanic Competition using Random Forest Regressor

In [43]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [4]:
#import data
train_data_path = "../train.csv" 
test_data_path = "../test.csv"

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [23]:
#inspect data
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
#convert Sex and Emabarked to numerical categories
train_data.Sex = pd.factorize(train_data.Sex)[0] # male = 0, female = 1
train_data.Embarked = pd.factorize(train_data.Embarked)[0]
#might as well do the same for the test dataset since we will need it later
test_data.Sex = pd.factorize(test_data.Sex)[0] # male = 0, female = 1
test_data.Embarked = pd.factorize(test_data.Embarked)[0]

In [36]:
#inspect data again
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [40]:
#check for missing data/values

print(train_data.isnull().any())


PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked       False
dtype: bool


In [41]:
#We won't use cabin so let's just fix Age
print(train_data[train_data['Age'].isnull()].index)
# 177 MISSING VALUES!*!*!*!*!
# let's fill it based on Sex - if male, fill in average age of men on board

Int64Index([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,
            ...
            832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
           dtype='int64', length=177)


In [42]:
def Fillme(Value,Male_Avg,Female_Avg):         # Helper Function 
    Sex = Value[0]        # Value[0] is the sex attribute
    Age = Value[1]        # Value[1] is the Age attribute
    
    if pd.isnull(Age):
        if Sex == 0:      # If its Male else Female
            return Male_Avg
        else:
            return Female_Avg
    else:
        return Age #if not NaN leave as it is

In [47]:
#use the function
Male_Avg = np.mean(train_data[train_data['Sex'] == 0].Age)               # 0 is male 1 is female by factorize function
Female_Avg = np.mean(train_data[train_data['Sex'] == 1].Age) 

# Now lets fill them 
train_data['Age'] = train_data[['Sex','Age']].apply(lambda x : Fillme(x, Male_Avg, Female_Avg), axis = 1)  # Fill me is a helper Function That is defined above'


In [48]:
#check if still missing data
print(train_data.isnull().any())

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked       False
dtype: bool


In [49]:
#select target (y) and training features for X
y = train_data["Survived"]

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
X = train_data[features]

In [50]:
#define model and fit
rf_model = RandomForestClassifier()

rf_model.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [55]:
#define and check data for the test set

#check for missing data/values in test data

print(test_data.isnull().any())

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool


In [56]:
#Fare missing data can be filled with average values
test_data.Fare.fillna(np.mean(test_data.Fare), inplace = True)

In [58]:
#for age the same function as with train data

Male_Avg_test = np.mean(test_data[test_data['Sex'] == 0].Age)               # 0 is male 1 is female by factorize function
Female_Avg_test = np.mean(test_data[test_data['Sex'] == 1].Age) 

# Now lets fill them 
test_data['Age'] = test_data[['Sex','Age']].apply(lambda x : Fillme(x, Male_Avg_test, Female_Avg_test), axis = 1)  # Fill me is a helper Function That is defined above'

#check missing again
print(test_data.isnull().any())

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked       False
dtype: bool


In [59]:
#define test data  features (X)
X_test = test_data[features]


In [60]:
#make predictions
predictions = rf_model.predict(X_test)

In [62]:
len(predictions)

418

In [65]:
#output for competition submission
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": predictions})
output.to_csv("../rf_submission.csv", index = False)