# Titanic classifier

In [37]:
import pandas as pd, numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
import joblib

In [19]:
# Read the training and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [20]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Fill Deck column

In [21]:
# Numeric code for each deck letter; "U" is the placeholder deck given to
# passengers whose Cabin value is missing.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train, test]

for dataset in data:
    # Missing cabins become "U0" so that they land on the "U" deck.
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    # The first run of letters in the cabin string is the deck letter.
    # str.extract is vectorized and yields NaN (instead of raising) when a
    # cabin string contains no letters at all.
    deck_letter = dataset['Cabin'].str.extract("([a-zA-Z]+)", expand=False)
    # Letters absent from the `deck` table (and no-letter cabins) map to 0.
    dataset['Deck'] = deck_letter.map(deck).fillna(0).astype(int)

# Deck replaces Cabin as a feature, so Cabin can be dropped from both frames.
train = train.drop(['Cabin'], axis=1)
test = test.drop(['Cabin'], axis=1)

### Impute missing Age values with random draws within 1 SD of the mean

In [22]:
# Fill missing ages with random integers drawn from [mean - std, mean + std).
# Seed for the imputation draw so that re-running the notebook gives the same
# filled-in ages.
AGE_SEED = 42
data = [train, test]

# Both statistics come from the training set and are computed once, before any
# imputation, so the two frames share the same range and the values filled
# into `train` cannot shift the mean used for `test`.
mean = train["Age"].mean()
std = train["Age"].std()
rng = np.random.default_rng(AGE_SEED)

for dataset in data:
    missing = dataset["Age"].isnull()
    is_null = missing.sum()
    # One random integer age per missing entry; bounds are cast explicitly
    # because the mean and std are floats.
    rand_age = rng.integers(int(mean - std), int(mean + std), size=is_null)
    # Work on a copy, fill only the missing positions, then write back.
    age_slice = dataset["Age"].copy()
    age_slice.loc[missing] = rand_age
    # Convert this frame's own ages to int (not the training frame's ages).
    dataset["Age"] = age_slice.astype(int)

In [23]:
# Lookup tables that turn the categorical columns into integer codes.
genders = {"male": 0, "female": 1}
ports = {"S": 0, "C": 1, "Q": 2}
# Port substituted for missing Embarked values.
common_value = 'S'
data = [train, test]

for dataset in data:
    # Sex -> 0/1 via the `genders` table.
    dataset['Sex'] = dataset['Sex'].map(genders)
    # Embarked: fill gaps with the fallback port, then encode via `ports`.
    dataset['Embarked'] = dataset['Embarked'].fillna(common_value).map(ports)
    # Fare: missing values become 0, then the fare is truncated to an integer.
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

In [24]:
# Ticket and Name are not kept as model features; remove them from both frames.
train = train.drop(columns=['Ticket', 'Name'])
test = test.drop(columns=['Ticket', 'Name'])

In [29]:
# Target column for training.
Y_train = train["Survived"]
# Feature matrices: everything except the target and the passenger identifier.
X_train = train.drop(columns=["Survived", 'PassengerId'])
X_test = test.drop(columns="PassengerId").copy()

In [50]:
# Preview the engineered training features.
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,3,0,22,1,0,7,0,8
1,1,1,38,1,0,71,1,3
2,3,1,26,0,0,7,0,8
3,1,1,35,1,0,53,0,3
4,3,0,35,0,0,8,0,8


In [48]:
# Same preview as a raw NumPy array (rows in the column order of X_train).
X_train.head().values

array([[ 3,  0, 22,  1,  0,  7,  0,  8],
       [ 1,  1, 38,  1,  0, 71,  1,  3],
       [ 3,  1, 26,  0,  0,  7,  0,  8],
       [ 1,  1, 35,  1,  0, 53,  0,  3],
       [ 3,  0, 35,  0,  0,  8,  0,  8]], dtype=int64)

### Random forest model

In [34]:
# Fit a 100-tree random forest on the engineered training features.
# random_state is pinned so that re-running the notebook rebuilds the same
# forest and therefore the same predictions and score.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

# Predicted labels for the test features.
Y_prediction = random_forest.predict(X_test)

# Accuracy on the training data itself, as a percentage rounded to 2 decimals.
# NOTE: this is measured on the same rows the forest was fitted on, so it is
# an optimistic figure rather than an estimate of generalization.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

### Pickle model

In [38]:
# Persist the fitted forest to "model.pkl" with joblib; the call returns the
# list of file names it wrote.
joblib.dump(random_forest, "model.pkl")

['model.pkl']

In [39]:
# Reload the persisted model from disk.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so it should
# only be pointed at files from a trusted source (here: the file written above).
classifer = joblib.load("model.pkl")

In [49]:
# Two hand-written feature rows, in the column order of X_train:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, Deck.
new_observation = [
    [3, 0, 22, 1, 0, 7, 0, 8],
    [1, 1, 38, 1, 0, 71, 1, 3],
]

# Ask the reloaded model for the predicted class of each row.
classifer.predict(new_observation)

array([0, 1], dtype=int64)