# Titanic Survival Prediction - SVM

## Introduction

Classification use SVM

## Steps
1. Load Data
2. Feature engineering
3. Train model with NN back propagation
4. Conclusion

### Load data

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import make_scorer, precision_score,accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

Some observations:
- We can drop Name,Ticket column
- We can drop Fare due to pclass already represent
- Cabin contains NaN value but its also potental relate to survival
- Combine column: SibSp/Parch

### Feature Engineering

In [3]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


Check missing values

In [4]:
print(pd.isnull(train).sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


- Drop Cabin due to having many Null values


In [5]:
train.pop('Name');
train.pop('Ticket');
train.pop('Cabin');

In [6]:
print(pd.isnull(train).sum())

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64


In [7]:
train["Age"].fillna(train["Age"].mean(),inplace=True) 
train=train.dropna(how='any')  

### plot data

In [8]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
886,887,0,2,male,27.0,0,0,13.0,S
887,888,1,1,female,19.0,0,0,30.0,S
888,889,0,3,female,29.699118,1,2,23.45,S
889,890,1,1,male,26.0,0,0,30.0,C
890,891,0,3,male,32.0,0,0,7.75,Q


In [9]:
train['Sex']= train['Sex'].astype('category')
train['Sex']=train['Sex'].cat.codes
train['Embarked']= train['Embarked'].astype('category')
train['Embarked']=train['Embarked'].cat.codes

In [10]:
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
886,887,0,2,1,27.0,0,0,13.0,2
887,888,1,1,0,19.0,0,0,30.0,2
888,889,0,3,0,29.699118,1,2,23.45,2
889,890,1,1,1,26.0,0,0,30.0,0
890,891,0,3,1,32.0,0,0,7.75,1


In [11]:
# Mapping Fare
train.loc[train['Fare'] <= 7.91, 'Fare']       = 0
train.loc[(train['Fare'] > 7.91) & (train['Fare'] <= 14.454), 'Fare'] = 1
train.loc[(train['Fare'] > 14.454) & (train['Fare'] <= 31), 'Fare']   = 2
train.loc[ train['Fare'] > 31, 'Fare']  = 3
train['Fare'] = train['Fare'].astype(int)


In [12]:
# # Mapping Age
train.loc[ train['Age'] <= 16, 'Age']= 0
train.loc[(train['Age'] > 16) & (train['Age'] <= 32), 'Age'] = 1
train.loc[(train['Age'] > 32) & (train['Age'] <= 48), 'Age'] = 2
train.loc[(train['Age'] > 48) & (train['Age'] <= 64), 'Age'] = 3
train.loc[ train['Age'] > 64, 'Age']= 4


In [13]:
# train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

## Train data

### Build column


In [14]:
X=train.drop(['PassengerId','Survived'],axis=1)
y=train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


In [15]:
model = svm.SVC(kernel='rbf')
model.fit(X_train, y_train)
model.score(X_train, y_train)


0.8275

### Evaluate model

In [16]:
result = model.predict(X_test)
accuracy_score(result, y_test)

0.8089887640449438

In [17]:
y_test.tail()

721    0
174    0
516    1
667    0
650    0
Name: Survived, dtype: int64

### Prediction

In [18]:
test.pop('Name');
test.pop('Ticket');
test.pop('Cabin');
test["Age"].fillna(test["Age"].mean(),inplace=True) 
test["Embarked"].fillna('S',inplace=True) 
test['Sex']= test['Sex'].astype('category')
test['Sex']=test['Sex'].cat.codes
test['Embarked']= test['Embarked'].astype('category')
test['Embarked']=test['Embarked'].cat.codes
test["Fare"].fillna(test["Fare"].mean(),inplace=True) 

In [19]:
# Mapping Fare
test.loc[test['Fare'] <= 7.91, 'Fare']       = 0
test.loc[(test['Fare'] > 7.91) & (test['Fare'] <= 14.454), 'Fare'] = 1
test.loc[(test['Fare'] > 14.454) & (test['Fare'] <= 31), 'Fare']   = 2
test.loc[ test['Fare'] > 31, 'Fare']  = 3
test['Fare'] = test['Fare'].astype(int)

In [20]:

test.loc[ test['Age'] <= 16, 'Age']= 0
test.loc[(test['Age'] > 16) & (test['Age'] <= 32), 'Age'] = 1
test.loc[(test['Age'] > 32) & (test['Age'] <= 48), 'Age'] = 2
test.loc[(test['Age'] > 48) & (test['Age'] <= 64), 'Age'] = 3
test.loc[ test['Age'] > 64, 'Age']= 4

In [21]:
# test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [22]:
test.tail()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
413,1305,3,1,1.0,0,0,1,2,1
414,1306,1,0,2.0,0,0,3,0,1
415,1307,3,1,2.0,0,0,0,2,1
416,1308,3,1,1.0,0,0,1,2,1
417,1309,3,1,1.0,1,1,2,0,3


In [23]:
passengerid = test['PassengerId']
test=test.drop(['PassengerId'],axis=1)

In [24]:
print(pd.isnull(test).sum())

Pclass        0
Sex           0
Age           0
Fare          0
Embarked      0
FamilySize    0
dtype: int64


In [25]:
predictions = model.predict(test)


In [26]:
predictions.size

418

In [27]:

output = pd.DataFrame({ 'PassengerId' : passengerid, 'Survived': predictions})
output.to_csv('./output/submission.csv', index=False)

In [28]:
output.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0
