## ***Preprocess the test data***

In [1]:
import pandas as pd

In [2]:
# Read the raw test split; the relative path is resolved against the kernel's
# current working directory.
test_csv_path = 'test.csv'
data = pd.read_csv(test_csv_path)

# Peek at the first five rows (the default row count for head()).
data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Summarize every column, numeric and non-numeric alike (include='all'), so the
# per-column counts reveal which columns contain missing values.
data.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,332.0,418.0,418.0,418,417.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.27259,0.447368,0.392344,,35.627188,,
std,120.810458,0.841838,,,14.181209,0.89676,0.981429,,55.907576,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,21.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,27.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,39.0,1.0,0.0,,31.5,,


In [7]:
# Age is present for only 332 of the 418 rows; fill the gaps with 30, which is
# approximately the column mean (30.27) reported by describe().
data['Age'] = data['Age'].fillna(30)

# Fare is missing in a single row; carry the previous row's fare forward.
# Series.ffill() is the supported spelling: fillna(method='ffill') has been
# deprecated since pandas 2.1 and emits a FutureWarning.
data['Fare'] = data['Fare'].ffill()

In [8]:
# Re-run the summary after imputation: the Age and Fare counts should now match
# the total number of rows.
data.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,418.0,418.0,418,418,418.0,418.0,418.0,418,418.0,91,418
unique,,,418,2,,,,363,,76,3
top,,,"Kelly, Mr. James",male,,,,PC 17608,,B57 B59 B63 B66,S
freq,,,1,266,,,,5,,3,270
mean,1100.5,2.26555,,,30.216507,0.447368,0.392344,,35.560845,,
std,120.810458,0.841838,,,12.635016,0.89676,0.981429,,55.856972,,
min,892.0,1.0,,,0.17,0.0,0.0,,0.0,,
25%,996.25,1.0,,,23.0,0.0,0.0,,7.8958,,
50%,1100.5,3.0,,,30.0,0.0,0.0,,14.4542,,
75%,1204.75,3.0,,,35.75,1.0,0.0,,31.471875,,


In [9]:
# Encode Sex as an integer flag: male -> 0, female -> 1.
# Series.map yields NaN for any value that is not a key of the mapping, so
# re-running this cell on the already-encoded column would blank it out.
sex_codes = {'male': 0, 'female': 1}
data['Sex'] = data['Sex'].map(sex_codes)


In [10]:
# Spot-check the first three rows to confirm Sex now holds 0/1 codes.
data.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q


In [11]:
# One-hot encode the port of embarkation into C / Q / S indicator columns.
# dtype=int pins the indicators to 0/1 integers; since pandas 2.0 the default
# dtype is bool, which would produce True/False columns instead.
dummies = pd.get_dummies(data['Embarked'], dtype=int)
dummies

Unnamed: 0,C,Q,S
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1
...,...,...,...
413,0,0,1
414,1,0,0
415,0,0,1
416,0,0,1


In [12]:
# Append the one-hot port indicators to the passenger frame as extra columns;
# rows are aligned on the shared index.
merged = pd.concat((data, dummies), axis='columns')
merged

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,,S,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,30.0,0,0,A.5. 3236,8.0500,,S,0,0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,C,1,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0,0,1
416,1308,3,"Ware, Mr. Frederick",0,30.0,0,0,359309,8.0500,,S,0,0,1


In [13]:
# Embarked is now represented by the C / Q / S indicators; Name, Ticket and
# Cabin are left out of the feature matrix.
columns_to_drop = ['Embarked', 'Name', 'Ticket', 'Cabin']

# drop() returns a new frame, so `merged` itself is left unchanged.
dropped_columns = merged.drop(columns=columns_to_drop)

In [14]:
# Preview the columns that remain after the drop.
dropped_columns.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,892,3,0,34.5,0,0,7.8292,0,1,0
1,893,3,1,47.0,1,0,7.0,0,0,1
2,894,2,0,62.0,0,0,9.6875,0,1,0
3,895,3,0,27.0,0,0,8.6625,0,0,1
4,896,3,1,22.0,1,1,12.2875,0,0,1


In [15]:
from sklearn import preprocessing

In [16]:
# Standardize every column of the feature frame with sklearn's preprocessing.scale.
# NOTE(review): the centering/scaling statistics come from this test set itself.
# If the model was trained on features scaled with training-set statistics, the
# scaler fitted there should be reused here instead — confirm against the
# training notebook.
# NOTE(review): PassengerId is still a column of dropped_columns, so the
# identifier is scaled and saved as if it were a feature — confirm this is intended.
scaled_test_data = preprocessing.scale(dropped_columns)

In [17]:
# Display the scaled array; NumPy abbreviates large arrays with "..." in the repr.
scaled_test_data

array([[-1.72791209,  0.87348191, -0.75592895, ..., -0.56814154,
         2.84375747, -1.35067551],
       [-1.71962474,  0.87348191,  1.32287566, ..., -0.56814154,
        -0.35164743,  0.74037028],
       [-1.71133739, -0.31581919, -0.75592895, ..., -0.56814154,
         2.84375747, -1.35067551],
       ...,
       [ 1.71133739,  0.87348191, -0.75592895, ..., -0.56814154,
        -0.35164743,  0.74037028],
       [ 1.71962474,  0.87348191, -0.75592895, ..., -0.56814154,
        -0.35164743,  0.74037028],
       [ 1.72791209,  0.87348191, -0.75592895, ...,  1.76012477,
        -0.35164743, -1.35067551]])

In [18]:
import numpy as np

In [19]:
# Persist the scaled matrix as an uncompressed .npz archive. np.savez appends
# the ".npz" extension when the name lacks it, and the array is stored under
# the key "input".
output_name = 'final_test'
np.savez(output_name, input=scaled_test_data)

In [None]:
# NOTE(review): `pre` is not assigned in any cell visible here and this cell has
# no execution count; as written it would raise NameError on Restart & Run All.
# It looks like an unfinished expression — complete it or delete the cell.
pre