In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and the
# bare 'seaborn' alias was later removed, so prefer the new name and only fall
# back to the legacy one on older installations.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

In [3]:
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.base import clone
from sklearn.preprocessing import scale, StandardScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [4]:
import warnings
# Silence every warning category for the rest of the kernel session. This also
# hides deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# NOTE(review): IPython / display are not referenced in the cells visible
# here -- confirm they are needed further down before keeping them.
import IPython
from IPython import display

In [5]:
# Forward slashes resolve correctly on Windows as well as on Linux/macOS,
# whereas a literal backslash is only a path separator on Windows.
raw_train_data = pd.read_csv('data/train.csv')
raw_test_data = pd.read_csv('data/test.csv')

In [7]:
# Peek at five randomly chosen training rows. No random_state is passed, so a
# different set of rows is shown on every run.
raw_train_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
397,398,0,2,"McKane, Mr. Peter David",male,46.0,0,0,28403,26.0,,S
738,739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S
278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.125,,Q
789,790,0,1,"Guggenheim, Mr. Benjamin",male,46.0,0,0,PC 17593,79.2,B82 B84,C
358,359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q


In [8]:
def _fill_with_class_mean(DF, column):
    """Fill missing ``DF[column]`` values in place from per-``Pclass`` means.

    Each missing value receives the truncated (``int``) mean of ``column``
    among passengers of the same ``Pclass``. Only classes that actually occur
    and have at least one known value are used, so a frame that lacks a class
    no longer raises. Values still missing afterwards (unknown class, or a
    class with no known values) fall back to the truncated overall mean.
    """
    # Both statistics are taken before any value is filled in.
    overall_mean = DF[column].mean()
    class_means = DF.groupby('Pclass')[column].mean().dropna()

    for pclass, class_mean in class_means.items():
        missing_in_class = DF[column].isnull() & (DF.Pclass == pclass)
        DF.loc[missing_in_class, column] = int(class_mean)

    if DF[column].isnull().any() and pd.notna(overall_mean):
        DF[column] = DF[column].fillna(int(overall_mean))


def clean(Original_DF):
    """Convert a raw Titanic passenger frame into a numeric feature matrix.

    The input is copied first, so the caller's frame is never modified.

    Processing steps:
      * ``Age`` / ``Fare``: missing values are imputed with the truncated mean
        of the passenger's ``Pclass``; ``Age`` is then cast to ``int``.
      * ``Sex``: 'male' -> 1, 'female' -> 0.
      * ``Pclass``: missing values become 4.
      * ``FamilySize``: ``SibSp + Parch``.
      * ``Embarked``: missing values become 'S', then one-hot encoded against
        a fixed C/Q/S category list so every frame yields the same columns.
      * ``Cabin``: 1 when a cabin is recorded, 0 otherwise.

    Parameters
    ----------
    Original_DF : pandas.DataFrame
        Must contain the columns ``Pclass``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Pclass, Sex, Age, FamilySize, Fare, Cabin, Embarked_C,
        Embarked_Q, Embarked_S`` (plus one trailing ``Embarked_*`` column for
        any port other than C/Q/S that appears in the data).

    Raises
    ------
    ValueError
        If ``Age`` has no known value at all, so it cannot be cast to ``int``.
    """
    DF = Original_DF.copy()

    _fill_with_class_mean(DF, 'Age')
    DF['Age'] = DF.Age.astype(int)

    _fill_with_class_mean(DF, 'Fare')

    sex_mapping = {'male': 1, 'female': 0}
    DF['Sex'] = DF.Sex.map(sex_mapping)

    # Done after the imputation above so that passengers of unknown class do
    # not form their own group for the class means.
    DF['Pclass'] = DF.Pclass.fillna(4)

    DF['FamilySize'] = DF.SibSp + DF.Parch

    # A fixed category list keeps the dummy columns identical for every frame,
    # even when a port is absent from one of them. Unexpected ports keep their
    # own column, appended after the known ones.
    known_ports = ['C', 'Q', 'S']
    DF['Embarked'] = DF.Embarked.fillna('S')
    extra_ports = sorted(set(DF.Embarked) - set(known_ports), key=str)
    DF['Embarked'] = pd.Categorical(DF.Embarked, categories=known_ports + extra_ports)

    # Presence flag: 1 when a cabin is recorded, 0 when it is missing.
    DF['Cabin'] = DF.Cabin.notna().astype('int64')

    features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked', 'Cabin']

    DF = DF[features]
    DF = pd.get_dummies(DF)
    return DF

In [9]:
train_X = clean(raw_train_data)
# One-hot encoding is done per frame, so force the test matrix onto exactly the
# training columns (same set, same order); a column missing from the test frame
# is filled with 0. This keeps the fitted model's feature layout consistent.
test_X = clean(raw_test_data).reindex(columns=train_X.columns, fill_value=0)

y = raw_train_data.Survived

In [10]:
# First five rows of the cleaned training features (head() defaults to 5).
train_X.head()

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22,1,7.25,0,0,0,1
1,1,0,38,1,71.2833,1,1,0,0
2,3,0,26,0,7.925,0,0,0,1
3,1,0,35,1,53.1,1,0,0,1
4,3,1,35,0,8.05,0,0,0,1


In [11]:
# Features and target side by side in one new frame; train_X itself is untouched.
P = train_X.assign(y=y)

In [12]:
# scaler = StandardScaler().fit(train_X)
# train_X = scaler.transform(train_X)
# test_X = scaler.transform(test_X)

In [13]:
# Standardise only the two continuous columns. The scaler is fitted on the
# training data and the same fitted transform is then applied to both frames.
continuous_cols = ['Age', 'Fare']
scaler = StandardScaler().fit(train_X[continuous_cols])
for frame in (train_X, test_X):
    frame[continuous_cols] = scaler.transform(frame[continuous_cols])

In [14]:
# train_X is already a DataFrame here, so it can be previewed directly.
train_X.head()

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,3,1,-0.547712,1,-0.502445,0,0,0,1
1,1,0,0.663235,1,0.786845,1,1,0,0
2,3,0,-0.244976,0,-0.488854,0,0,0,1
3,1,0,0.436182,1,0.42073,1,0,0,1
4,3,1,0.436182,0,-0.486337,0,0,0,1


In [15]:
# Search grid for the SVC: C over three decades, both gamma heuristics.
parameters = dict(
    C=[1, 10, 100],
    gamma=['auto', 'scale'],
)

In [16]:
# Exhaustive search over `parameters` for a probability-enabled SVC.
gscv = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=parameters,
)

In [17]:
# Evaluate every C/gamma combination from `parameters` on the training data,
# using GridSearchCV's default cross-validation settings.
gscv.fit(train_X, y)

GridSearchCV(estimator=SVC(probability=True),
             param_grid={'C': [1, 10, 100], 'gamma': ['auto', 'scale']})

In [18]:
# Cross-validation results as a table, best-ranked parameter set first.
score = (
    pd.DataFrame(gscv.cv_results_)
    .sort_values(by='rank_test_score')
)

In [19]:
# Top five parameter sets, restricted to the columns worth reading.
score.head()[['params', 'mean_test_score', 'rank_test_score', 'std_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,std_test_score
3,"{'C': 10, 'gamma': 'scale'}",0.827148,1,0.029451
1,"{'C': 1, 'gamma': 'scale'}",0.822673,2,0.014881
2,"{'C': 10, 'gamma': 'auto'}",0.821537,3,0.032314
0,"{'C': 1, 'gamma': 'auto'}",0.820426,4,0.017045
5,"{'C': 100, 'gamma': 'scale'}",0.80135,5,0.026706


In [20]:
# Predict survival for the test passengers with the fitted grid search.
predictions = gscv.predict(test_X)
# NOTE(review): the header is written as 'PassengerID' while the source column
# is spelled 'PassengerId' -- confirm which spelling the consumer expects.
output = pd.DataFrame({'PassengerID': raw_test_data.PassengerId, 'Survived': predictions})
# A forward slash works on Windows and POSIX alike; a literal backslash would
# become part of the file name on Linux/macOS.
output.to_csv('preds/svc2_3.csv', index=False)

In [21]:
# Summary statistics of the numeric columns in the raw (uncleaned) training data.
raw_train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292
