In [1]:
import sys
import pandas as pd
import numpy as np
import sklearn

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed, so prefer the new name when this matplotlib
# ships it and fall back to the old one on older releases.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')

In [3]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.base import clone
from sklearn.preprocessing import scale, StandardScaler

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [4]:
import warnings
# Silence every warning category for the rest of the session to keep cell output short.
warnings.simplefilter('ignore')

import IPython
from IPython import display

In [5]:
# Forward slashes are accepted as path separators on Windows as well as on POSIX
# systems, whereas a backslash is only a separator on Windows.
raw_train_data = pd.read_csv('data/train.csv')
raw_test_data = pd.read_csv('data/test.csv')

In [6]:
def _impute_by_class_mean(DF, column):
    """Fill missing `column` values in place with the truncated mean of the row's Pclass.

    Only classes that have at least one observed value contribute a mean, so a
    frame that lacks a class (or has a class with no observed values) does not
    fail on ``int(nan)``; entries of such a class are left missing.
    """
    class_means = DF.groupby('Pclass')[column].mean().dropna()
    for pclass, mean_value in class_means.items():
        missing = DF[column].isnull() & (DF.Pclass == pclass)
        # int() truncates the class mean, so the fill value is a whole number.
        DF.loc[missing, column] = int(mean_value)


def clean(Original_DF):
    """Build the model feature matrix from a raw passenger frame.

    The input is copied, never modified. Steps:

    * missing ``Age`` and ``Fare`` are filled with the truncated mean of the
      passenger's ``Pclass``;
    * ``Sex`` is mapped to 1 (male) / 0 (female);
    * missing ``Pclass`` becomes 0 and missing ``Embarked`` becomes ``'S'``;
    * ``FamilySize`` is ``SibSp + Parch``;
    * ``Cabin`` becomes 1 when a cabin is recorded, otherwise 0;
    * ``Embarked`` is one-hot encoded over the fixed ports C, Q and S;
    * ``Age`` and ``Fare`` are standardised with ``scale``.

    Parameters
    ----------
    Original_DF : pandas.DataFrame
        Frame with the columns Pclass, Sex, Age, SibSp, Parch, Fare, Cabin and
        Embarked.

    Returns
    -------
    pandas.DataFrame
        Columns Pclass, Sex, Age, FamilySize, Fare, Cabin, Embarked_C,
        Embarked_Q, Embarked_S, in that order.

    Notes
    -----
    The class means and the scaling statistics are computed from the frame that
    is passed in, so separate calls (e.g. for train and test data) each use
    their own statistics.
    """
    DF = Original_DF.copy()

    # Impute before Pclass itself is filled, so rows without a class are not
    # treated as belonging to a real one.
    _impute_by_class_mean(DF, 'Age')
    _impute_by_class_mean(DF, 'Fare')

    DF['Sex'] = DF.Sex.map({'male': 1, 'female': 0})
    DF['Pclass'] = DF.Pclass.fillna(0)
    DF['FamilySize'] = DF.SibSp + DF.Parch

    # Only the presence of a cabin entry is used, not its value.
    DF['Cabin'] = DF.Cabin.notna().astype('int64')

    # A categorical with fixed categories makes get_dummies emit the same three
    # Embarked_* columns for every frame, even when a port does not occur in it.
    # A value outside C/Q/S becomes missing and encodes as all zeros.
    DF['Embarked'] = pd.Categorical(DF.Embarked.fillna('S'), categories=['C', 'Q', 'S'])

    features = ['Pclass', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked', 'Cabin']
    DF = pd.get_dummies(DF[features], columns=['Embarked'])

    DF['Age'] = scale(DF.Age)
    DF['Fare'] = scale(DF.Fare)
    return DF

In [7]:
train_X = clean(raw_train_data)
# Align the test features to the training columns: each frame is encoded on its
# own, so a category present in only one file would otherwise give the model a
# different set (or order) of columns at prediction time.
test_X = clean(raw_test_data).reindex(columns=train_X.columns, fill_value=0)

y = raw_train_data.Survived

In [8]:
# Training features with the target appended as column 'y'.
P = train_X.assign(y=y)

In [9]:
# scaler = StandardScaler().fit(train_X)
# train_X = scaler.transform(train_X)
# test_X = scaler.transform(test_X)

In [10]:
# Search space for the RBF-kernel SVC: C spans 10^1..10^4, gamma spans 10^-1..10^-3.
parameters = dict(
    C=[10, 100, 1000, 10000],
    kernel=['rbf'],
    gamma=[1e-1, 1e-2, 1e-3],
)

In [11]:
# Exhaustive search over `parameters` with a default-configured SVC.
gscv = GridSearchCV(estimator=SVC(), param_grid=parameters)

In [12]:
# Run the grid search on the training features and labels; the fitted search
# object is the cell's displayed value.
gscv.fit(train_X, y)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 100, 1000, 10000],
                         'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']})

In [13]:
# One row per parameter combination, best-ranked first.
score = (
    pd.DataFrame(gscv.cv_results_)
    .sort_values(by='rank_test_score')
)

In [14]:
# Top five parameter combinations with their mean and spread of the test score.
score.loc[:, ['params', 'mean_test_score', 'rank_test_score', 'std_test_score']].head(5)

Unnamed: 0,params,mean_test_score,rank_test_score,std_test_score
7,"{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}",0.827136,1,0.020713
4,"{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}",0.826025,2,0.01436
0,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}",0.823778,3,0.029727
11,"{'C': 10000, 'gamma': 0.001, 'kernel': 'rbf'}",0.821537,4,0.014084
10,"{'C': 10000, 'gamma': 0.01, 'kernel': 'rbf'}",0.80585,5,0.028892


In [15]:
predictions = gscv.predict(test_X)
# Spell the id header exactly like the source column ('PassengerId'); the previous
# 'PassengerID' header did not match it.
# NOTE(review): presumably the submission format expects this exact header — confirm.
output = pd.DataFrame({'PassengerId': raw_test_data.PassengerId, 'Survived': predictions})
# Forward slash so the file lands inside the preds directory on every platform.
output.to_csv('preds/svc2_3.csv', index=False)

In [16]:
# Fixed seed so the displayed rows are the same on every Restart & Run All;
# train_X is already a DataFrame, so no re-wrapping is needed.
train_X.sample(10, random_state=0)

Unnamed: 0,Pclass,Sex,Age,FamilySize,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
143,3,1,-0.776454,0,-0.512513,0,0,1,0
44,3,0,-0.776454,0,-0.489776,0,0,1,0
864,2,1,-0.397796,0,-0.386671,0,0,0,1
623,3,1,-0.624991,0,-0.49028,0,0,0,1
554,3,0,-0.549259,0,-0.491874,0,0,0,1
472,2,0,0.283787,3,-0.089684,0,0,0,1
628,3,1,-0.246333,0,-0.489442,0,0,0,1
698,1,1,1.495492,2,1.584179,1,1,0,0
754,2,0,1.41976,3,0.660333,0,0,0,1
655,2,1,-0.397796,2,0.831478,0,0,0,1


In [17]:
# Print the column names, non-null counts and dtypes of the prepared training features.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      891 non-null    int64  
 1   Sex         891 non-null    int64  
 2   Age         891 non-null    float64
 3   FamilySize  891 non-null    int64  
 4   Fare        891 non-null    float64
 5   Cabin       891 non-null    int64  
 6   Embarked_C  891 non-null    uint8  
 7   Embarked_Q  891 non-null    uint8  
 8   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 44.5 KB
