# Titanic Project: Support Vector Machine (SVM) 

## Step 1: Import packages

In [1]:
import os
import warnings

import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
# Block unwanted warnings
warnings.filterwarnings('ignore', category = FutureWarning)
warnings.filterwarnings('ignore', category = DeprecationWarning)

## Step 2: Import the dataset

In [3]:
# Folder that holds the training CSV files. The default is the original
# hard-coded location, so behaviour is unchanged out of the box; set the
# TITANIC_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get("TITANIC_DATA_DIR", r"C:\Users\smart\Desktop\GitHub\Titanic\data")

# Training features: the first row of the file is used as the column header.
tr_features = pd.read_csv(os.path.join(DATA_DIR, "X_train.csv"))
# Training labels: header=None tells pandas the file has no header row,
# so every line is read as data.
tr_labels = pd.read_csv(os.path.join(DATA_DIR, "y_train.csv"), header=None)

In [4]:
# Report how many rows the feature table has, then preview its first rows.
n_feature_rows = tr_features.shape[0]
print("number of rows in training features: ", n_feature_rows)
tr_features.head()

number of rows in training features:  534


Unnamed: 0,Pclass,Sex,Age,Fare,Family_cnt,Cabin_ind
0,2,0,62.0,10.5,0,0
1,3,0,8.0,29.125,5,0
2,3,0,32.0,56.4958,0,0
3,3,1,20.0,9.825,1,0
4,2,1,28.0,13.0,0,0


In [5]:
# Report how many rows the label table has, then preview its first rows.
n_label_rows = tr_labels.shape[0]
print("number of rows in training labels: ", n_label_rows)
tr_labels.head()

number of rows in training labels:  534


Unnamed: 0,0
0,1
1,0
2,1
3,0
4,1


## Step 3: Explore the model & its hyperparameters

In [6]:
# The two hyperparameters that matter most when tuning this model:
#
#   C      : an inverse measure of regularization
#              low C  -> high regularization, risk of underfitting
#              high C -> low regularization, risk of overfitting
#
#   kernel : the kernel trick increases the dimension of the data to make it
#            linearly separable; the kernel type controls which
#            transformation the data goes through
#
# Instantiate the classifier with its defaults so the cell displays them.
SVC()

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [7]:
# List the attribute and method names available on the SVC class.
dir(SVC)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_estimator_type',
 '_get_coef',
 '_get_param_names',
 '_impl',
 '_pairwise',
 '_predict_log_proba',
 '_predict_proba',
 '_sparse_decision_function',
 '_sparse_fit',
 '_sparse_kernels',
 '_sparse_predict',
 '_sparse_predict_proba',
 '_validate_for_predict',
 '_validate_targets',
 '_warn_from_fit_status',
 'coef_',
 'decision_function',
 'fit',
 'get_params',
 'predict',
 'predict_log_proba',
 'predict_prob

## Step 4: Create a function for hyperparameter evaluation 

In [8]:


def print_results(results):
    """Print a summary of a fitted hyperparameter search.

    The first line shows the best parameter combination, followed by a blank
    line. After that, one line is printed per candidate: its mean test score
    and twice the standard deviation of its test scores, both rounded to three
    decimals, together with the parameters that produced them.

    Parameters
    ----------
    results : object
        Must expose ``best_params_`` and a ``cv_results_`` mapping with the
        keys ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
        (for example, the fitted ``GridSearchCV`` used in this notebook).
    """
    print('BEST PARAM: {}\n'.format(results.best_params_))

    summary = results.cv_results_
    rows = zip(summary['mean_test_score'], summary['std_test_score'], summary['params'])
    for avg_score, score_std, combo in rows:
        # The spread is reported as two standard deviations.
        line = '{} (+/-{}) for {}'.format(round(avg_score, 3), round(score_std * 2, 3), combo)
        print(line)
        
        

## Step 5: Run k-fold Cross Validation (CV) on the data; compare results for different hyperparameters

In [9]:
# Hyperparameters explored in this search (the two most important for SVC):
#
#   C      : an inverse measure of regularization
#              low C  -> high regularization, risk of underfitting
#              high C -> low regularization, risk of overfitting
#
#   kernel : the kernel trick increases the dimension of the data to make it
#            linearly separable; the kernel type controls which
#            transformation the data goes through
#              options: 'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'

# Candidate values: every kernel is paired with every C.
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
)
svc = SVC()

# Grid search scored with 5-fold cross-validation.
cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

# Flatten the label frame into a 1-D array before fitting.
tr_labels_flat = tr_labels.values.ravel()
cv.fit(tr_features, tr_labels_flat)

print_results(cv)

BEST PARAM: {'C': 0.1, 'kernel': 'linear'}

0.796 (+/-0.116) for {'C': 0.1, 'kernel': 'linear'}
0.624 (+/-0.005) for {'C': 0.1, 'kernel': 'rbf'}
0.796 (+/-0.116) for {'C': 1, 'kernel': 'linear'}
0.67 (+/-0.082) for {'C': 1, 'kernel': 'rbf'}
0.796 (+/-0.116) for {'C': 10, 'kernel': 'linear'}
0.687 (+/-0.085) for {'C': 10, 'kernel': 'rbf'}


## Step 6: Select model with the best results

In [10]:
# Display the estimator that the grid search picked as the model with the
# best results.
cv.best_estimator_

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Step 7: Write out the selected model

In [11]:
# Folder the fitted model is written to. The default is the original
# hard-coded location, so behaviour is unchanged out of the box; set the
# TITANIC_MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.environ.get("TITANIC_MODEL_DIR", r"C:\Users\smart\Desktop\GitHub\Titanic\models")

# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the best estimator found by the grid search. The call stays the
# last expression so the cell still displays the list of written files.
joblib.dump(cv.best_estimator_, os.path.join(MODEL_DIR, "SVM_model.pkl"))

['C:\\Users\\smart\\Desktop\\GitHub\\Titanic\\models\\SVM_model.pkl']