Import relevant 

In [392]:
# Visualization
import seaborn as sb
import matplotlib as mpl
import bokeh as bok

# Machine Learning 
import sklearn as sk

# Data ETL
import scipy as sp
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

pd.options.mode.chained_assignment = None

In [381]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

num_cols  = ['Age','SibSp','Parch','Fare']
fact_cols = ['Pclass','Sex', 'Embarked']

train_num = train[num_cols]
test_num = test[num_cols]
train_fact = train[fact_cols]
test_fact = test[fact_cols]

In [401]:
pd.Categorical(test_fact['Embarked']).categories
# print(test_fact[test_fact['Embarked'] == ""])
pd.Categorical(train_fact['Embarked']).categories

Empty DataFrame
Columns: [Pclass, Sex, Embarked]
Index: []


Index(['C', 'Q', 'S'], dtype='object')

Manage categorical variables, one-hot-encoding them

In [383]:
# .astype(str) used due to mixed types in columns being applied to seemingly
# See https://stackoverflow.com/questions/46406720/labelencoder-typeerror-not-supported-between-instances-of-float-and-str
int_enc_train = train_fact.astype(str).apply(LabelEncoder().fit_transform)
int_enc_test  = test_fact.astype(str).apply(LabelEncoder().fit_transform)

# One hot encode now-numerical columns
cats = [[0,1,2],[0,1],[0,1,2,3]]
train_fact_enc = pd.DataFrame(OneHotEncoder(categories=cats, sparse=False).fit_transform(int_enc_train))
test_fact_enc = pd.DataFrame(OneHotEncoder(categories=cats, sparse=False).fit_transform(int_enc_test)) 

# Correct column names
col_names = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Male', 'Female', 'Embark_1', 'Embark_2', 'Embark_3', 'Embark_4']
train_fact_enc.columns = col_names
test_fact_enc.columns =  col_names

In [384]:
# train_fact_enc.head(50)
# test_fact_enc.head(50)

Manage numerical variables

In [385]:
## Standardize Age
# Replace NaNs with median for age column

## Train
age_median = train_num['Age'].median()
train_num['Age'] = train_num['Age'].fillna(age_median)
# Create and fit scaler for column
scaler = StandardScaler(with_mean=True, with_std=True)
age_np = np.array(train_num['Age']).reshape(-1,1)
scaler.fit(age_np)
# Transform and replace column
train_num['Age'] = scaler.transform(age_np)

## Test
age_median = test_num['Age'].median()
test_num['Age'] = test_num['Age'].fillna(age_median)
# Create and fit scaler for column
scaler = StandardScaler(with_mean=True, with_std=True)
age_np = np.array(test_num['Age']).reshape(-1,1)
scaler.fit(age_np)
# Transform and replace column
test_num['Age'] = scaler.transform(age_np)

In [386]:
## Standardize Fare
# Replace NaNs with median for age column

## Train
fare_median = train_num['Fare'].median()
train_num['Fare'] = train_num['Fare'].fillna(fare_median)
# Create and fit scaler for column
scaler = StandardScaler(with_mean=True, with_std=True)
fare_np = np.array(train_num['Fare']).reshape(-1,1)
scaler.fit(fare_np)
# Transform and replace column
train_num['Fare'] = scaler.transform(fare_np)

## Test
fare_median = test_num['Fare'].median()
test_num['Fare'] = test_num['Fare'].fillna(fare_median)
# Create and fit scaler for column
scaler = StandardScaler(with_mean=True, with_std=True)
fare_np = np.array(test_num['Fare']).reshape(-1,1)
scaler.fit(fare_np)
# Transform and replace column
test_num['Fare'] = scaler.transform(fare_np)

Join together dataframes

In [387]:
# print(train['Survived'])
# print(train_num)
# print(test_num)
# print(train_fact_enc)
# print(test_fact_enc)

In [389]:
train = pd.DataFrame(train['Survived']).join(train_fact_enc, how='right').join(train_num, how='right')
print(train)
test = pd.DataFrame(test['Survived']).join(test_fact_enc, how='right').join(test_num, how='right')
print(test)

     Survived  Pclass_1  Pclass_2  Pclass_3  Male  Female  Embark_1  Embark_2  \
0           0       0.0       0.0       1.0   0.0     1.0       0.0       0.0   
1           1       1.0       0.0       0.0   1.0     0.0       1.0       0.0   
2           1       0.0       0.0       1.0   1.0     0.0       0.0       0.0   
3           1       1.0       0.0       0.0   1.0     0.0       0.0       0.0   
4           0       0.0       0.0       1.0   0.0     1.0       0.0       0.0   
5           0       0.0       0.0       1.0   0.0     1.0       0.0       1.0   
6           0       1.0       0.0       0.0   0.0     1.0       0.0       0.0   
7           0       0.0       0.0       1.0   0.0     1.0       0.0       0.0   
8           1       0.0       0.0       1.0   1.0     0.0       0.0       0.0   
9           1       0.0       1.0       0.0   1.0     0.0       1.0       0.0   
10          1       0.0       0.0       1.0   1.0     0.0       0.0       0.0   
11          1       1.0     

In [425]:
train_data = train.drop('Survived', axis=1)
train_target = train['Survived']
# print(train_data)
# print(train_target)

train_x, test_x, cv_x, cv_y = train_test_split(train_data,train_target, test_size=0.4, random_state=0)

from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
clf = svm.SVC(kernel='linear', C=1)
cv = ShuffleSplit(n_splits=25, test_size=0.4, random_state=0)
scores = cross_val_score(clf, train_data, train_target, cv=cv)
print(scores)
clf.predict(test)

[0.77871148 0.77591036 0.79551821 0.77591036 0.767507   0.78151261
 0.80672269 0.767507   0.79551821 0.78431373 0.78711485 0.80112045
 0.77591036 0.79271709 0.80672269 0.78431373 0.767507   0.78431373
 0.79551821 0.80952381 0.78711485 0.76190476 0.80952381 0.80392157
 0.77871148]


NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.