In [1]:
%load_ext watermark
%watermark -d -u -a 'Anubhav Dhiman' -v -p numpy,scipy,matplotlib,sklearn

Anubhav Dhiman 
last updated: 2018-01-24 

CPython 3.5.4
IPython 6.1.0

numpy 1.13.1
scipy 0.19.1
matplotlib 2.1.0
sklearn 0.19.1


In [20]:
import numpy as np

One work-around for parsing nominal features into a format that prevents the classification algorithm from asserting an order is the so-called one-hot encoding representation.

Using the `DictVectorizer` to encode categorical features

In [2]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

In [3]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [4]:
vec.fit_transform(measurements).toarray()

array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])

In [5]:
vec.get_feature_names()

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

**Derived features** may be based in feature extraction and dimensionality reduction (such as PCA or manifold learning), may be linear or nonlinear combinations of features (such as in polynomial regression), or may be some more sophisticated transform of the features.

## Combining Numerical and Categorical Features

In [46]:
import os
import pandas as pd

titanic = pd.read_csv('../datasets/titanic3.csv', sep=';')
print(titanic.columns)

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [47]:
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,Allen. Miss. Elisabeth Walton,female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,St Louis. MO
1,1.0,1.0,Allison. Master. Hudson Trevor,male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,Montreal. PQ / Chesterville. ON
2,1.0,0.0,Allison. Miss. Helen Loraine,female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,Montreal. PQ / Chesterville. ON
3,1.0,0.0,Allison. Mr. Hudson Joshua Creighton,male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,Montreal. PQ / Chesterville. ON
4,1.0,0.0,Allison. Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,Montreal. PQ / Chesterville. ON


In [48]:
labels = titanic.survived.values
features = titanic[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
features.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1.0,female,29.0,0.0,0.0,211.3375,S
1,1.0,male,0.9167,1.0,2.0,151.55,S
2,1.0,female,2.0,1.0,2.0,151.55,S
3,1.0,male,30.0,1.0,2.0,151.55,S
4,1.0,female,25.0,1.0,2.0,151.55,S


In [49]:
pd.get_dummies(features).head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1.0,29.0,0.0,0.0,211.3375,1,0,0,0,1
1,1.0,0.9167,1.0,2.0,151.55,0,1,0,0,1
2,1.0,2.0,1.0,2.0,151.55,1,0,0,0,1
3,1.0,30.0,1.0,2.0,151.55,0,1,0,0,1
4,1.0,25.0,1.0,2.0,151.55,1,0,0,0,1


In [50]:
features_dummies = pd.get_dummies(features, columns=['pclass', 'sex', 'embarked'])
features_dummies.head(n=16)

Unnamed: 0,age,sibsp,parch,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,29.0,0.0,0.0,211.3375,1,0,0,1,0,0,0,1
1,0.9167,1.0,2.0,151.55,1,0,0,0,1,0,0,1
2,2.0,1.0,2.0,151.55,1,0,0,1,0,0,0,1
3,30.0,1.0,2.0,151.55,1,0,0,0,1,0,0,1
4,25.0,1.0,2.0,151.55,1,0,0,1,0,0,0,1
5,48.0,0.0,0.0,26.55,1,0,0,0,1,0,0,1
6,63.0,1.0,0.0,77.9583,1,0,0,1,0,0,0,1
7,39.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1
8,53.0,2.0,0.0,51.4792,1,0,0,1,0,0,0,1
9,71.0,0.0,0.0,49.5042,1,0,0,0,1,1,0,0


In [51]:
features_dummies.dtypes

age           float64
sibsp         float64
parch         float64
fare          float64
pclass_1.0      uint8
pclass_2.0      uint8
pclass_3.0      uint8
sex_female      uint8
sex_male        uint8
embarked_C      uint8
embarked_Q      uint8
embarked_S      uint8
dtype: object

In [52]:
data = features_dummies.values

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

train_data, test_data, train_labels, test_labels = train_test_split(data, labels, random_state=0)

imp = Imputer()
imp.fit(train_data)
train_data_finite = imp.transform(train_data)
test_data_finite = imp.transform(test_data)

In [56]:
from sklearn.dummy import DummyClassifier

clf = DummyClassifier('most_frequent')
clf.fit(train_data_finite, train_labels)
print('Prediction accuracy: %f' % clf.score(test_data_finite, test_labels))

ValueError: Classification metrics can't handle a mix of continuous and binary targets