In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_iris
sns.set()

%matplotlib inline

In [4]:
# Load the iris dataset bundled with scikit-learn
iris = load_iris()
# iris.data is 2-D; unpack its shape into (number of rows, number of columns)
n_samples, m_features = iris.data.shape

In [12]:
# Preview the first five rows of the feature matrix
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [14]:
# Integer class code stored for every sample
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [15]:
# Human-readable names of the target classes
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [18]:
# Load data: feature matrix and integer class codes
X = iris.data
y = iris.target

# zip() yields tuples whose i-th entry pairs the i-th items of each argument
# and stops as soon as the shortest argument runs out.
# dict + zip: pair every integer code with its class name, then build a lookup.
target_codes = np.arange(len(iris.target_names))
D_target_dummy = dict(zip(target_codes, iris.target_names))

In [17]:
# Author's note: this works because we know in advance that the iris y labels
# are ordered (all setosa first, then all versicolor, ...), which is what allows
# shape -> arange to be used to attach the names.
# A more principled approach (pseudo-code) would be:
#   iris['target'] = iris[iris.target=='0']=setosa
# which is more flexible and practical.
a = zip(np.arange(iris.target_names.shape[0]), iris.target_names)
# https://docs.python.org/3.3/library/functions.html
# zip() returns an iterator, so wrap it in list() to display the pairs
list(a)

[(0, 'setosa'), (1, 'versicolor'), (2, 'virginica')]

In [19]:
# Inspect the code -> class-name lookup table
D_target_dummy

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

In [20]:
# Feature table: one column per iris feature name
DF_data = pd.DataFrame(data=X, columns=iris.feature_names)
# Translate the integer class codes into their names and attach them as "target"
target_labels = pd.Series(y).map(D_target_dummy)
DF_data["target"] = target_labels

In [21]:
# Preview the first rows: four feature columns plus the string "target" column
DF_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## get_dummies

When given a DataFrame, pandas.get_dummies **only converts string (object/category) columns** into a one-hot representation unless `columns` is specified; a single Series, as used below, is always encoded.

In [28]:
# One indicator column per distinct value of "target", named "target_<value>".
# dtype=int pins the 0/1 encoding: pandas >= 2.0 returns booleans by default,
# which would no longer match the 0/1 matrices produced by the other encoders.
DF_dummies = pd.get_dummies(DF_data["target"], prefix="target", dtype=int)
DF_dummies.head()

Unnamed: 0,target_0,target_1,target_2
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


## OneHotEncoder

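- In scikit-learn versions before 0.20, OneHotEncoder cannot process string values directly. If your nominal features are strings, then you need to first map them into **integers** (e.g. with LabelEncoder). Newer versions accept string categories directly.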

In [39]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
def f1(DF_data):
    """One-hot encode the ``"target"`` column with scikit-learn encoders.

    The labels are first mapped to integer codes with ``LabelEncoder`` and the
    codes are then expanded into indicator columns with ``OneHotEncoder``.

    Parameters
    ----------
    DF_data : pandas.DataFrame
        Frame containing a ``"target"`` column of class labels. The frame is
        only read; no helper column is written back into it.

    Returns
    -------
    pandas.DataFrame
        One indicator column per class, labelled with ``LabelEncoder.classes_``
        and carrying a fresh default index.
    """
    # Labels and data get separate encoders
    Enc_ohe, Enc_label = OneHotEncoder(), LabelEncoder()
    # Target labels -> integer codes. Kept in a local array instead of being
    # stored as a "Dummies" column, so the caller's frame is not modified.
    codes = Enc_label.fit_transform(DF_data["target"])
    # OneHotEncoder expects 2-D input, hence the single-column reshape
    encoded = Enc_ohe.fit_transform(codes.reshape(-1, 1))
    # The encoder returns a sparse matrix; toarray() gives a plain ndarray
    # (todense() would give the legacy np.matrix type)
    DF_dummies2 = pd.DataFrame(encoded.toarray(), columns=Enc_label.classes_)
    # Return the one-hot encoded labels
    return DF_dummies2

In [41]:
# Preview the first rows of the one-hot frame returned by f1
f1(DF_data).head()

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0


## CountVectorizer

- expected **string or bytes-like object**
- 在图书那里，**数字**也可以给转成one-hot的
- 返回matrix

In [44]:
# Rebuild DF_data from the raw arrays so this section starts from a clean frame
DF_data = pd.DataFrame(data=X, columns=iris.feature_names)
# Translate the integer class codes into their names and attach them as "target"
target_labels = pd.Series(y).map(D_target_dummy)
DF_data["target"] = target_labels

In [50]:
# Preview the first rows: four feature columns plus the string "target" column
DF_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [48]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw class-name strings that will serve as the "documents"
species_text = DF_data['target']
# Default tokenizer; lowercase=False turns off the lowercasing step
counter = CountVectorizer(lowercase=False)
# Input must be text, otherwise: expected string or bytes-like object
counter.fit(species_text)
# CountVectorizer produces a sparse matrix; todense() converts it to dense form
# so the actual values can be inspected
dense_counts = counter.transform(species_text).todense()
result = pd.DataFrame(dense_counts)
result.head()

Unnamed: 0,0,1,2
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


# Conclusion

In [None]:
- OneHotEncoder（int） & CountVector（string/number都可以）很像，需要fit，transform，然后返回matrix。