In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide settings: `title` is a label (not referenced in the visible
# cells); `path` is the directory prefix used when building data file paths.
title, path = 'woplus', '../../sources/data'

In [43]:
# Read the pipe-delimited file and reshape it to one row per `imsi` with one
# column per `mon` value, each cell holding that row's `nettype`.
df_net = pd.read_csv('{}/dataset_2015_filter_net.txt'.format(path), sep='|')
df_net_piv = df_net.pivot(index='imsi', columns='mon', values='nettype')
# Rename the pivoted columns positionally to net01 .. net12; the assignment
# raises unless the pivot produced exactly 12 columns.
df_net_piv.columns = ['net{:02d}'.format(month) for month in range(1, 13)]
# Work with a single column (net09) as the running example below.
df_test1 = df_net_piv['net09']
df_test1.head()

imsi
00002ec9f6870677ca801c213ce87c02    3G
0000be0fbdbe7f8ff47f7ec3e077e489    2G
000166f550781551e1eb11cfcd3f4267    2G
0001a48e7fca2e7e3279a5f275525cdb    2G
0001b4ebaa72f0e4c2e79a0791bf3002    2G
Name: net09, dtype: object

## 类别特征作独热编码

### 1. pd.get_dummies

In [52]:
# One-hot encode the example column directly with pandas; the resulting
# columns are named net_<category>.
(
    pd.get_dummies(df_test1, prefix='net')
    .head()
)

Unnamed: 0_level_0,net_2G,net_3G
imsi,Unnamed: 1_level_1,Unnamed: 2_level_1
00002ec9f6870677ca801c213ce87c02,0,1
0000be0fbdbe7f8ff47f7ec3e077e489,1,0
000166f550781551e1eb11cfcd3f4267,1,0
0001a48e7fca2e7e3279a5f275525cdb,1,0
0001b4ebaa72f0e4c2e79a0791bf3002,1,0


### 2. sklearn.preprocessing.OneHotEncoder

In [44]:
# Turn the categorical feature into numeric codes using the pandas category API.
# Share of all rows taken by each category (value_counts skips missing values,
# while the denominator counts every row).
tmp = df_test1.value_counts() / df_test1.shape[0]
print(tmp)
# Keep every category that actually occurs, in the order value_counts returned them.
cat = list(tmp[tmp > 0].index)
print(cat)
# Restrict the categories to `cat`, relabel them as 1..len(cat), and store the codes as floats.
test1 = df_test1.astype('category').cat.set_categories(cat).cat.rename_categories(1+np.arange(len(cat))).astype('float')
test1.head()

2G    0.623059
3G    0.376941
Name: net09, dtype: float64
['2G', '3G']


imsi
00002ec9f6870677ca801c213ce87c02    2.0
0000be0fbdbe7f8ff47f7ec3e077e489    1.0
000166f550781551e1eb11cfcd3f4267    1.0
0001a48e7fca2e7e3279a5f275525cdb    1.0
0001b4ebaa72f0e4c2e79a0791bf3002    1.0
dtype: float64

In [45]:
# Transposing a 1-D array leaves it unchanged, so the result is still a flat vector.
test1.head().values.T

array([ 2.,  1.,  1.,  1.,  1.])

In [46]:
# Series -> 2-D array-like X: wrap the 1-D values in a list to get a single
# row, then transpose that row into a single column.
np.asarray([test1.head().values]).T

array([[ 2.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.]])

In [47]:
# One-hot encoding.
# Note: the input to fit_transform must be array-like, hence the column-vector
# construction with np.transpose([...]) below.
# OneHotEncoder is already imported in the notebook's first cell, so it is not
# re-imported here.
# NOTE(review): `n_values` and `sparse` are legacy OneHotEncoder arguments;
# recent scikit-learn releases use `categories` / `sparse_output` instead --
# confirm against the installed version before re-running.
test1_ohe = OneHotEncoder(n_values=1+len(cat), sparse=False)
# The codes in test1 run 1..len(cat), so output column 0 (code 0) carries no
# category here and is sliced off.
xbin = test1_ohe.fit_transform(np.transpose([test1.values]))[:,1:]
xbin

array([[ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 0.,  1.],
       [ 0.,  1.],
       [ 1.,  0.]])

In [53]:
# Label the one-hot matrix: each column is '<prefix>_<category>', where the
# prefix is the series name with its last two characters removed; rows keep
# the original index.
test1_dabin = pd.DataFrame(
    xbin,
    index=df_test1.index,
    columns=['{}_{}'.format(df_test1.name[:-2], x) for x in cat],
)
test1_dabin.head()

Unnamed: 0_level_0,net_2G,net_3G
imsi,Unnamed: 1_level_1,Unnamed: 2_level_1
00002ec9f6870677ca801c213ce87c02,0.0,1.0
0000be0fbdbe7f8ff47f7ec3e077e489,1.0,0.0
000166f550781551e1eb11cfcd3f4267,1.0,0.0
0001a48e7fca2e7e3279a5f275525cdb,1.0,0.0
0001b4ebaa72f0e4c2e79a0791bf3002,1.0,0.0


In [None]:
# Lookup of column-name lists; 'catbin' lists the twelve net01..net12 columns
# that are later passed through Cat_to_bin.
ictype = {
    'catbin': [
        'net01', 'net02', 'net03', 'net04', 'net05', 'net06',
        'net07', 'net08', 'net09', 'net10', 'net11', 'net12',
    ],
}

In [9]:
def Cat_to_bin(das, a = 0.01):
    '''Transform a categorical column into one-hot (0/1) indicator columns.

    Parameters
    ----------
    das : pandas.Series
        The categorical column. Its ``name`` is used to build the output
        column names and its index is kept on the result.
    a : float, default 0.01
        Frequency threshold. A category gets its own indicator column only
        if its share of all rows (missing values included in the row count)
        is strictly greater than ``a``.

    Returns
    -------
    pandas.DataFrame
        Float indicator columns named ``"<name>_<category>"``, ordered as
        returned by ``value_counts``. If the categories at or below the
        threshold together account for a share greater than ``a``, a
        ``"<name>_Others"`` column marks the non-null rows not covered by
        any kept category. If the result would have exactly two columns it
        is collapsed to a single column named ``das.name`` that holds the
        indicator of the first kept category.

    Notes
    -----
    Rows with a missing value are 0 in every indicator column, so after the
    two-column collapse they cannot be told apart from the second category.

    The indicators are built with plain numpy/pandas comparisons rather than
    ``sklearn.preprocessing.OneHotEncoder``, so the function does not depend
    on the encoder's ``n_values`` / ``sparse`` arguments.
    '''
    n_rows = das.shape[0]
    # Share of all rows per category; value_counts itself skips missing values.
    tmp = das.value_counts() / float(n_rows)
    cat = list(tmp.index[(tmp > a).values])

    # One float indicator column per kept category.
    xbin = np.zeros((n_rows, len(cat)), dtype=float)
    for j, level in enumerate(cat):
        xbin[:, j] = das.isin([level]).values

    dabin = pd.DataFrame(xbin, columns = ["{}_{}".format(das.name, x) for x in cat], index = das.index)

    # Pool the rare categories into one column when together they are frequent enough.
    if tmp[tmp <= a].sum() > a:
        others = das.notnull().values.astype(float) - xbin.sum(axis = 1)
        dabin = pd.concat(
            [dabin, pd.DataFrame({"{}_Others".format(das.name): others}, index = das.index)],
            axis = 1)

    # Two complementary columns are redundant: keep only the first indicator.
    if dabin.shape[1] == 2:
        dabin = pd.DataFrame({das.name: xbin[:, 0]}, index = das.index)
    return dabin

In [11]:
# Encode the first column listed in ictype['catbin'] and preview the result.
Cat_to_bin(df_net_piv[ictype['catbin'][0]]).head()

Unnamed: 0_level_0,mon01
imsi,Unnamed: 1_level_1
00002ec9f6870677ca801c213ce87c02,0
0000be0fbdbe7f8ff47f7ec3e077e489,1
000166f550781551e1eb11cfcd3f4267,1
0001a48e7fca2e7e3279a5f275525cdb,1
0001b4ebaa72f0e4c2e79a0791bf3002,1


In [13]:
# Encode every column listed in ictype['catbin'] and join the encoded frames
# side by side on the shared index.
dacatbin = pd.concat(
    [Cat_to_bin(df_net_piv.loc[:, col], a = 0.01) for col in ictype['catbin']],
    axis = 1,
)
dacatbin.head()

Unnamed: 0_level_0,mon01,mon02,mon03,mon04,mon05,mon06,mon07,mon08,mon09,mon10,mon11,mon12
imsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00002ec9f6870677ca801c213ce87c02,0,0,0,0,0,0,0,0,0,0,0,0
0000be0fbdbe7f8ff47f7ec3e077e489,1,1,1,1,1,1,1,1,1,1,1,1
000166f550781551e1eb11cfcd3f4267,1,1,1,1,1,1,1,1,1,1,1,1
0001a48e7fca2e7e3279a5f275525cdb,1,1,1,1,1,1,1,1,1,1,1,1
0001b4ebaa72f0e4c2e79a0791bf3002,1,1,1,1,1,1,1,1,1,1,1,1
