### 如何使用 FM

因为我们在使用 FM, FFM 的时候，大部分都需要将数据集转换成库所需要的数据格式。下面就介绍怎样用 pandas 的 cuts 来进行 one-hot 编码的转换。

In [3]:
import math
import numpy as np
import pandas as pd

train = pd.read_csv('../data/porto-seguro/train.csv')
test = pd.read_csv('../data/porto-seguro/test.csv')
test.insert(1, 'target', 0)


In [6]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [7]:
test.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,0,1,8,1,0,0,1,0,...,1,1,1,12,0,1,1,0,0,1
1,1,0,4,2,5,1,0,0,0,0,...,2,0,3,10,0,0,1,1,0,1
2,2,0,5,1,3,0,0,0,0,0,...,4,0,2,4,0,0,0,0,0,0
3,3,0,0,1,6,0,0,1,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,0,5,1,7,0,0,0,0,0,...,4,0,0,4,0,1,1,0,0,1


In [8]:
print(train.shape)
print(test.shape)

(595212, 59)
(892816, 59)


In [14]:
x = pd.concat([train, test])
x = x.reset_index(drop=True)
print(x.shape)
x.head()

(1488028, 59)


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [16]:
# 删除不需要的以 'ps_calc_' 开头的数据列
unwanted = x.columns[x.columns.str.startswith('ps_calc_')]
x.drop(unwanted, inplace = True, axis = 1)
x.shape

(1488028, 39)

In [19]:
features = x.columns[2:]
categories = []

# 查看每个维度 field 的可取值个数。
m, n = train.shape
for c in features:
    trainNo = len(x.loc[:m, c].unique())
    testNo = len(x.loc[m:, c].unique())
    print(c, trainNo, testNo)
# for c in features:
#     trainno = len(x.loc[])

ps_ind_01 8 8
ps_ind_02_cat 5 5
ps_ind_03 12 12
ps_ind_04_cat 3 3
ps_ind_05_cat 8 8
ps_ind_06_bin 2 2
ps_ind_07_bin 2 2
ps_ind_08_bin 2 2
ps_ind_09_bin 2 2
ps_ind_10_bin 2 2
ps_ind_11_bin 2 2
ps_ind_12_bin 2 2
ps_ind_13_bin 2 2
ps_ind_14 5 5
ps_ind_15 14 14
ps_ind_16_bin 2 2
ps_ind_17_bin 2 2
ps_ind_18_bin 2 2
ps_reg_01 10 10
ps_reg_02 19 19
ps_reg_03 5013 5046
ps_car_01_cat 13 13
ps_car_02_cat 3 3
ps_car_03_cat 3 3
ps_car_04_cat 10 10
ps_car_05_cat 3 3
ps_car_06_cat 18 18
ps_car_07_cat 3 3
ps_car_08_cat 2 2
ps_car_09_cat 6 6
ps_car_10_cat 3 3
ps_car_11_cat 104 104
ps_car_11 5 5
ps_car_12 184 201
ps_car_13 70482 83769
ps_car_14 850 885
ps_car_15 15 15


### 将属性值 categories （进行离散化）
使用 `pandas.cut` 函数可以将属性值进行离散化。

Bin values into discrete intervals.


In [25]:
x.loc[:, 'ps_reg_03'] = pd.cut(x['ps_reg_03'], 50, labels = False)
x.loc[:,'ps_car_12'] = pd.cut(x['ps_car_12'], 50,labels=False)
x.loc[:,'ps_car_13'] = pd.cut(x['ps_car_13'], 50,labels=False)
x.loc[:,'ps_car_14'] =  pd.cut(x['ps_car_14'], 50,labels=False)
x.loc[:,'ps_car_15'] =  pd.cut(x['ps_car_15'], 50,labels=False)
len(x.loc[:, 'ps_reg_03'].unique())

34

In [29]:
m, _ = train.shape
train = x.loc[:m].copy()
test = x.loc[m:].copy()
print(train.shape)
print(test.shape)

(595215, 39)
(892814, 39)


In [31]:
# 在应用 SGD 的时候，对样本进行下 shuffle 是一个很好的实践。我们用 sample 方法对全部数据进行的那样，并且
train = train.sample(frac = 1).reset_index(drop=True)
train.shape
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
0,76616,0,2,1,7,1,0,0,1,0,...,1,1,0,1,5,2,31,10,42,46
1,935950,0,1,2,2,0,0,1,0,0,...,1,1,2,1,83,2,31,8,41,37
2,357968,0,0,2,1,1,0,1,0,0,...,1,1,0,1,104,2,31,4,42,18
3,1319969,0,0,1,5,0,0,1,0,0,...,1,1,2,1,90,0,30,6,40,42
4,276266,0,0,2,1,0,0,1,0,0,...,1,1,2,1,42,2,30,4,42,29


In [32]:
train.drop('id', inplace=True, axis = 1)
test.drop('id', inplace=True, axis = 1)
train.head()

Unnamed: 0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
0,0,2,1,7,1,0,0,1,0,0,...,1,1,0,1,5,2,31,10,42,46
1,0,1,2,2,0,0,1,0,0,0,...,1,1,2,1,83,2,31,8,41,37
2,0,0,2,1,1,0,1,0,0,0,...,1,1,0,1,104,2,31,4,42,18
3,0,0,1,5,0,0,1,0,0,0,...,1,1,2,1,90,0,30,6,40,42
4,0,0,2,1,0,0,1,0,0,0,...,1,1,2,1,42,2,30,4,42,29


In [33]:
test.head()

Unnamed: 0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
595214,0,5,1,3,0,0,0,0,0,1,...,1,1,2,1,29,3,30,8,42,44
595215,0,0,1,6,0,0,1,0,0,0,...,1,1,2,1,40,2,30,5,42,32
595216,0,5,1,7,0,0,0,0,0,1,...,1,1,2,1,101,3,30,7,42,44
595217,0,0,1,6,0,0,1,0,0,0,...,0,0,2,1,11,2,29,6,41,48
595218,0,0,1,3,0,0,0,1,0,0,...,1,1,0,1,10,2,29,6,42,44


这样我们的数据都已经离散化了。如果你想用数值型的数据，别忘记先进行缩放。

In [34]:
categories = train.columns[1:]
numerics = []

In [38]:
currentcode = len(numerics)
catdict = {}
catcodes = {}
for x in numerics:
    catdict[x] = 0
for x in categories:
    catdict[x] = 1

# 训练数据处理
noOfRows = train.shape[0]
noOfColumns = len(features)
noOfRows = 10000

with open('alltrianffm.txt', 'w') as text_file:
    for n, r in enumerate(range(noOfRows)):
        if ((n % 100000) == 0):
            print('Row ', n)
        datastring = ''
        datarow = train.iloc[r].to_dict() # 对每一行的数据转成以 field为key 的对象。
        datastring += str(int(datarow['target']))
        
        for i, x in enumerate(catdict.keys()):
            if (catdict[x] == 0): # numerics 数值型数据
                datastring = datastring + " " + str(i) + ":" + str(i) + ":" + str(datarow[x])
            else: # 离散后的离散型数据
                if (x not in catcodes): # field 还没有出现
                    catcodes[x] = {}
                    currentcode += 1
                    catcodes[x][datarow[x]] = currentcode # 该 field 的对应值的对应 code
                elif(datarow[x] not in catcodes[x]): # 如果 field 中对应值没有对应 code
                    currentcode += 1
                    catcodes[x][datarow[x]] = currentcode
                
                code = catcodes[x][datarow[x]]
                datastring = datastring + " " + str(i) + ":" + str(int(code)) + ":1"
            datastring += '\n'
            text_file.write(datastring)
                    

Row  0


In [39]:
noOfRows = test.shape[0]
noOfColumns = len(features)
noOfRows = 5000

with open('alltestffm.txt', 'w') as text_file:
    for n, r in enumerate(range(noOfRows)):
        if (n % 100000 == 0):
            print('Row ', n)
        
        datastring = ''
        datarow = test.iloc[r].to_dict()
        datastring += str(int(datarow['target']))
        
        for i, x in enumerate(catdict.keys()):
            if (catdict[x] == 0):
                datastring = datastring + " " + str(i) + ":" + str(i) + ":" + str(datarow[x])
            else:
                if (x not in catcodes):
                    catcodes[x] = {}
                    currentcode += 1
                    catcodes[x][datarow[x]] = currentcode
                elif (datarow[x] not in catcodes[x]):
                    currentcode += 1
                    catcodes[x][datarow[x]] = currentcode
                
                code = catcodes[x][datarow[x]]
                datastring = datastring + " " + str(i) + ":" + str(int(code)) + ":1"
            datastring += '\n'
            text_file.write(datastring)
               

Row  0
