In [1]:
import pandas as pd
from pandas import get_dummies
import lightgbm as lgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
import numpy as np
import os
import gc
import xlearn as xl
import math

In [2]:
class FFMFormat:
    """Convert a pandas DataFrame into libffm-style text rows.

    Each output row is a space-separated list of ``field:feature:value``
    triples.  Columns are treated according to the group they belong to:

    * ``one_hot_feat``  - values are cast to ``int``; every distinct value
      (except ``-1``) becomes its own feature with value ``1``.
    * ``vector_feat``   - values are strings of space-separated tokens; every
      distinct token (except ``-1``) becomes its own feature with value ``1``.
    * ``continus_feat`` - the raw value is emitted against the column's own
      feature index, unless it is ``-1`` or NaN.

    Parameters
    ----------
    vector_feat : list of str
        Names of the multi-valued (space-separated) columns.
    one_hot_feat : list of str
        Names of the integer-coded categorical columns.
    continus_feat : list of str
        Names of the numeric columns.

    Attributes
    ----------
    field_index_ : dict or None
        Column name -> field id, set by ``fit``.
    feature_index_ : dict or None
        ``"<col>_<value>"`` (and bare ``"<col>"``) -> feature id, set by ``fit``.
    """

    def __init__(self,vector_feat,one_hot_feat,continus_feat):
        self.field_index_ = None
        self.feature_index_ = None
        self.vector_feat=vector_feat
        self.one_hot_feat=one_hot_feat
        self.continus_feat=continus_feat

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is accepted but unused)."""
        return {
            'vector_feat': self.vector_feat,
            'one_hot_feat': self.one_hot_feat,
            'continus_feat': self.continus_feat,
        }

    def set_params(self, **parameters):
        """Update constructor arguments in place and return ``self``.

        Raises
        ------
        ValueError
            If a keyword is not one of the constructor argument names.
        """
        known = self.get_params()
        for key, value in parameters.items():
            if key not in known:
                raise ValueError('Invalid parameter {!r} for FFMFormat'.format(key))
            setattr(self, key, value)
        return self

    @staticmethod
    def _to_category(val):
        """Return ``val`` as an int category code, or ``None`` if it cannot be one."""
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            # NaN, inf, or non-numeric text: there is no matching feature.
            return None

    def _register(self, col, vals, missing, last_idx):
        """Assign consecutive feature ids to ``'<col>_<val>'`` names; return the next free id."""
        for val in vals:
            if val == missing:
                continue
            name = '{}_{}'.format(col, val)
            if name not in self.feature_index_:
                self.feature_index_[name] = last_idx
                last_idx += 1
        return last_idx

    def fit(self, df, y=None):
        """Build the field and feature indices from ``df``.

        ``df`` itself is left unmodified.  ``y`` is ignored.  Returns ``self``.

        Raises
        ------
        ValueError
            If a one-hot column holds values that cannot be cast to ``int``.
        """
        self.field_index_ = {col: i for i, col in enumerate(df.columns)}
        self.feature_index_ = dict()
        last_idx = 0
        for col in df.columns:
            if col in self.one_hot_feat:
                print(col)
                # Cast a copy: the caller's frame must not change dtype behind its back.
                vals = np.unique(df[col].astype('int'))
                last_idx = self._register(col, vals, -1, last_idx)
            elif col in self.vector_feat:
                print(col)
                tokens = set()
                for data in df[col].apply(str):
                    if data != "-1":
                        tokens.update(data.strip().split(' '))
                # Sorted so that ids are assigned in a deterministic order.
                last_idx = self._register(col, sorted(tokens), "-1", last_idx)
            # Every column also gets an id of its own; the continuous branch
            # of transform_row_ emits values against it.
            self.feature_index_[col] = last_idx
            last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its encoding (see ``transform``)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row):
        """Encode one row (a Series indexed by column name) as a single text line.

        Values with no entry in ``feature_index_`` are skipped silently.
        """
        ffm = []

        # NOTE(review): entries equal to 0 are dropped for every column kind,
        # so category 0 of a one-hot column is never emitted even though fit()
        # assigns it an id.  Kept as-is to preserve existing output - confirm
        # that 0 is meant to stand for "missing".
        for col, val in row.loc[row != 0].to_dict().items():
            if col in self.one_hot_feat:
                # Normalise to int so that 1, 1.0 and "1" all resolve to the
                # 'col_1' name that fit() registered.
                code = self._to_category(val)
                if code is None or code == 0:
                    continue
                name = '{}_{}'.format(col, code)
                if name in self.feature_index_:
                    ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col in self.vector_feat:
                for word in str(val).split(' '):
                    name = '{}_{}'.format(col, word)
                    if name in self.feature_index_:
                        ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col in self.continus_feat:
                # -1 marks a missing value; NaN would be written as the text 'nan'.
                if val != -1 and not pd.isna(val):
                    ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of ``df``.

        Returns a ``pd.Series`` of text lines with the same index as ``df``,
        one entry per input row (duplicate index labels are kept).
        """
        lines = [self.transform_row_(row) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index)

In [3]:
# Column groups handed to FFMFormat; the group decides how a column is encoded.

# Integer-coded columns: one feature per distinct value.
one_hot_feature = [
    'LBS',
    'age',
    'carrier',
    'consumptionAbility',
    'education',
    'gender',
    'advertiserId',
    'campaignId',
    'creativeId',
    'adCategoryId',
    'productId',
    'productType',
]

# Columns holding space-separated tokens: one feature per distinct token.
vector_feature = [
    'interest1',
    'interest2',
    'interest5',
    'kw1',
    'kw2',
    'topic1',
    'topic2',
    'os',
    'ct',
    'marriageStatus',
]

# Numeric columns: the raw value is written to the output.
continus_feature = ['creativeSize']

In [4]:
# Root data directory and the file names used by the cells below.
path = 'data'

# Input files.
train_name = 'full_merge_train_TF_1V1.csv'
vali_name = 'full_merge_train_TF_1V1.csv'
test_name = 'test.csv'

# Intermediate dump of the encoded rows (train followed by test).
temp_name = 'ffm.csv'

# Final per-split output files.
train_ffm_name = 'train_ffm.csv'
vali_ffm_name = 'vali_ffm.csv'
# Must be defined: the split cell opens this file for the test rows.
test_ffm_name = 'test_ffm.csv'

In [5]:
# Encode train + test together so both share one field/feature index,
# then dump the encoded rows (train rows first, test rows after) to disk.
tr = FFMFormat(vector_feature,one_hot_feature,continus_feature)
train = pd.read_csv(path+'/TF_1V1_train/'+train_name)
# vali = pd.read_csv(path+'/TF_1V1_train/'+vali_name)
test = pd.read_csv(path+'/test/'+test_name)

# Row counts drive the later split: the first len_vali rows become the
# validation set, the rest of the first len_train rows the training set.
len_train = len(train)
len_vali = math.ceil(len_train/10)

data = pd.concat([train,test],ignore_index=True)
label = np.array(data.pop('label'))

data = data[one_hot_feature+vector_feature+continus_feature]

# The raw frames are no longer needed; release them before the encoding pass.
del test
del train
gc.collect()

data_ffm = tr.fit_transform(data)

os.makedirs(path+'/ffm', exist_ok=True)
# header=False: the file is later read line by line and line i is paired with
# label[i]; a header line would shift every row by one.
data_ffm.to_csv(path+'/ffm/'+temp_name,index=False,header=False)

del data_ffm
gc.collect()

LBS
age
carrier
consumptionAbility
education
gender
advertiserId
campaignId
creativeId
adCategoryId
productId
productType
interest1
interest2
interest5
kw1
kw2
topic1
topic2
os
ct
marriageStatus


2220

In [6]:
# Split the encoded dump into validation / train / test files.
# Line i of the dump corresponds to label[i]: the first len_vali lines go to
# the validation file, lines up to len_train to the training file, and the
# remaining (test) lines are written without a label.
# The dump is read from the same 'ffm' sub-directory it was written to, and all
# handles are context-managed so they are closed even if a write fails.
with open(path+'/ffm/'+temp_name) as fin, \
        open(path+'/ffm/'+train_ffm_name,'w') as f_train_out, \
        open(path+'/ffm/'+vali_ffm_name,'w') as f_vali_out, \
        open(path+'/ffm/'+test_ffm_name,'w') as f_test_out:
    for (i,line) in enumerate(fin):
        if i < len_vali:
            f_vali_out.write(str(label[i])+' '+line)
        elif i < len_train:
            f_train_out.write(str(label[i])+' '+line)
        else:
            f_test_out.write(line)
# Report the directory the three files were actually written to.
print('转换成功，数据位置：'+path+'/ffm/')

转换成功，数据位置：data/ffm/
