In [1]:
import numpy as np
import pandas as pd
import os
from scipy import sparse
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
#import xgboost as xgb
import lightgbm as lgb

In [2]:
ad_feature=pd.read_csv('../data/adFeature.csv')
user_feature=pd.read_csv('../data/userFeature/userFeature_2.csv')#29
train=pd.read_csv('../data/train/train_1.csv')
predict=pd.read_csv('../data/test1/test1_1.csv')

In [3]:
#ad_feature=pd.read_csv('../data/adFeature.csv')
#user_feature = pd.read_csv('../data/userFeature.csv')
#train = pd.read_csv('../data/train.csv')
#predict = pd.read_csv('../data/test1.csv')

In [4]:
#拼接数据（左联接），分one-hot特征、向量特征和原始特征
train.loc[train['label']==-1,'label']=0
predict['label']=-1
data=pd.concat([train,predict])
data=pd.merge(data,ad_feature,on='aid',how='left')
data=pd.merge(data,user_feature,on='uid',how='left')

data['ct'].fillna('22', inplace = True)
data['os'].fillna('1', inplace = True)
data['carrier'].fillna('1', inplace = True)
data['house'].fillna('0', inplace = True)
data=data.fillna('-1')

one_hot_feature=['LBS','age','carrier','consumptionAbility','education','gender','house','os','ct','marriageStatus']
vector_feature=['interest1','interest2','interest5','kw1','kw2','topic1','topic2']
raw_feature=['advertiserId','campaignId', 'creativeId','creativeSize','adCategoryId', 'productId', 'productType']

#LabelEncoder将各种标签分配一个可数的连续编号
for feature in one_hot_feature:
    try:
        data[feature] = LabelEncoder().fit_transform(data[feature].apply(int))
    except:
        data[feature] = LabelEncoder().fit_transform(data[feature])  

In [5]:
#统计特征
for feat in vector_feature:
    data['len_'+feat] = data[feat].apply(lambda x:0 if ((not x.strip()) or str(x)=='-1') else len(x.split(' ')))

data['meanlen_interest'] = (data['len_interest1']+data['len_interest2']+data['len_interest5'])/3.0
data['meanlen_kw'] = (data['len_kw1']+data['len_kw2'])/2.0
data['meanlen_topic'] = (data['len_topic1']+data['len_topic2'])/2.0 
data['max_ct'] = data['ct'].apply(lambda x: 0 if ((not str(x).strip()) or str(x)=='-1') else max(int(i) for i in str(x).split(' ')))    
data['max_os'] = data['os'].apply(lambda x: 0 if ((not str(x).strip()) or str(x)=='-1') else max(int(i) for i in str(x).split(' ')))
data['max_marriageStatus'] = data['marriageStatus'].apply(lambda x: 0 if ((not str(x).strip()) or str(x)=='-1') else max(int(i) for i in str(x).split(' ')))
                         
train = data[data.label != -1]
data_clicked = train[train['label'] == 1]

#1# 增加广告点击率特征
num_ad = train['aid'].value_counts().sort_index()
num_ad_clicked = data_clicked['aid'].value_counts().sort_index()

ratio_ad_clicked = num_ad_clicked / num_ad

ratio_ad_clicked = pd.DataFrame({
    'aid': ratio_ad_clicked.index,
    'ratio_ad_clicked' : ratio_ad_clicked.values
})

data = pd.merge(data, ratio_ad_clicked, on=['aid'], how='left')
print('加入广告点击率特征,OK!')
#print(ratio_clicked)

#2# 增加每个广告推送给不同的用户数
num_ad_push2user = train.groupby('aid').uid.nunique()
num_ad_push2user = pd.DataFrame({
    'aid': num_ad_push2user.index,
    'num_ad_push2user' : num_ad_push2user.values
})

data = pd.merge(data, num_ad_push2user, on=['aid'], how='left')
print('加入每个广告推送给不同用户数量特征,OK!')
#print(num_advertise_touser)


# 加入学历所对应转化率
num_education = train['education'].value_counts().sort_index()
num_education_clicked = data_clicked['education'].value_counts().sort_index()
ration_num_education = num_education_clicked / num_education
ration_num_education = pd.DataFrame({
    'education': ration_num_education.index,
    'ration_num_education' : ration_num_education.values
})

data = pd.merge(data, ration_num_education, on=['education'], how='left')
print('加入学历所对应转化率特征,OK!')
#print(ration_num_education)

# 加入用户所在LBS的历史点击率
num_lbs = train.groupby('LBS').uid.nunique()
num_lbs_clicked = data_clicked.groupby('LBS').uid.nunique()
ratio_num_lbs = num_lbs_clicked  / num_lbs 
ratio_num_lbs = ratio_num_lbs.fillna(0)
ratio_num_lbs = pd.DataFrame({
    'LBS': ratio_num_lbs.index,
    'ration_num_LBS' : ratio_num_lbs.values
})

data = pd.merge(data, ratio_num_lbs, on=['LBS'], how='left')
print('加入用户所在LBS的历史点击率特征,OK!')
#print(ratio_num_lbs)

#3#加入用户兴趣总数
vec = data[['interest1', 'interest2', 'interest5', 'kw1', 'kw2', 'topic1', 'topic2']]
num_vec = []
train_array = np.array(vec)

for i in range(train_array.shape[0]):
    num = 0
    inter = train_array[i]
    for j in inter:
        inter_lis = j.split(' ')
        if inter_lis[0] == '-1':
            continue
        num += len(inter_lis)
    num_vec.append(num)
    
num_all_vec  = pd.DataFrame(num_vec, columns=['num_all_vec'])

data =  pd.concat([data, num_all_vec], axis=1)
print('加入用户兴趣总数,OK!')   

print('count prepared!')

加入广告点击率特征,OK!
加入每个广告推送给不同用户数量特征,OK!
加入学历所对应转化率特征,OK!
加入用户所在LBS的历史点击率特征,OK!
加入用户兴趣总数,OK!
count prepared!


In [6]:
#组合特征
user_feat = ['LBS','age','consumptionAbility','education','house']
ad_feat = ['adCategoryId','productType']
combine_feat = []

Dict = dict(data['LBS'].value_counts())
data['LBS'] = data['LBS'].apply(lambda x:-1 if Dict[x]<100 else int(float(x)))
for uf in user_feat:
    for af in ad_feat:
        data[uf] = data[uf].apply(lambda x:-1 if (not str(x).strip() or str(x)=='-1') else int(float(x)))
        print(uf+' feature is combining with '+af)
        data[af+'_'+uf] = data[af].apply(int)*data[uf].apply(int)
        combine_feat += [af+'_'+uf]

new_feat = ['has_appact','has_appins','meanlen_topic','meanlen_kw','meanlen_interest','max_ct','max_os'] + \
           [('len_'+i) for i in vector_feature]

LBS feature is combining with adCategoryId
LBS feature is combining with productType
age feature is combining with adCategoryId
age feature is combining with productType
consumptionAbility feature is combining with adCategoryId
consumptionAbility feature is combining with productType
education feature is combining with adCategoryId
education feature is combining with productType
house feature is combining with adCategoryId
house feature is combining with productType


In [7]:
#概率特征
pro_feat = ['advertiserId','campaignId','creativeId','creativeSize', \
            'adCategoryId','productId','productType','age','education', \
            'LBS','aid']
new_pro_feat = [('pro_'+i) for i in pro_feat]
for feat in pro_feat:
    Dict = dict(data[feat].value_counts())
    data[feat] = data[feat].apply(lambda x:-1 if Dict[x]<100 else int(float(x)))
    Dict = dict(data[feat].value_counts())
    data['pro_'+feat] = data[feat].apply(lambda x: 100.0*(Dict[x]+1)/(data.shape[0]+Dict[x]))
print('probability prepared!')

probability prepared!


In [8]:
for feature in (one_hot_feature+combine_feat):
    print(feature+' is transformed to non-interger num...')
    max_ = data[feature].max()
    data[feature] = data[feature].apply(lambda x:int((x-max_)*(-1)))

LBS is transformed to non-interger num...
age is transformed to non-interger num...
carrier is transformed to non-interger num...
consumptionAbility is transformed to non-interger num...
education is transformed to non-interger num...
gender is transformed to non-interger num...
house is transformed to non-interger num...
os is transformed to non-interger num...
ct is transformed to non-interger num...
marriageStatus is transformed to non-interger num...
adCategoryId_LBS is transformed to non-interger num...
productType_LBS is transformed to non-interger num...
adCategoryId_age is transformed to non-interger num...
productType_age is transformed to non-interger num...
adCategoryId_consumptionAbility is transformed to non-interger num...
productType_consumptionAbility is transformed to non-interger num...
adCategoryId_education is transformed to non-interger num...
productType_education is transformed to non-interger num...
adCategoryId_house is transformed to non-interger num...
produc

In [9]:
data.head()

Unnamed: 0,aid,uid,label,advertiserId,campaignId,creativeId,creativeSize,adCategoryId,productId,productType,...,pro_campaignId,pro_creativeId,pro_creativeSize,pro_adCategoryId,pro_productId,pro_productType,pro_age,pro_education,pro_LBS,pro_aid
0,699,78508957,0,1082,295940,731679,59,13,0,6,...,1.169172,0.517806,16.380478,4.983847,34.432735,11.795798,49.104297,49.104297,49.104297,0.517806
1,1991,3637295,0,702,42104,1441131,53,10,4669,11,...,0.152019,0.152019,8.931352,11.422732,2.589898,22.406529,49.104297,49.104297,49.104297,0.152019
2,1119,19229018,0,3993,63752,798752,59,10,19256,11,...,4.376233,4.259734,16.380478,11.422732,4.376233,22.406529,49.104297,49.104297,49.104297,4.259734
3,2013,79277120,0,6937,186348,1427984,35,89,3791,9,...,1.716525,0.857338,17.7451,1.835196,1.716525,15.780996,49.104297,49.104297,49.104297,0.857338
4,692,41528441,0,6946,296367,455396,59,24,3794,11,...,7.935306,5.903622,16.380478,10.631125,7.935306,22.406529,49.104297,49.104297,49.104297,5.903622


In [10]:
# 分离测试集
data = data.fillna(0)
train = data[data.label != -1]
test = data[data.label == -1]
res = test[['aid','uid']]
test = test.drop('label', axis=1)
train_y = train.pop('label')

In [11]:
# 将上面新加入的特征进行归一化

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train[['ratio_ad_clicked', 'num_ad_push2user','ration_num_education', 'ration_num_LBS', 'num_all_vec']].values)
scaler.fit(test[['ratio_ad_clicked', 'num_ad_push2user','ration_num_education', 'ration_num_LBS', 'num_all_vec']].values)

train_x = scaler.transform(train[['ratio_ad_clicked', 'num_ad_push2user','ration_num_education', 'ration_num_LBS', 'num_all_vec']].values)
test_x = scaler.transform(test[['ratio_ad_clicked', 'num_ad_push2user','ration_num_education', 'ration_num_LBS', 'num_all_vec']].values)

#加上原始特征
train_x = np.hstack((train_x, train[original_feature]))
test_x = np.hstack((test_x, test[original_feature]))

In [12]:
#add_feature = ['ratio_ad_clicked', 'num_ad_push2user', 'num_all_vec']
#train_x = np.hstack((train[add_feature], train[original_feature]))
#test_x = np.hstack((test[add_feature], test[original_feature]))

In [13]:
# 特征进行onehot处理
oc_encoder = OneHotEncoder()
for feature in one_hot_feature:
    oc_encoder.fit(data[feature].values.reshape(-1, 1))
    train_a=oc_encoder.transform(train[feature].values.reshape(-1, 1))
    test_a = oc_encoder.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

one-hot prepared !


In [14]:
# 处理count特征向量
ct_encoder = CountVectorizer(ngram_range=(1, 2),token_pattern='(?u)\\b\\w+\\b')
for feature in vector_feature:
    ct_encoder.fit(data[feature])
    train_a = ct_encoder.transform(train[feature])
    test_a = ct_encoder.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('cv prepared !')
# print('ths shape of train data:', test_x.shape)

cv prepared !


In [15]:
tfidf = TfidfVectorizer()#max_feature=200
for feature in vector_feature:
    print(feature+' tfidf features are being transformed...')
    tfidf.fit(data[feature])
    train_a = tfidf.transform(train[feature])
    test_a = tfidf.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('tfidf prepared !')

interest1 tfidf features are being transformed...
interest2 tfidf features are being transformed...
interest5 tfidf features are being transformed...
kw1 tfidf features are being transformed...
kw2 tfidf features are being transformed...
topic1 tfidf features are being transformed...
topic2 tfidf features are being transformed...
tfidf prepared !


In [16]:
def LGB_predict(train_x, train_y, test_x, res):
    print("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=500, objective='binary',
        subsample=0.9, colsample_bytree=0.8, subsample_freq=1,
        learning_rate=0.1, min_child_weight=50, random_state=2018, n_jobs=-1
    )
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc',early_stopping_rounds=100)
    res['score'] = clf.predict_proba(test_x)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv('../data/submission.csv', index=False)
    return clf

model = LGB_predict(train_x,train_y,test_x,res)
#joblib.dump(model, './data/model_fea_add1.model')|

LGB test
[1]	valid_0's auc: 0.532033
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.533643
[3]	valid_0's auc: 0.534553
[4]	valid_0's auc: 0.535443
[5]	valid_0's auc: 0.536215
[6]	valid_0's auc: 0.536474
[7]	valid_0's auc: 0.538024
[8]	valid_0's auc: 0.538546
[9]	valid_0's auc: 0.538913
[10]	valid_0's auc: 0.538806
[11]	valid_0's auc: 0.539189
[12]	valid_0's auc: 0.539422
[13]	valid_0's auc: 0.539574
[14]	valid_0's auc: 0.53975
[15]	valid_0's auc: 0.540019
[16]	valid_0's auc: 0.540148
[17]	valid_0's auc: 0.540897
[18]	valid_0's auc: 0.540789
[19]	valid_0's auc: 0.54121
[20]	valid_0's auc: 0.541214
[21]	valid_0's auc: 0.541446
[22]	valid_0's auc: 0.541427
[23]	valid_0's auc: 0.541588
[24]	valid_0's auc: 0.541676
[25]	valid_0's auc: 0.541595
[26]	valid_0's auc: 0.5417
[27]	valid_0's auc: 0.541667
[28]	valid_0's auc: 0.541701
[29]	valid_0's auc: 0.541694
[30]	valid_0's auc: 0.541777
[31]	valid_0's auc: 0.541804
[32]	valid_0's auc: 0.541885
[33]	valid_0'

[278]	valid_0's auc: 0.547809
[279]	valid_0's auc: 0.547835
[280]	valid_0's auc: 0.547842
[281]	valid_0's auc: 0.547832
[282]	valid_0's auc: 0.547826
[283]	valid_0's auc: 0.547851
[284]	valid_0's auc: 0.547841
[285]	valid_0's auc: 0.547881
[286]	valid_0's auc: 0.547926
[287]	valid_0's auc: 0.547931
[288]	valid_0's auc: 0.547939
[289]	valid_0's auc: 0.54798
[290]	valid_0's auc: 0.547977
[291]	valid_0's auc: 0.547986
[292]	valid_0's auc: 0.547976
[293]	valid_0's auc: 0.547992
[294]	valid_0's auc: 0.548034
[295]	valid_0's auc: 0.548068
[296]	valid_0's auc: 0.548084
[297]	valid_0's auc: 0.54815
[298]	valid_0's auc: 0.548152
[299]	valid_0's auc: 0.54816
[300]	valid_0's auc: 0.548214
[301]	valid_0's auc: 0.548235
[302]	valid_0's auc: 0.54825
[303]	valid_0's auc: 0.548263
[304]	valid_0's auc: 0.548256
[305]	valid_0's auc: 0.548257
[306]	valid_0's auc: 0.548296
[307]	valid_0's auc: 0.548305
[308]	valid_0's auc: 0.548321
[309]	valid_0's auc: 0.548308
[310]	valid_0's auc: 0.548361
[311]	valid_0'

