In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pylab import *
from IPython.display import display

# 加载数据
# 数据：抽样一小时（2017031217）的行为数据，关联当天（20170312）所有阅读时长上报日志
# 说明：带nonull的文件名表示只保留有上报阅读时长的日志
file_name = 'data/tmp_hindi_app_iflow_reco_pv_rd_2017031217' # 'data/tmp_hindi_app_iflow_reco_pv_rd_2017030117'
columns = '''tm_pv,reco_id,ds,item_id,app,set_lang,cate,item_type,tm_rd,tm_vl,is_rf,clk,pub_tm,upd_tm,len,tm_clk'''.split(",")
d1 = pd.read_csv(file_name, sep='\1', header=None, names=columns, dtype={'cate':str})
d1['cate_org'] = d1['cate']
d1['cate'] = d1['cate'].str[0:3]

# 分类CTR
d1_ctr = d1[['cate','clk']].groupby('cate').mean()

In [134]:
# 数据描述
display(d1.columns)

display(d1.head(3))

display(d1.describe())

Index([u'tm_pv', u'reco_id', u'ds', u'item_id', u'app', u'set_lang', u'cate',
       u'item_type', u'tm_rd', u'tm_vl', u'is_rf', u'clk', u'pub_tm',
       u'upd_tm', u'len', u'tm_clk', u'cate_org'],
      dtype='object')

Unnamed: 0,tm_pv,reco_id,ds,item_id,app,set_lang,cate,item_type,tm_rd,tm_vl,is_rf,clk,pub_tm,upd_tm,len,tm_clk,cate_org
0,1489312596,30835857-a1fd-4bf9-a9f4-8a5e33281c30,WKjkrQteOkIDAK0mKrpIcHWg,1005364371572074,app_iflow,hindi,25,201,\N,\N,\N,0,1489151046,1489288318,423,\N,25
1,1489309224,5cfcc594-37fe-4b0e-aea1-8913a0e96279,WLd6Q+oaTtIDAB/l4kQhvpeI,1005364371572074,app_iflow,hindi,25,201,\N,\N,\N,0,1489151046,1489288318,423,\N,25
2,1489312545,b93617d0-ad31-4ea2-8d59-fae97700452a,WMPJH/0KB3sDAFCUcq5d2AvG,1005364371572074,app_iflow,hindi,25,201,1489333815,17970,0,1,1489151046,1489288318,423,1489312783,25


Unnamed: 0,tm_pv,item_id,item_type,clk,pub_tm,upd_tm,len
count,1035361.0,1035361.0,1035361.0,1035361.0,1035361.0,1035361.0,1035361.0
mean,1489311000.0,2471427000000000.0,25.10462,0.07297938,1488819000.0,1489203000.0,2595.212
std,1039.51,1474593000000000.0,66.87796,0.2601028,3984865.0,283304.3,6481.47
min,1489309000.0,141290900000.0,0.0,0.0,1402923000.0,1485410000.0,0.0
25%,1489310000.0,1205702000000000.0,0.0,0.0,1489191000.0,1489213000.0,1008.0
50%,1489311000.0,2467040000000000.0,0.0,0.0,1489249000.0,1489286000.0,1601.0
75%,1489312000.0,3533353000000000.0,0.0,0.0,1489295000.0,1489304000.0,2325.0
max,1489313000.0,6666664000000000.0,222.0,1.0,1489404000.0,1489405000.0,157008.0


In [120]:
d2 = d1.replace('\\N', np.nan, inplace=False)

# 基础特征
d2['td_pub'] = d2['tm_pv']-d2['pub_tm']
item_ctr = d2[['item_id','clk']].groupby('item_id').mean() # 需要平滑，本次仅是演示用
item_ctr.reset_index(level=0, inplace=True)
item_ctr.rename(columns={'clk':'item_ctr'}, inplace=True)
user_ctr = d2[['ds', 'clk']].groupby('ds').mean()
user_ctr.reset_index(level=0, inplace=True)
user_ctr.rename(columns={'clk':'user_ctr'}, inplace=True)
d2 = pd.merge(d2, item_ctr, how='left', on=['item_id'])
d2 = pd.merge(d2, user_ctr, how='left', on=['ds'])

# 交叉特征
d2['cate_item_type'] = d2['cate'].str.cat(d2['item_type'].astype('str'), sep='#')

display(d2.head(3))

Unnamed: 0,tm_pv,reco_id,ds,item_id,app,set_lang,cate,item_type,tm_rd,tm_vl,...,clk,pub_tm,upd_tm,len,tm_clk,cate_org,td_pub,item_ctr,user_ctr,cate_item_type
0,1489312596,30835857-a1fd-4bf9-a9f4-8a5e33281c30,WKjkrQteOkIDAK0mKrpIcHWg,1005364371572074,app_iflow,hindi,25,201,,,...,0,1489151046,1489288318,423,,25,161550,0.115385,0.072165,025#201
1,1489309224,5cfcc594-37fe-4b0e-aea1-8913a0e96279,WLd6Q+oaTtIDAB/l4kQhvpeI,1005364371572074,app_iflow,hindi,25,201,,,...,0,1489151046,1489288318,423,,25,158178,0.115385,0.006565,025#201
2,1489312545,b93617d0-ad31-4ea2-8d59-fae97700452a,WMPJH/0KB3sDAFCUcq5d2AvG,1005364371572074,app_iflow,hindi,25,201,1489333815.0,17970.0,...,1,1489151046,1489288318,423,1489312783.0,25,161499,0.115385,0.215278,025#201


In [123]:
from sklearn.preprocessing import LabelEncoder

# 样本生成
id_col = ['reco_id']
label_col = ['clk']
cat_cols = ['ds', 'item_id', 'app', 'set_lang', 'cate', 'item_type', 'cate_item_type']
num_cols = ['len', 'td_pub', 'item_ctr', 'user_ctr']
feat_cols = cat_cols+num_cols
w_col = ['tm_vl']

# 填充缺省值
def fillna_mean(df, col):
    df_tmp = df[col][d2[col].isnull().any(axis=1)==False]
    df_fill_val = d2_tmp[col].astype('float').mean().values[0].astype('float')
    df[col] = df[col].fillna(df_fill_val)
    
fillna_mean(d2, w_col)
display(d2.head(5))

# 离散特征编码
label_encoder = LabelEncoder()
for col in cat_cols:
    d2[col] = label_encoder.fit_transform(d2[col].astype('str'))
    
# 目标编码
# d2[label_col] = label_encoder.fit_transform(d2[label_col].astype('str'))

display(d2.head(3))

# split
d2['is_train'] = np.random.rand(len(d2)) <= 0.8
d2_train = d2[d2['is_train']==True]
d2_train_x = d2_train[feat_cols].values
d2_train_y = d2_train[label_col].values
d2_train_w = d2_train[w_col].values

d2_train_dev = d2[d2['is_train']==False]
d2_train_dev_x = d2_train_dev[feat_cols].values
d2_train_dev_y = d2_train_dev[label_col].values
d2_train_dev_w = d2_train_dev[w_col].values

Unnamed: 0,tm_pv,reco_id,ds,item_id,app,set_lang,cate,item_type,tm_rd,tm_vl,...,pub_tm,upd_tm,len,tm_clk,cate_org,td_pub,item_ctr,user_ctr,cate_item_type,is_train
0,1489312596,30835857-a1fd-4bf9-a9f4-8a5e33281c30,13063,38,0,0,23,2,,51237,...,1489151046,1489288318,423,,25,161550,0.115385,0.072165,97,False
1,1489309224,5cfcc594-37fe-4b0e-aea1-8913a0e96279,14227,38,0,0,23,2,,51237,...,1489151046,1489288318,423,,25,158178,0.115385,0.006565,97,False
2,1489312545,b93617d0-ad31-4ea2-8d59-fae97700452a,15097,38,0,0,23,2,1489333815.0,17970,...,1489151046,1489288318,423,1489312783.0,25,161499,0.115385,0.215278,97,True
3,1489309825,4f3778e7-a0a2-4881-9fbd-72c28204e150,1628,38,0,0,23,2,,51237,...,1489151046,1489288318,423,,25,158779,0.115385,0.058824,97,True
4,1489309329,08c65ebb-54ad-4d17-8f6f-c82170722fa9,14172,38,0,0,23,2,,51237,...,1489151046,1489288318,423,,25,158283,0.115385,0.033632,97,False


Unnamed: 0,tm_pv,reco_id,ds,item_id,app,set_lang,cate,item_type,tm_rd,tm_vl,...,pub_tm,upd_tm,len,tm_clk,cate_org,td_pub,item_ctr,user_ctr,cate_item_type,is_train
0,1489312596,30835857-a1fd-4bf9-a9f4-8a5e33281c30,3407,21809,0,0,16,2,,51237,...,1489151046,1489288318,423,,25,161550,0.115385,0.072165,107,False
1,1489309224,5cfcc594-37fe-4b0e-aea1-8913a0e96279,4700,21809,0,0,16,2,,51237,...,1489151046,1489288318,423,,25,158178,0.115385,0.006565,107,False
2,1489312545,b93617d0-ad31-4ea2-8d59-fae97700452a,5666,21809,0,0,16,2,1489333815.0,17970,...,1489151046,1489288318,423,1489312783.0,25,161499,0.115385,0.215278,107,True


In [73]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=20, max_depth=3, max_features="auto")
rf.fit(d2_train_x, d2_train_y.ravel())

0.813641038234 0.810713505989


In [79]:
from sklearn import metrics

# 评估
def evaluate(model, x, y, sample_weight=None):
    preds = model.predict_proba(x)
    fpr, tpr, _ = metrics.roc_curve(y, preds[:,1], sample_weight=sample_weight)
    return metrics.auc(fpr, tpr),preds

auc_train, preds_train = evaluate(rf, d2_train_x, d2_train_y)
auc_train_dev, preds_train_dev = evaluate(rf, d2_train_dev_x, d2_train_dev_y)
print auc_train,auc_train_dev

# d2_train_dev['score'] = preds_train_dev[:,1]
# d2_train_dev.to_csv('/home/datamining/jupyter/yaowq/read_duration_analy/data/model_train_dev_output.csv', columns=['reco_id','clk','score'])


0.813641038234 0.810713505989


In [131]:
# 随机森林 + 样本权重
rf.fit(d2_train_x, d2_train_y.ravel(), sample_weight=d2_train_w.astype('float').ravel())

(827741, 1)
(827741, 11)
(827741, 1)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [140]:
# 时长加权模型 + 无加权评估
auc_train, preds_train = evaluate(rf, d2_train_x, d2_train_y)
auc_train_dev, preds_train_dev = evaluate(rf, d2_train_dev_x, d2_train_dev_y)
print auc_train,auc_train_dev

# 时长加权模型 + 时长加权评估
# auc_train, preds_train = evaluate(rf, d2_train_x, d2_train_y, d2_train_w.astype('float').ravel())
# auc_train_dev, preds_train_dev = evaluate(rf, d2_train_dev_x, d2_train_dev_y, d2_train_dev_w.astype('float').ravel())
# print auc_train,auc_train_dev

0.812301955365 0.808230571546


In [84]:
# 类型特征LabelEncoder + LR
# 注：仅用于接口测试，对于LR模型，类型变量其实不适合进行LabelEncoder，用onehot编码更合适，下文也会进一步测试
from sklearn.linear_model import LogisticRegression

# 类型变量采用LabelEncoder
lr = LogisticRegression(penalty="l1", C=100)
lr.fit(d2_train_x, d2_train_y)

# 评估
# 0.836040436558 0.832571600873
# Note：这里LabelEncoder不会带来效果的明显折损，是因为实验用的几个类型特征都是弱特征，模型贡献主要是统计ctr特征
auc_train, preds_train = evaluate(lr, d2_train_x, d2_train_y)
auc_train_dev, preds_train_dev = evaluate(lr, d2_train_dev_x, d2_train_dev_y)
print auc_train,auc_train_dev

# 类型变量采用LabelEncoder
lr = LogisticRegression(penalty="l2", C=100)
lr.fit(d2_train_x, d2_train_y)

# 评估
auc_train, preds_train = evaluate(lr, d2_train_x, d2_train_y)
auc_train_dev, preds_train_dev = evaluate(lr, d2_train_dev_x, d2_train_dev_y)
print auc_train,auc_train_dev

  y = column_or_1d(y, warn=True)


0.836040436558 0.832571600873


  y = column_or_1d(y, warn=True)


0.522764660571 0.522618092618


In [142]:
# 类型特征LabelEncoder + LR + 时长加权
# 注：0.17版本LR还不支持样本加权，会发生以下错误，需要更新到0.18以上版本
# ValueError: Solver liblinear does not support sample weights

# # 类型变量采用LabelEncoder
# lr = LogisticRegression(penalty="l1", C=100)
# lr.fit(d2_train_x, d2_train_y, sample_weight=d2_train_w.astype('float').ravel())

# # 评估
# auc_train, preds_train = evaluate(lr, d2_train_x, d2_train_y)
# auc_train_dev, preds_train_dev = evaluate(lr, d2_train_dev_x, d2_train_dev_y)
# print auc_train,auc_train_dev

In [192]:
# 类型特征OneHotEncoder + LR
# Note：OneHotEncoder太耗内存，后续尝试DictEncoder和FeatureHasher
from sklearn.preprocessing import OneHotEncoder

# d3 = d2.copy()

# 离散特征编码
# Note：Memory Error！
# label_enc = LabelEncoder()
# onehot_enc = OneHotEncoder()
# for col in cat_cols:
#     d3[col] = label_enc.fit_transform(d3[col].astype('str'))
#     d3[col] = onehot_enc.fit_transform(d3[col])
    
# display(d3.head(3))

# # split
# d3['is_train'] = np.random.rand(len(d3)) <= 0.8
# d3_train = d3[d3['is_train']==True]
# d3_train_x = d3_train[feat_cols].values
# d3_train_y = d3_train[label_col].values
# d3_train_w = d3_train[w_col].values

# d3_train_dev = d3[d3['is_train']==False]
# d3_train_dev_x = d3_train_dev[feat_cols].values
# d3_train_dev_y = d3_train_dev[label_col].values
# d3_train_dev_w = d3_train_dev[w_col].values


(1035361, 22)


In [232]:
# 类型特征FeatureHasher + LR
# 说明：相比OneHotEncoder，采用稀疏存储，可以减少大量内存，缺点是可能存在hash冲突，效果略微有损

# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import hstack

sample_num = 10000

feat_hash = FeatureHasher(n_features=1048576)
# fh = feat_hash.fit_transform(d2[cat_cols].head(sample_num).T.to_dict().values())
fh = feat_hash.fit_transform(d2[cat_cols].T.to_dict().values())

def split_train_test(x, y, w, nx, train_ratio=0.8):
    samples_num, feats_num = np.shape(x)
    indices = np.random.permutation(samples_num)
    train_num = int(samples_num*train_ratio)
    train_idx, test_idx = indices[:train_num], indices[train_num:]
    train_cat_x, train_num_x, train_y, train_w = x[train_idx,:], nx[train_idx,:], y[train_idx], w[train_idx]
    train_dev_cat_x, train_dev_num_x, train_dev_y, train_dev_w = x[test_idx,:], nx[test_idx,:], y[test_idx], w[test_idx]
    train_x = hstack((train_cat_x, train_num_x))
    train_dev_x = hstack((train_dev_cat_x, train_dev_num_x))
    return train_x,train_y,train_w,train_dev_x,train_dev_y,train_dev_w

# train_x, train_y, train_w, train_dev_x, train_dev_y, train_dev_w = split_train_test(
#     fh, d2[label_col].values[0:sample_num], d2[w_col].values[0:sample_num]
#     , d2[num_cols].values[0:sample_num])

train_x, train_y, train_w, train_dev_x, train_dev_y, train_dev_w = split_train_test(
    fh, d2[label_col].values, d2[w_col].values
    , d2[num_cols].values)

# 训练
lr = LogisticRegression(penalty="l1", C=10000)
lr.fit(train_x, train_y)

# 评估
# 0.834878608249 0.837114699089
# Note：如果没有ctr统计特征，auc只有0.53左右
auc_train, preds_train = evaluate(lr, train_x, train_y)
auc_train_dev, preds_train_dev = evaluate(lr, train_dev_x, train_dev_y)
print auc_train,auc_train_dev


  y = column_or_1d(y, warn=True)


0.834878608249 0.837114699089


In [239]:
# 类型特征DictVectorizer + LR
# 说明：相比OneHotEncoder，采用稀疏存储，可以减少大量内存，不过需要保存特征词典，相比FeatureHasher有一定内存开销

from sklearn.feature_extraction import DictVectorizer

sample_num = 10000

dict_vec = DictVectorizer(sparse=True, sort=True)
# fh = dict_vec.fit_transform(d2[cat_cols].head(sample_num).T.to_dict().values())
fh = dict_vec.fit_transform(d2[cat_cols].T.to_dict().values())

def split_train_test(x, y, w, nx, train_ratio=0.8):
    samples_num, feats_num = np.shape(x)
    indices = np.random.permutation(samples_num)
    train_num = int(samples_num*train_ratio)
    train_idx, test_idx = indices[:train_num], indices[train_num:]
    train_cat_x, train_num_x, train_y, train_w = x[train_idx,:], nx[train_idx,:], y[train_idx], w[train_idx]
    train_dev_cat_x, train_dev_num_x, train_dev_y, train_dev_w = x[test_idx,:], nx[test_idx,:], y[test_idx], w[test_idx]
    train_x = hstack((train_cat_x, train_num_x))
    train_dev_x = hstack((train_dev_cat_x, train_dev_num_x))
    return train_x,train_y,train_w,train_dev_x,train_dev_y,train_dev_w

# train_x, train_y, train_w, train_dev_x, train_dev_y, train_dev_w = split_train_test(
#     fh, d2[label_col].values[0:sample_num], d2[w_col].values[0:sample_num]
#     , d2[num_cols].values[0:sample_num])

train_x, train_y, train_w, train_dev_x, train_dev_y, train_dev_w = split_train_test(
    fh, d2[label_col].values, d2[w_col].values
    , d2[num_cols].values)

# 训练
lr = LogisticRegression(penalty="l1", C=10000)
lr.fit(train_x, train_y)

# 评估
# 0.83550625566 0.834445163291
# Note：如果没有ctr统计特征，auc只有0.53左右
auc_train, preds_train = evaluate(lr, train_x, train_y)
auc_train_dev, preds_train_dev = evaluate(lr, train_dev_x, train_dev_y)
print auc_train,auc_train_dev


  y = column_or_1d(y, warn=True)


0.83550625566 0.834445163291
