In [1]:
import numpy as np
import pandas as pd
import re
import scipy.stats as st
import sklearn.metrics as met
import matplotlib.pyplot as plt
import sklearn.preprocessing as prep
from sklearn.model_selection import StratifiedKFold

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import hyperopt.pyll.stochastic

import time
import copy

%matplotlib inline
title = 'PPD'
path = '../sources/data/PPD-First-Round-Data'
icy = 'target'

In [2]:
pathofDataSaving = '../sources/deal/PPD-Data-Saving'
irt = np.load('{}/{}_irt.npy'.format(pathofDataSaving, title))
irv = np.load('{}/{}_irv.npy'.format(pathofDataSaving, title))
dac = pd.read_hdf('{}/{}_dac.h5'.format(pathofDataSaving, title), key = 'dac')
print dac.shape
print dac.loc[irt, icy].value_counts()/dac.loc[irt, icy].shape[0]

(49999, 389)
0.0    0.926733
1.0    0.073267
Name: target, dtype: float64


In [4]:
y = dac.loc[:, [icy]]
icx = list(set(dac.columns) - set([icy]))
x = dac.loc[:, icx]
x = x.apply(lambda x: x.fillna(x.median()),axis=0) # 使用中位数填充缺失值
x = (x.rank(pct = True)-0.5/x.shape[0]).apply(st.norm.ppf) # 正态标准化
#x = (x - x.mean())/x.std() # 中心归一标准化
xt = x.loc[irt, :].values
yt = y.loc[irt, :].values
xv = x.loc[irv, :].values

In [5]:
# blending with LR & XGB
from sklearn.model_selection import train_test_split
xt_d1, xt_d2, yt_d1, yt_d2 = train_test_split(xt, yt, test_size=0.5, random_state=0, stratify=yt)

In [6]:
print pd.Series(yt_d1.flatten()).value_counts() / yt_d1.shape[0]
print pd.Series(yt_d2.flatten()).value_counts() / yt_d2.shape[0]

0.0    0.926733
1.0    0.073267
dtype: float64
0.0    0.926733
1.0    0.073267
dtype: float64


In [7]:
from sklearn.linear_model import LogisticRegression
from  xgboost import XGBClassifier
models = [
    ['lr', LogisticRegression(random_state=0, penalty='l2', C=0.003, class_weight='balanced', solver='sag')], 
    ['xgbc', XGBClassifier(seed=0, max_depth=3, learning_rate=0.05, subsample=0.9, min_child_weight=1.2, colsample_bytree=0.2, colsample_bylevel=1.0, gamma=0.3, reg_lambda=1.0)]
]
# 用d1训练模型
blending_d2      = np.zeros((xt_d2.shape[0], len(models)))
blending_test = np.zeros((xv.shape[0], len(models)))



In [8]:
from sklearn.metrics import roc_auc_score
# for every base model
for j, (name, model) in enumerate(models):
    timeStart = time.time()
    model.fit(xt_d1, yt_d1)
    print 'No.{} - {} - Training Time: {:.2f} seconds'.format(j+1, name, time.time() - timeStart)
    blending_d2[:, j]  = model.predict_proba(xt_d2)[:, 1]
    blending_test[:, j] = model.predict_proba(xv)[:, 1]
#     print 'base model %s: testSet auc Score = %.6f' % (name,  roc_auc_score(y_test, blending_test[:, j]))

  y = column_or_1d(y, warn=True)


No.1 - lr - Training Time: 2.00 seconds


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


No.2 - xgbc - Training Time: 1.70 seconds


In [21]:
# 融合前结果
# blending_d2
for i in range(len(models)):
    print 'auc of No.%d model:'%(i+1), roc_auc_score(yt_d2, pd.DataFrame(blending_d2).iloc[:, i])

auc of No.1 model: 0.763548409627
auc of No.2 model: 0.726069877076


In [34]:
# blending 融合
clf = LogisticRegression()
clf.fit(blending_d2, yt_d2)
print clf.coef_
clf_predict_prob = clf.predict_proba(blending_test)[:, 1]

# 备注： 我们掌握了初赛轮具体结果，因此可以直接模拟线上得分

print("Linear stretch of predictions to [0,1]")
clf_predict_prob_stretch = (clf_predict_prob - clf_predict_prob.min()) / (clf_predict_prob.max() - clf_predict_prob.min())

# print 'blending result( no  stretch): testSet auc Score = %.6f' % (roc_auc_score(y_test, clf_predict_prob))
# print 'blending result(with stretch): testSet auc Score = %.6f' % (roc_auc_score(y_test, clf_predict_prob_stretch))

[[ 4.27610632  1.77402898]]
Linear stretch of predictions to [0,1]


In [31]:
# 从第二轮数据中模拟初赛线上打分
def Del_string(xstr):
    xstrc = xstr.strip().strip(u'市').strip(u'省')
    if(xstrc == ''):
        xstrc = np.nan
    return(xstrc)

newpath = '../sources/data/PPD-Second-Round-Data/first_round_test_data'
par_csv = dict(index_col = 0, encoding = 'GB18030', parse_dates = ["ListingInfo"], na_values = [-1], 
               converters = dict(zip(*[["UserInfo_{}".format(i) for i in [2,4,7,8,9,19,20]], [Del_string]*7])))

dat_fr1_master = pd.read_csv('{}/Kesci_Master_9w_gbk_2.csv'.format(newpath), **par_csv)
print dat_fr1_master.shape

(19999, 227)


In [35]:
print clf_predict_prob.shape
print roc_auc_score(dat_fr1_master['target'], clf_predict_prob)
print roc_auc_score(dat_fr1_master['target'], clf_predict_prob_stretch)

(19999,)
0.767580446937
0.767580446937
