In [116]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt

In [117]:
#序列化/反序列化函数
import pickle
def to_file(df, filename):
    """Serialize ``df`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(df, sink)

def from_file(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [118]:
# Load the pickled training and test sets.
# Exact duplicate rows are removed from the training set only.
df_train = from_file('../data/train01.pkl').drop_duplicates()
df_test = from_file('../data/test01.pkl')
# De-duplication of the test set is disabled:
#df_test.drop_duplicates(inplace=True)
# Stack train on top of test (row-wise); original row labels are kept.
df = pd.concat([df_train, df_test], axis=0)

In [86]:
#定义数据Class方便计算统计
class DataView:
    """Convenience wrapper around a DataFrame for quick id/category statistics.

    Attributes:
        data: the wrapped DataFrame (not copied).
        fields: list of the DataFrame's column names at construction time.
    """

    def __init__(self,df):
        self.data = df
        self.fields = self.data.columns.tolist()

    def _column_list(self, column):
        """Return the values of ``column`` as a plain Python list."""
        return self.data[column].tolist()

    def _column_set(self, column):
        """Return the distinct values of ``column`` as a Python set."""
        return set(self._column_list(column))

    @property
    def item_list(self):
        """List of advertised item ids (``item_id`` column)."""
        return self._column_list('item_id')

    @property
    def item_set(self):
        """Set of distinct advertised item ids."""
        return self._column_set('item_id')

    @property
    def user_list(self):
        """List of user ids (``user_id`` column)."""
        return self._column_list('user_id')

    @property
    def user_set(self):
        """Set of distinct user ids."""
        return self._column_set('user_id')

    @property
    def context_list(self):
        """List of context ids (``context_id`` column)."""
        return self._column_list('context_id')

    @property
    def context_set(self):
        """Set of distinct context ids."""
        return self._column_set('context_id')

    @property
    def shop_list(self):
        """List of shop ids (``shop_id`` column)."""
        return self._column_list('shop_id')

    @property
    def shop_set(self):
        """Set of distinct shop ids."""
        return self._column_set('shop_id')

    def item_category_list(self,i):
        """Return the level-``i+1`` category of every multi-level item.

        Only rows whose ``item_category_list`` string contains ``';'`` are
        considered. Rows that are missing, or that have no entry at position
        ``i`` (e.g. asking for the third level of a two-level item), are
        skipped instead of raising ``IndexError``.
        """
        categories = self.data['item_category_list']
        # na=False keeps missing values out of the mask instead of poisoning it.
        multi_level = categories[categories.str.contains(';', regex=False, na=False)]
        picked = []
        for raw in multi_level.tolist():
            levels = raw.split(';')
            if -len(levels) <= i < len(levels):
                picked.append(levels[i])
        return picked

    def item_category_set(self,i):
        """Return the distinct level-``i+1`` categories (see ``item_category_list``)."""
        return set(self.item_category_list(i))

    def filter_by_context_time(self,start_time,end_time):
        """Return the rows whose ``context_timestamp`` lies in ``[start_time, end_time]``.

        Both bounds are inclusive.
        """
        in_window = self.data['context_timestamp'].between(start_time, end_time)
        return self.data[in_window]

In [119]:
def get_time_diff(date1,date2):
    '''Return the absolute difference between two datetimes in whole seconds.

    The order of the arguments does not matter. ``total_seconds()`` is used
    rather than the ``.seconds`` attribute, because ``.seconds`` only holds
    the sub-day remainder and silently drops whole days.
    '''
    return int(abs(date1 - date2).total_seconds())

def context_timestamp_partition(t):
    '''Map a timestamp to one of six day-part codes based on ``t.hour``.

    0: 00:00-06:00 early morning
    1: 06:00-11:00 morning
    2: 11:00-13:00 noon
    3: 13:00-16:00 afternoon
    4: 16:00-18:00 dusk
    5: 18:00-24:00 evening (also the fallback for any other hour value)
    '''
    hour = t.hour
    # Half-open [start, end) hour windows; the position in the tuple is the code.
    windows = ((0, 6), (6, 11), (11, 13), (13, 16), (16, 18))
    for code, (start, end) in enumerate(windows):
        if start <= hour < end:
            return code
    return 5

def data_list(df, column_id):
    """Return the values of column ``column_id`` of ``df`` as a Python list."""
    column = df[column_id]
    return column.tolist()
def data_set(df, column_id):
    """Return the distinct values of column ``column_id`` of ``df`` as a set."""
    values = df[column_id].tolist()
    return set(values)

def min_max_normalize(df,name):
    '''Min-max normalise column ``name`` of ``df`` and return ``df``.

    Each value ``x`` becomes ``(x - min + eps) / (max - min + eps)`` with
    ``eps = 10e-8``; the epsilon keeps a constant column from dividing by
    zero (such a column maps to 1.0 everywhere).

    The column is replaced on the passed-in frame (callers rely on this
    in-place contract) and the same frame is returned. The arithmetic is
    done on the whole column at once instead of one Python call per row.
    '''
    eps = 10e-8  # i.e. 1e-7, same smoothing constant as before
    maxnum = df[name].max()
    minnum = df[name].min()
    scaled = (df[name] - minnum + eps) / float(maxnum - minnum + eps)
    df[name] = scaled.astype(float)
    return df

def get_category_2(x):
    """Return the second-level category from a ``';'``-separated category string.

    Raises:
        AssertionError: if ``x`` contains no ``';'`` and therefore has no
            second level. The check is made on the argument itself, not on
            an unrelated row of a global frame.
    """
    assert ';' in x,'不存在第二类别'
    return x.split(';')[1]

#### 训练集有27列，测试集有26列
#### 训练集478111行，测试集18371行
#### 训练集时间从2018-09-18 00:00:01到2018-09-24 23:59:47
#### 测试集时间从2018-09-25 00:00:02到2018-09-25 23:59:25
#### 产品二级类目有13种，一级品类都只有1种,一共128座城市
#### 训练集测试集共有207641用户参与， 共有496482种上下文(一次点击应该就是一次上下文)，4003个店铺，10236种物品
#### 有3534种物品，1971个店铺,3626个用户同时存在训练集和测试集，9947个用户，44个店铺只存在测试集

In [54]:
# Summary statistics of how many rows each distinct brand id occurs in
# (value_counts gives rows per brand, describe summarises those counts).
df['item_brand_id'].value_counts().describe()

count     2075.000000
mean       239.268434
std       1730.384050
min          1.000000
25%          4.000000
50%         21.000000
75%        122.000000
max      72427.000000
Name: item_brand_id, dtype: float64

### 训练集478111条数据，测试集18371条数据

In [120]:
# 1. List-valued category features
listItem = [
    'item_category_list',
    'item_property_list',
]

# 2. Single-valued categorical features, grouped by entity
singleIntItem = [
    'item_city_id',
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
    'item_pv_level',
    'item_brand_id',
    'item_category_2',
]
singleIntUser = [
    'user_gender_id',
    'user_age_level',
    'user_occupation_id',
    'user_star_level',
]
singleIntContext = [
    'context_page_id',
    'time_stage',
]
singleIntShop = [
    'shop_review_num_level',
    'shop_star_level',
]
singleIntFeature = singleIntItem + singleIntUser + singleIntContext + singleIntShop

# 3. Continuous features, their discretised counterparts, and per-user click counts
singleDoubleShop = [
    'shop_review_positive_rate',
    'shop_score_service',
    'shop_score_delivery',
    'shop_score_description',
]
singleDoubleShopDispersed = [
    'shop_review_positive_rate_dispersed',
    'shop_score_service_dispersed',
    'shop_score_delivery_dispersed',
    'shop_score_description_dispersed',
]
singleuser = [
    'user_clicknums_a_day',
    'user_clicknums_a_hour',
]

# 4. Identifier columns
idList = [
    'instance_id',
    'item_id',
    'user_id',
    'context_id',
    'shop_id',
]

# 5. Features not used yet
unsureList = [
    'context_timestamp',
    'predict_category_property',
]

# 6. Train flag / label columns and helper time columns
label = ['is_train', 'is_trade']
other = ['hour', 'day']

In [121]:
#特征转化函数
def convert_data(data, small_brand_max_count=22, small_brand_id=77777777777, n_bins=10):
    """Build the engineered feature table from a raw click-log frame.

    The caller's frame is left untouched; a new frame with a fresh
    0..n-1 index is returned, containing the original columns plus:

    * ``item_brand_id`` with rare brands collapsed into ``small_brand_id``
    * ``item_category_2`` (second-level category), ``time_stage``, ``hour``, ``day``
    * ``user_clicknums_a_day`` / ``user_clicknums_a_hour`` (rows per user per day / hour)
    * one-hot columns for every feature in ``singleIntFeature``
    * ``<col>_dispersed`` (equal-width bin label 1..n_bins) for every column in
      ``singleDoubleShop``, plus one-hot columns for those bins

    One-hot columns are attached by row position instead of by merging on
    ``instance_id``, so repeated instance ids can neither multiply nor drop rows.

    Args:
        data: raw frame; must contain the columns named in ``singleIntFeature``
            (except the derived ``item_category_2`` / ``time_stage``),
            ``singleDoubleShop``, ``item_category_list``, ``context_timestamp``
            and ``user_id``.
        small_brand_max_count: brands occurring at most this many times are
            merged into one bucket.
        small_brand_id: id assigned to that bucket.
        n_bins: number of equal-width bins for the continuous shop scores.

    Returns:
        The new feature DataFrame.
    """
    # reset_index returns a new frame, so none of the assignments below leak
    # into the caller's object, and the clean index makes the column-wise
    # concatenations further down safe.
    data = data.reset_index(drop=True)

    # Collapse rare brands into a single bucket (one vectorised membership test
    # instead of rebuilding a lookup list for every row).
    brand_counts = data['item_brand_id'].value_counts()
    small_brand = brand_counts[brand_counts <= small_brand_max_count].index
    is_small = data['item_brand_id'].isin(small_brand)
    data['item_brand_id'] = data['item_brand_id'].where(~is_small, small_brand_id)

    # Second-level category.
    data['item_category_2'] = data['item_category_list'].map(get_category_2)
    # Day-part of the click time, plus hour/day helper columns for the click counts.
    data['time_stage'] = data['context_timestamp'].apply(context_timestamp_partition)
    data['hour'] = data['context_timestamp'].apply(lambda ts: ts.hour)
    data['day'] = data['context_timestamp'].apply(lambda ts: ts.day)

    # Number of rows per user per day, and per user per day-hour.
    data['user_clicknums_a_day'] = (
        data.groupby(['user_id', 'day'])['user_id'].transform('size'))
    data['user_clicknums_a_hour'] = (
        data.groupby(['user_id', 'day', 'hour'])['user_id'].transform('size'))

    # --- Categorical features -> one-hot ------------------------------------
    category = data.loc[:, singleIntFeature]
    # The marker -1 stands for "missing"; replace it by the column's most frequent value.
    category = category.replace(-1, category.mode().loc[0])
    # Cast to str so get_dummies treats every column as categorical.
    category = category.astype(str)
    data = pd.concat([data, pd.get_dummies(category)], axis=1)

    # --- Continuous shop scores -> equal-width bins -> one-hot --------------
    bin_labels = list(range(1, n_bins + 1))
    dispersed_cols = []
    for col in singleDoubleShop:
        # Missing (-1) scores are filled with the mean of the known ones.
        ser = data[col].replace(-1, np.nan)
        ser = ser.fillna(ser.mean())
        dispersed_name = col + '_dispersed'
        data[dispersed_name] = pd.cut(ser, n_bins, labels=bin_labels)
        dispersed_cols.append(dispersed_name)
    data = pd.concat([data, pd.get_dummies(data[dispersed_cols])], axis=1)

    return data

In [113]:
# Run the feature pipeline on the raw test frame only.
test=convert_data(df_test)

In [58]:
# Persist the engineered feature table to 'feature_all.pkl'.
# NOTE(review): `a` is not defined in any visible cell — presumably the result
# of convert_data on the combined train+test frame; confirm, otherwise this
# cell fails on a fresh kernel.
to_file(a,'feature_all.pkl')

In [122]:
# Reload the cached feature table from 'feature_all.pkl'.
data=from_file('feature_all.pkl')

In [12]:
# Class imbalance: rows with is_trade == '0' per row with is_trade == '1'
# (the label is compared as a string here).
len(data[data['is_trade']=='0'])/len(data[data['is_trade']=='1'])

52.19857682899711

In [110]:
# Row count of the reloaded feature table.
len(data)

496860

In [127]:
# Row count of the raw test frame.
len(df_test)

18371

In [124]:
# Per-shop purchase rate, read from 'purchase_rate.csv'; the 'q' column is not needed.
df_shop_rate = pd.read_csv('purchase_rate.csv')
df_shop_rate = df_shop_rate.drop(['q'], axis=1)
# Min-max normalise the purchase rate.
df_shop_rate = min_max_normalize(df_shop_rate, 'purchase_rate')
# Right join keeps every row of the feature table, including shops without a known rate.
df = pd.merge(df_shop_rate, data, how='right', on='shop_id')
# Shops without a known rate get 0. The result is assigned back explicitly:
# fillna(inplace=True) on a .loc selection works on a temporary object and is
# not guaranteed to update df itself.
df['purchase_rate'] = df['purchase_rate'].fillna(0)

In [130]:
# Number of columns in `test`.
len(test.columns)

528

In [67]:
def gen_data(data,start_day,end_day,validate_day):
    """Split ``data`` by its ``day`` column into a training and a hold-out frame.

    Args:
        data: frame with a ``day`` column.
        start_day: first day (inclusive) of the training window.
        end_day: last day (inclusive) of the training window.
        validate_day: the single day used for the hold-out frame.

    Returns:
        ``(train, test)`` tuple. Both frames are independent copies, so later
        in-place edits (e.g. normalisation) neither touch ``data`` nor raise
        SettingWithCopyWarning.
    """
    in_window = data['day'].between(start_day, end_day)
    train = data[in_window].copy()
    test = data[data['day'] == validate_day].copy()
    return (train,test)

In [129]:
# Row count of `test`.
len(test)

18371

In [128]:
# Days 18-24 form the training split, day 25 the hold-out split.
# NOTE(review): `te` is not referenced by any later visible cell; train_model
# is called with `test` instead — confirm which frame is intended.
(train,te) = gen_data(df,18,24,25)

In [None]:
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.metrics import precision_recall_curve, roc_curve, auc

In [71]:
def train_model(train, test, n_rounds=30, random_state=None):
    """Fit a logistic regression on repeated random subsamples and average its test predictions.

    In every round a random sample of ``len(test)`` training rows is drawn,
    the (warm-started) model is fitted on it, and its class probabilities for
    ``test`` are accumulated. The returned prediction is the mean over rounds.

    Args:
        train: training frame containing the feature columns and ``is_trade``.
        test: frame to predict; must contain the same feature columns.
        n_rounds: number of subsample/fit rounds (default 30).
        random_state: optional seed for the subsampling; ``None`` uses the
            global NumPy random state.

    Returns:
        ``(lr, train_loss, res)`` where ``lr`` is the fitted model,
        ``train_loss`` maps the 1-based round number to the log-loss on that
        round's training sample, and ``res`` is the averaged
        ``predict_proba`` output for ``test``.

    Raises:
        ValueError: if ``n_rounds`` is smaller than 1.
    """
    if n_rounds < 1:
        raise ValueError('n_rounds must be at least 1')

    # min_max_normalize rewrites columns in place; work on copies so the
    # caller's frames stay untouched and re-running is repeatable.
    train = train.copy()
    test = test.copy()
    # NOTE(review): train and test are scaled independently, each with its own
    # min/max — confirm this is intended rather than reusing the train range.
    for count_col in ('user_clicknums_a_day', 'user_clicknums_a_hour'):
        train = min_max_normalize(train, count_col)
        test = min_max_normalize(test, count_col)

    # Everything that is an id, a raw (non-encoded) feature, a label or a
    # helper column is excluded; what remains are the model inputs.
    excluded = set(idList + singleDoubleShopDispersed + singleDoubleShop
                   + singleIntFeature + listItem + unsureList + label + other)
    feature = [col for col in train.columns if col not in excluded]
    x_test = test.loc[:, feature]

    # One shared RandomState so successive rounds draw different samples.
    rng = None if random_state is None else np.random.RandomState(random_state)
    lr = LogisticRegression(n_jobs=4,warm_start=True,class_weight={'0':0.6,'1':0.4},solver='sag')

    train_loss = {}
    res = None
    for round_no in range(1, n_rounds + 1):
        df_temp = train.sample(len(test), random_state=rng)
        x_train, y_train = df_temp.loc[:, feature], df_temp.loc[:, 'is_trade']
        lr.fit(x_train, y_train)

        y_test_pred = lr.predict_proba(x_test)
        res = y_test_pred if res is None else res + y_test_pred

        sun2 = log_loss(y_train, lr.predict_proba(x_train))
        train_loss[round_no] = sun2
        print('train_loss:',round_no,'###',sun2)
    return lr, train_loss, res / n_rounds

In [72]:
# Fit the model; keep the fitted estimator, the per-round training losses and
# the averaged class probabilities for `test`.
lr,train_loss,res = train_model(train,test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


train_loss: 1 ### 0.0790246188976
train_loss: 2 ### 0.0829016577802
train_loss: 3 ### 0.0780763184355
train_loss: 4 ### 0.0909102412856
train_loss: 5 ### 0.0815745143127
train_loss: 6 ### 0.0774856952499
train_loss: 7 ### 0.0869274395524
train_loss: 8 ### 0.0825630227475
train_loss: 9 ### 0.075144598438
train_loss: 10 ### 0.0853161444218
train_loss: 11 ### 0.0842610774525
train_loss: 12 ### 0.0800099198169
train_loss: 13 ### 0.0857455268853
train_loss: 14 ### 0.0774212487053
train_loss: 15 ### 0.0863137705375
train_loss: 16 ### 0.0810348460798
train_loss: 17 ### 0.0817202708566
train_loss: 18 ### 0.0792449874831
train_loss: 19 ### 0.0793787090418
train_loss: 20 ### 0.0805358939834
train_loss: 21 ### 0.0764119731794
train_loss: 22 ### 0.0857940401344
train_loss: 23 ### 0.0820830638082
train_loss: 24 ### 0.0847357004623
train_loss: 25 ### 0.0832764357109
train_loss: 26 ### 0.0800455017645
train_loss: 27 ### 0.0940483708929
train_loss: 28 ### 0.0828778748297
train_loss: 29 ### 0.081374603

In [81]:

# Write a space-separated submission file without the index column.
# NOTE(review): `resultii` is not defined in any visible cell — confirm where
# it comes from; this cell fails on a fresh kernel as written.
resultii.to_csv('submit.csv',sep=' ',index=False)

In [77]:
# Attach the probability of the second class (column 1 of predict_proba).
# assign() builds a new frame, so nothing is written into a possibly sliced
# DataFrame (the direct column assignment raised SettingWithCopyWarning).
test = test.assign(predicted_score=res[:, 1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [144]:
# Scratch check: one-hot encode the is_trade column into two dense columns.
# NOTE(review): `n_values` and `sparse` are legacy OneHotEncoder arguments —
# confirm they are still accepted by the installed scikit-learn version.
one_hot = OneHotEncoder(n_values=2, sparse=False)
one_hot.fit_transform(np.array([[x] for x in  test['is_trade']]))

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.]])

In [149]:
# Scratch check: the fractional inputs 0.8 / 0.5 / 0.2 all land in the first
# column (see the output below), i.e. probabilities cannot be encoded this way.
one_hot.fit_transform([[0.8],[0.5],[0.2]])

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [32]:
# Integer codes of a categorical built from string labels (categories are
# sorted, so 'no' -> 0 and 'yes' -> 1). The plain constructor replaces the
# removed Categorical.from_array and yields the same codes.
pd.Categorical(['yes','no','yes','yes','no']).codes

  """Entry point for launching an IPython kernel.


array([1, 0, 1, 1, 0], dtype=int8)

In [78]:
# Export instance ids with their predicted scores as a space-separated file,
# without the index column.
test[['instance_id','predicted_score']].to_csv('submit2.csv',sep=' ',index=False)

In [96]:
# Instance ids that occur in the raw test frame but not in `test`.
no_set=set(df_test['instance_id'])-set(test['instance_id'])

In [99]:
# Show the raw test rows whose instance_id is in no_set (vectorised membership
# test instead of a per-row lambda).
df_test[df_test['instance_id'].isin(no_set)]

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,context_timestamp,context_page_id,predict_category_property,shop_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description
3701,493620690211425575,6389594436054073436,7908382889764677758;7258015885215914736,5131280576272319091;2636395404473730413;914848...,8423871433538420199,3122721854741763495,6,10,11,11,...,2018-09-25 20:48:58,4001,7258015885215914736:-1;7822717283490579102:821...,5211111345559045894,11,0.985345,5009,0.963611,0.959771,0.960137
4122,7784032091418480690,263926865766947419,7908382889764677758;3203673979138763595,7323906747451897092;7323906747451897092;513128...,2377959441382357698,2174699138227015967,5,7,13,10,...,2018-09-25 21:59:28,4011,509660095530134768:2636395404473730413;8277336...,3245796615214673033,13,0.987183,5011,0.968506,0.968851,0.966667
4173,3023563969625408045,5886770794298157156,7908382889764677758;2436715285093487584,5977512434884267894;5131280576272319091;263639...,1295326989909367975,8762827044490678569,6,6,10,11,...,2018-09-25 14:48:21,4008,2648343924459474923:7731581851357328826;294816...,1915715927446648474,16,0.996437,5013,0.9849,0.98248,0.978512
4815,4783775988103700626,6401098323093546050,7908382889764677758;3203673979138763595,7323906747451897092;5131280576272319091;263639...,7838285046767229711,4918413420989329604,5,1,5,7,...,2018-09-25 12:18:03,4004,3203673979138763595:-1;1771349742445680985:821...,839860939867801483,12,0.957981,5010,0.953106,0.955171,0.944966
5279,4855889122837730085,4825203222395371881,7908382889764677758;509660095530134768,5131280576272319091;7344985833148694227;652390...,7838285046767229711,7322157373578955368,7,2,2,9,...,2018-09-25 16:53:49,4001,509660095530134768:-1;5755694407684602296:-1;7...,290760514900863974,5,1.0,5002,0.96,0.96,0.96
5535,3963288418394653639,9112330538010486160,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,6108765258052978520,3122721854741763495,6,5,11,9,...,2018-09-25 13:20:29,4013,"5799347067982556520:3657871859501171040,773863...",8248010798833268483,14,0.97784,5011,0.951628,0.951628,0.947907
5888,4850288548049827644,5709673699425143194,7908382889764677758;3203673979138763595,7323906747451897092;5131280576272319091;263639...,7838285046767229711,2174699138227015967,5,1,3,3,...,2018-09-25 06:40:40,4008,3203673979138763595:-1;836752724084922533:5131...,3069435511756040823,15,0.985613,5012,0.974587,0.972529,0.970426
6273,4648129525801501490,999527098781133935,7908382889764677758;5799347067982556520,5131280576272319091;2636395404473730413;316326...,5890149139702262860,1019055478500227370,5,1,2,6,...,2018-09-25 15:56:04,4019,4879721024980945592:1530585142987372715;790838...,7697588901109065412,12,0.966816,5009,0.946667,0.956078,0.940392
6386,6437354539535175304,3701546921126142351,7908382889764677758;2011981573061447208,2636395404473730413;7674243629402549267;819036...,8439105518725414462,4918413420989329604,7,6,10,10,...,2018-09-25 10:31:47,4001,"8257512457089702259:7489191371213669983,506055...",7071236685848722720,11,0.968932,5009,0.967286,0.974674,0.962825
6444,1139616076479987692,487358917856820313,7908382889764677758;5799347067982556520,5131280576272319091;2636395404473730413;260867...,7583969141713309215,1019055478500227370,6,4,6,6,...,2018-09-25 00:05:52,4006,5799347067982556520:-1;7908382889764677758:-1,5384266988698356955,13,0.996461,5011,0.986649,0.986225,0.984106
