In [1]:
import numpy as np 
import random
import pandas as pd 
import scipy.special as special 
from sklearn.model_selection import StratifiedKFold, KFold
from scipy import stats 
from collections import OrderedDict, namedtuple 
from itertools import chain 
from tensorflow.python.keras.initializers import RandomNormal 
from tensorflow.python.keras.layers import * 
from tensorflow.python.keras.regularizers import  l2 

In [None]:
class BayesianSmoothing(object):
    """Fit the parameters of a Beta(alpha, beta) prior to impression/click counts.

    ``alpha`` and ``beta`` start from the values given to the constructor and
    are refined in place by :meth:`update` with a fixed-point iteration built
    from digamma differences.
    """

    def __init__(self, alpha, beta):
        """Store the initial guess for the two Beta parameters."""
        self.alpha = alpha
        self.beta = beta

    def sample(self, alpha, beta, num, imp_upperbound):
        """Generate synthetic impression and click counts.

        Draws ``num`` click rates from Beta(alpha, beta); for each one an
        impression count is drawn uniformly from ``[0, imp_upperbound)`` and
        the click count is ``impressions * rate``. Both are truncated to int.

        Returns:
            (I, C): two lists of ints, impressions and clicks.
        """
        sample = np.random.beta(alpha, beta, num)
        I = []
        C = []
        for clk_rt in sample:
            imp = random.random() * imp_upperbound
            I.append(int(imp))
            C.append(int(imp * clk_rt))
        return I, C

    def update(self, imps, clks, iter_num, epsilon):
        """Refine ``self.alpha`` / ``self.beta`` from observed counts.

        Args:
            imps: array-like of impression counts.
            clks: array-like of click counts, same shape as ``imps``.
            iter_num: maximum number of fixed-point iterations.
            epsilon: stop as soon as both parameters move by less than this.

        Raises:
            ValueError: if ``imps`` and ``clks`` do not have the same shape.

        When the iteration is undefined (for example empty input, where the
        denominator is zero) the parameters become NaN; callers are expected
        to check for that.
        """
        # Convert once so every iteration works on arrays, whatever was passed in.
        imps = np.asarray(imps, dtype=float)
        clks = np.asarray(clks, dtype=float)
        if imps.shape != clks.shape:
            raise ValueError("imps and clks must have the same shape")

        for _ in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
            # The converged step is not stored: the change is already below epsilon.
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, imps, clks, alpha, beta):
        """Return the next ``(alpha, beta)`` of the fixed-point iteration.

        special.digamma is the derivative of the log-gamma function. The sums
        run over all observations at once instead of one element at a time.
        """
        imps = np.asarray(imps, dtype=float)
        clks = np.asarray(clks, dtype=float)

        numerator_alpha = np.sum(special.digamma(clks + alpha) - special.digamma(alpha))
        numerator_beta = np.sum(special.digamma(imps - clks + beta) - special.digamma(beta))
        denominator = np.sum(special.digamma(imps + alpha + beta) - special.digamma(alpha + beta))

        # A zero denominator yields NaN/inf rather than an exception; the
        # floating-point warning is silenced because NaN is the signal here.
        with np.errstate(divide='ignore', invalid='ignore'):
            return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)

In [10]:
"""随机变量（beta 分布）->字典特征"""
def beta_ppf(alpha, beta, dim):
    """Return ``dim + 2`` quantiles of a Beta(alpha, beta) distribution.

    The quantiles are taken at the evenly spaced probabilities
    ``0, 1/(dim+1), ..., 1``. ``ppf`` is the inverse of the cumulative
    distribution function.
    """
    step_count = dim + 1
    probabilities = [k / step_count for k in range(dim + 2)]
    frozen = stats.beta(alpha, beta)
    return frozen.ppf(probabilities)

def beta_prior_feat_2_vec(data, key_col, count_col, sum_col, dim):
    """Turn per-row counts into Beta-posterior vector features.

    A Beta prior is fitted with ``BayesianSmoothing`` on the rows left after
    keeping the last row per ``key_col``; ``count_col`` is passed as the
    impressions and ``sum_col`` as the clicks. If the fit ends in NaN the
    prior falls back to (0, 0). Each row's posterior parameters
    ``(sum + alpha, count - sum + beta)`` are then passed to ``beta_cdf``
    (defined elsewhere) and ``beta_ppf``.

    Three columns are added to ``data`` in place, named ``key_col`` plus
    ``_beta_cdf_value``, ``_beta_ppf_value`` and ``_beta_key``; the last holds
    the indices ``0 .. dim-1`` on every row.

    Returns:
        The values of the three new columns, in the order cdf, ppf, key.
    """
    cdf_name = key_col + '_beta_cdf_value'
    ppf_name = key_col + '_beta_ppf_value'
    index_name = key_col + '_beta_key'

    last_per_key = data.drop_duplicates([key_col], keep='last')
    smoother = BayesianSmoothing(1, 1)
    smoother.update(last_per_key[count_col].values, last_per_key[sum_col].values, 1000, 0.0000000001)
    if np.isnan(smoother.alpha) or np.isnan(smoother.beta):
        smoother.alpha, smoother.beta = 0, 0

    data[cdf_name] = [
        beta_cdf(post_a, post_b, dim)
        for post_a, post_b in zip(data[sum_col] + smoother.alpha,
                                  data[count_col] - data[sum_col] + smoother.beta)
    ]
    data[ppf_name] = [
        beta_ppf(post_a, post_b, dim)
        for post_a, post_b in zip(data[sum_col] + smoother.alpha,
                                  data[count_col] - data[sum_col] + smoother.beta)
    ]
    row_count = data.shape[0]
    data[index_name] = [np.array(list(range(dim))) for _ in range(row_count)]

    return data[cdf_name].values, data[ppf_name].values, data[index_name].values

浮点数->字典特征

In [27]:
# Small 2x4 sample matrix (nested list) used as input by the softmax experiments in the next cells.
tmp = [[1,2,3,4],
     [3,5,7,9.0]]
tmp

[[1, 2, 3, 4], [3, 5, 7, 9.0]]

In [28]:
# np.max without an axis is the maximum over the whole matrix, so every entry is shifted
# by the same constant before exponentiation and the largest entry maps to exp(0) = 1.
np.exp(tmp - np.max(tmp))

array([[3.35462628e-04, 9.11881966e-04, 2.47875218e-03, 6.73794700e-03],
       [2.47875218e-03, 1.83156389e-02, 1.35335283e-01, 1.00000000e+00]])

In [25]:
import torch 

In [29]:
# Reference result: PyTorch softmax over the last dimension, i.e. each row is normalised separately.
torch.nn.functional.softmax(torch.tensor(tmp), dim=-1)

tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0021, 0.0158, 0.1171, 0.8650]])

In [17]:
# Row by row (axis=1): exponentiate after subtracting that row's own maximum,
# then take the reciprocal of each row's sum, which is the softmax scaling factor.
x = np.apply_along_axis(lambda x: np.exp(x-np.max(x)), 1, tmp)
denominator = np.apply_along_axis(lambda x: 1.0 / np.sum(x), 1, x)
denominator 

array([0.64391426, 0.64391426])

In [39]:
# Calling .dot with a scalar scales every element, the same as elementwise multiplication.
np.array([1,2,3,4]).dot(3)  # np.array([1,2,3,4]) * 3 

array([ 3,  6,  9, 12])

In [37]:
def numpy_softmax(x):
    """Numerically stable softmax.

    Args:
        x: array-like. Input with two or more dimensions is normalised along
            axis 1 (each row of a matrix sums to 1); a vector is normalised
            over all of its entries.

    Returns:
        A float ndarray with the same shape as ``x``.
    """
    x = np.asarray(x)
    axis = 1 if x.ndim > 1 else None

    # Subtract the maximum before exponentiating to prevent overflow.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numerator = np.exp(shifted)

    # keepdims leaves the reduced axis in place with length 1, so the scaling
    # factor broadcasts correctly for any rank (the earlier reshape only
    # handled matrices).
    scale = 1.0 / np.sum(numerator, axis=axis, keepdims=True)
    return numerator * scale

def float2vec(float_feat, bar_num = 20, method = 'gravitation'):
    """Spread each float over ``bar_num + 1`` buckets as a weight vector.

    The input is min-max normalised to [0, 1]. Bucket ``i`` sits at
    ``i / (bar_num + 1)``, so the keys run from 0 to ``bar_num / (bar_num + 1)``.
    Each value's weights are derived from its distance to every bucket.

    Args:
        float_feat: 1-D array-like of numbers.
        bar_num: the number of buckets is ``bar_num + 1``.
        method: ``'gravitation'`` weights by inverse squared distance,
            normalised to sum to 1 per row; ``'softmax'`` applies
            ``numpy_softmax`` to the inverse distance. The historical
            spelling ``'sofmax'`` is still accepted.

    Returns:
        (key_array, value_array), both of shape ``(len(float_feat), bar_num + 1)``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method not in ('gravitation', 'softmax', 'sofmax'):
        raise ValueError("Unknown method %r, expected 'gravitation' or 'softmax'" % (method,))

    eps = 0.00001
    shifted = np.asarray(float_feat, dtype=float)
    shifted = shifted - np.min(shifted)
    span = np.max(shifted)
    if span == 0:
        # Constant input: every value normalises to 0 instead of 0/0 = NaN.
        normalized = np.zeros_like(shifted)
    else:
        normalized = shifted / span   # (x - min) / max(x - min)

    # Bucket positions, repeated once per input value.
    bucket_keys = np.arange(bar_num + 1) / (bar_num + 1)
    key_array = np.tile(bucket_keys, (len(normalized), 1))

    # eps is added after abs() so the distance can never be zero.
    distance = np.abs(key_array - normalized[:, None]) + eps

    if method == 'gravitation':
        value_array = 1 / distance ** 2
        value_array = value_array / np.sum(value_array, axis=1, keepdims=True)
    else:
        value_array = numpy_softmax(1 / distance)
    return key_array, value_array

In [33]:
# Min-max normalisation: shift by the minimum, then divide by the maximum of the shifted
# values, which maps the sample onto [0, 1].
float_feat = [0,1,2,3,4,5,6]
(float_feat-np.min(float_feat))*1.0 / np.max(float_feat-np.min(float_feat)) 

array([0.        , 0.16666667, 0.33333333, 0.5       , 0.66666667,
       0.83333333, 1.        ])

In [34]:
# One row per element of float_feat; every row holds the same bucket positions
# i / (bar_num + 1) for i = 0 .. bar_num.
bar_num = 20 
np.array([[i*1.0/(bar_num + 1) for i in range(bar_num + 1)]] * len(float_feat))

array([[0.        , 0.04761905, 0.0952381 , 0.14285714, 0.19047619,
        0.23809524, 0.28571429, 0.33333333, 0.38095238, 0.42857143,
        0.47619048, 0.52380952, 0.57142857, 0.61904762, 0.66666667,
        0.71428571, 0.76190476, 0.80952381, 0.85714286, 0.9047619 ,
        0.95238095],
       [0.        , 0.04761905, 0.0952381 , 0.14285714, 0.19047619,
        0.23809524, 0.28571429, 0.33333333, 0.38095238, 0.42857143,
        0.47619048, 0.52380952, 0.57142857, 0.61904762, 0.66666667,
        0.71428571, 0.76190476, 0.80952381, 0.85714286, 0.9047619 ,
        0.95238095],
       [0.        , 0.04761905, 0.0952381 , 0.14285714, 0.19047619,
        0.23809524, 0.28571429, 0.33333333, 0.38095238, 0.42857143,
        0.47619048, 0.52380952, 0.57142857, 0.61904762, 0.66666667,
        0.71428571, 0.76190476, 0.80952381, 0.85714286, 0.9047619 ,
        0.95238095],
       [0.        , 0.04761905, 0.0952381 , 0.14285714, 0.19047619,
        0.23809524, 0.28571429, 0.33333333, 0.3809523

In [2]:
def get_varlen_multiply_list(embedding_dict, features, varlen_sparse_feature_columns_name_dict):
    """Build one pooled key-times-value vector per (key feature, value feature) pair.

    Args:
        embedding_dict: maps feature name to its embedded sequence tensor.
        features: maps input name to input tensor; may contain a
            ``<key name>_seq_length`` entry for a key feature.
        varlen_sparse_feature_columns_name_dict: maps each key feature column
            to an iterable of value feature columns.

    For every pair the key embedding is multiplied with the value input by
    ``SequenceMultiplyLayer`` and the product is sum-pooled by
    ``SequencePoolingLayer``. When ``features`` has the key's
    ``_seq_length`` entry it is passed to both layers and masking is turned
    off; otherwise both layers run with ``supports_masking=True``.

    Returns:
        List of pooled tensors in iteration order.

    Raises:
        TypeError: if a value feature is neither ``VarLenSparseFeat`` nor
            ``DenseFeat``.
    """
    multiply_vec_list = []
    for key_feature in varlen_sparse_feature_columns_name_dict:
        # Depends only on the key feature, so compute it once per key.
        key_feature_length_name = key_feature.name + '_seq_length'
        for value_feature in varlen_sparse_feature_columns_name_dict[key_feature]:
            if isinstance(value_feature, VarLenSparseFeat):
                value_input = embedding_dict[value_feature.name]
            elif isinstance(value_feature, DenseFeat):
                value_input = features[value_feature.name]
            else:
                raise TypeError("Invalid feature column type,got " + str(type(value_feature)))
            if key_feature_length_name in features:
                varlen_vec = SequenceMultiplyLayer(supports_masking=False)(
                    [embedding_dict[key_feature.name], features[key_feature_length_name], value_input])
                vec = SequencePoolingLayer('sum', supports_masking=False)(
                    [varlen_vec, features[key_feature_length_name]])
            else:
                varlen_vec = SequenceMultiplyLayer(supports_masking=True)(
                    [embedding_dict[key_feature.name], value_input])
                vec = SequencePoolingLayer('sum', supports_masking=True)(varlen_vec)
            multiply_vec_list.append(vec)
    return multiply_vec_list

class SequenceMultiplyLayer(Layer):
    """Mask a key sequence embedding and multiply it elementwise with a value tensor.

    Inputs:
        supports_masking=True:  ``[key_input, value_input]``; the mask comes
            from the Keras mask of the first input.
        supports_masking=False: ``[key_input, key_length_input, value_input]``;
            the mask is built from the sequence lengths.

    A rank-2 ``value_input`` is expanded and tiled along a new last axis to the
    embedding size of ``key_input`` before the multiplication.

    Output: a tensor shaped like ``key_input``.
    """

    def __init__(self, supports_masking, **kwargs):
        super(SequenceMultiplyLayer, self).__init__(**kwargs)
        self.supports_masking = supports_masking

    def build(self, input_shape):
        if not self.supports_masking:
            # The static sequence length of the key input sets the mask width.
            self.seq_len_max = int(input_shape[0][1])
        super(SequenceMultiplyLayer, self).build(
            input_shape)  # Be sure to call this somewhere!

    def call(self, input_list, mask=None, **kwargs):
        if self.supports_masking:
            if mask is None:
                raise ValueError(
                    "When supports_masking=True,input must support masking")
            key_input, value_input = input_list
            mask = tf.cast(mask[0], tf.float32)
            mask = tf.expand_dims(mask, axis=2)
        else:
            key_input, key_length_input, value_input = input_list
            # key_length_input is expected to have shape [batch_size, 1], which
            # makes the mask [batch_size, 1, seq_len_max] before the transpose.
            mask = tf.sequence_mask(key_length_input,
                                    self.seq_len_max, dtype=tf.float32)
            mask = tf.transpose(mask, (0, 2, 1))

        embedding_size = key_input.shape[-1]
        mask = tf.tile(mask, [1, 1, embedding_size])
        key_input *= mask
        # Use the static rank: tf.shape() returns a tensor, and len() of a
        # symbolic tensor is not defined in graph mode.
        if len(value_input.shape) == 2:
            value_input = tf.expand_dims(value_input, axis=2)
            value_input = tf.tile(value_input, [1, 1, embedding_size])
        return tf.multiply(key_input,value_input)

    def compute_output_shape(self, input_shape):
        # Same shape as the key input.
        return input_shape[0]

    def compute_mask(self, inputs, mask):
        # Pass the key input's mask on only when this layer consumes masks.
        if self.supports_masking:
            return mask[0]
        else:
            return None

    def get_config(self, ):
        config = {'supports_masking': self.supports_masking}
        base_config = super(SequenceMultiplyLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

array([0.53416258, 0.01761049, 0.34322868, 0.26593143, 0.093144  ,
       0.67566761, 0.6367115 , 0.13800769, 0.20335424, 0.45804717])

In [44]:
import tensorflow as tf 
# Open an interactive session; the cells below evaluate tensors with .eval().
tf.InteractiveSession()



<tensorflow.python.client.session.InteractiveSession at 0x1624d9793c8>

In [47]:
# Lengths given as a (3, 1) nested list yield a boolean mask of shape (3, 1, 5).
x = tf.sequence_mask([[1], [3], [2]], 5)
print(x.eval())
print(x.shape)
# Swap the last two axes so the mask becomes (3, 5, 1): one flag per sequence position.
x = tf.transpose(x, [0, 2, 1])
print(x.eval(), x.shape)

[[[ True False False False False]]

 [[ True  True  True False False]]

 [[ True  True False False False]]]
(3, 1, 5)
[[[ True]
  [False]
  [False]
  [False]
  [False]]

 [[ True]
  [ True]
  [ True]
  [False]
  [False]]

 [[ True]
  [ True]
  [False]
  [False]
  [False]]] (3, 5, 1)
