In [165]:
import sklearn
import pandas as pd
import scipy
import math
import numpy as np
import gzip
import json

from sklearn.metrics import mean_squared_error, log_loss
from collections import defaultdict

In [None]:
# deepFM = FM + DNN
n = 5 # number of samples
d = m = 10 # number of features

W = [0] * m # w_i -- scalar for each feature
V = [[0] * m for i in range(m)] # V_i for each feature is interaction with others

y = sigmoid(y_fm + y_dnn) # result

In [194]:
# FM + SGD
# https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf

# y_fm = np.inner(W, X)
# for i in range(d):
#     for j in range(i + 1, d):
#         y_fm += np.inner(V[i], V[j]) * X[i] * X[j]

def sigmoid(x):
    return 1 / (1 + math.exp(-x))        

def dsigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))
        
class FM(object):
    # l = lamda = regularization paramets
    # etha = learning rate
    # sigma = initialization
    # p = number of features
    # n = number of samples
    # k = the dimensionality of the factorization
    # size(V) = p * k
    def __init__(self, l=0.01, etha=0.01, sigma=0.001, k=5):
        self.l = l
        self.etha = etha
        self.sigma = sigma
        self.k = 0
        self.w0 = 0
        self.W = None
        self.V = None
    
    def fit(self, X, Y, p, n_iter=20):
        self.n = X.shape[0]
        self.p = p
        self.W = np.zeros(self.p)
        self.V = self.sigma * np.random.randn(self.p, self.k)
        
        for iteration in range(n_iter):
            for x, y in zip(X, Y):
                pred = self.predict_sample(x)
                # eq 19 + eq 5
                dl_w0 = (sigmoid(pred * y) - 1) * y
                self.w0 -= self.etha * (dl_w0 + 2 * self.l * self.w0)

#                 for i in range(self.p):
#                     if x[i] == 0:
#                         continue
                for i, fval in x:
                    dl_wi = dl_w0 * fval
                    self.W[i] -= self.etha * (dl_wi + 2 * self.l * self.W[i])
    
                    for f in range(self.k):
                        v_sum = sum([x[j] * self.V[j, f] for j in range(p)]) - self.V[i, f] * fval
                        dl_v = dl_wi * v_sum
                        self.V[i, f] -= self.etha * (dl_v + 2 * self.l * self.V[i, f])        
            
        return self
        
    def predict_sample(self, x):
        res = self.w0
        for j, fval in x:
            res += self.W[j] * fval
        for f in range(self.k):
            s1 = 0
            s2 = 0
            for j, fval in x:
                s1 += self.V[j, f] * fval
                s2 += (self.V[j, f] * fval) ** 2
            res += 0.5 * (s1 ** 2 - s2)
            
        return res
    
    def predict(self, X):
        return np.array([self.predict_sample(x) for x in X])
    
    def scale(self, Y):
        return np.array([sigmoid(y) for y in Y])
    

In [210]:
# deep component
# FFN
# https://habr.com/post/198268/,/

# for i in range(h):
#     a = sigma(W * a + b)
# y_dnn = W * a + b


class DNN(object):
    
    def __init__(self, alpha=0.5, p=50):
        self.alpha = alpha # Скорость обучения
        self.p = p # Количество скрытых z нейронов
        
    def backpropagate(self, x_in, y):
        t = [0] * self.m
        t[y] = 1
        
        x = [0] * self.n
        for i, fval in x_in:
            x[i] = fval
            
        # step 4. скрытый нейрон z
        z_in = [np.dot(self.V[:, i], x) + self.V0[i] for i in range(self.p)]
        z = [sigmoid(a) for a in z_in]

        # step 5. выходной нейрон y
        y_in = [np.dot(self.W[:, i], z_in) + self.W0[i] for i in range(self.m)]
        y = [sigmoid(a) for a in y_in]
        
        # backpropagation
        # step 6. evaluate y error
        sigma = np.array([(t[i] - y[i]) * dsigmoid(y_in[i]) for i in range(self.m)])
        delta_W = np.array([[self.alpha * sigma[k] * z[j] for k in range(self.m)] for j in range(self.p)])
        delta_W0 = self.alpha * sigma

        # step 7. evaluate z error
        sigma_in = [np.dot(sigma, self.W[j]) for j in range(self.p)]
        sigma_j = np.array([sigma_in[j] * dsigmoid(z_in[j]) for j in range(self.p)])
        delta_V = np.array([[self.alpha * sigma_j[j] * x[i] for j in range(self.p)] for i in range(self.n)])
        delta_V0 = self.alpha * sigma_j

        # Смещение скрытого нейрона j
        self.V += delta_V
        self.V0 += delta_V0
        # Смещение нейрона на выходе
        self.W += delta_W
        self.W0 += delta_W0
        
        return mean_squared_error(t, y)

    def fit(self, X, Y, n, m=2, n_iter=3):
        self.n = n # количество признаков
        self.m = m # количество выходных y нейронов
        # Смещение скрытого нейрона j
        self.V = np.random.random((self.n, self.p)) - 0.5
        self.V0 = np.random.random(self.p) - 0.5
        # Смещение нейрона на выходе
        self.W = np.random.random((self.p, self.m)) - 0.5
        self.W0 = np.random.random(self.m) - 0.5

        for iteration in range(n_iter):
            print(iteration)
            error = 0
            for x, y in zip(X, Y):
                error += self.backpropagate(x, y)
#             print(error / len(Y))
        
    def predict_sample(self, x_in):
        x = [0] * self.n
        for i, fval in x_in:
            x[i] = fval
        z_in = [np.dot(self.V[:, i], x) + self.V0[i] for i in range(self.p)]
        y = [sigmoid(np.dot(self.W[:, i], z_in) + self.W0[i]) for i in range(self.m)]
        
        return y[1]

        
    def predict(self, X):
        return np.array([self.predict_sample(x) for x in X])

        

In [215]:
# DeepFM

class DeepFM(object):
    
    def __init__(self, feature_count):
        self.feature_count = feature_count
        self.fm = FM()
        self.dnn = DNN()
        
    def fit(self, X, Y):
        print('[fit] FM')
        self.fm.fit(X, Y, self.feature_count)
        print('[fit] DNN')
        self.dnn.fit(X, Y, self.feature_count)
        print('[fit] Done')
        
    def predict_sample(self, x):
        y_fm = self.fm.predict_sample(x)
        
        y_dnn = self.dnn.predict_sample(x)
        
        return sigmoid(y_fm + y_dnn)
        
    def predict(self, X):
        return np.array([self.predict_sample(x) for x in X])

In [198]:
# test
# dnn = DNN()
# fm = FM()
deepFM = DeepFM(5)
X_train = np.array([[(2, 2), (3, 0), (4, 1), (0, 2)],
                   [(1, 1), (2, 2), (3, 1), (4, 1), (0, 2)],
                   [(1, 1), (2, 1), (3, 2), (4, 0), (0, 2)],
                   [(1, 1), (2, 1), (3, 0), (4, 0), (0, 2)],
                   [(1, 0), (2, 2), (3, 1), (4, 1)],
                   [(1, 0), (2, 2), (3, 2), (4, 1), (0, 2)],
                   [(1, 0), (2, 2), (3, 1), (0, 2)]])
Y_train = np.array([1, 1, 0, 0, 1, 1, 1])
# fm.fit(X_train, Y_train)
# dnn.fit(X_train, Y_train)
deepFM.fit(X_train, Y_train)

print(deepFM.predict(X_train))

[ 0.95266455  0.95951973  0.87693838  0.82977878  0.92653548  0.96863567
  0.94904921]


In [199]:
# read data (avazu dataset)
# https://www.kaggle.com/c/avazu-ctr-prediction/data
# train ?
# test 4,5 млн сэмплов
sample_path = 'dataset/sampleSubmission.gz'
train_path = 'dataset/train.gz'
test_path = 'dataset/test.gz'

with gzip.open(sample_path, 'rt') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        print(row)
        break

{'id': '10000174058809263569', 'click': '0.5'}


In [217]:
# Prepare features
numerical_features = ['banner_pos', 'hour', 'id']
categorical_features = ['device_type', 'device_conn_type']
text_features = ['app_category', 'app_domain', 'app_id', 
                 'device_id', 'device_ip', 'device_model', 
                 'site_category', 'site_domain', 'site_id']

def prepare_sample(sample):
    global feature_to_index, index_to_feature
    
    result = []
    click = int(sample.get('click', -1))
    for fname, fvalue in sample.items():
        if fname in numerical_features:
            fvalue = math.log(int(fvalue) + 1)
        elif fname in text_features:
            fname = "{}_{}".format(fname, abs(hash(fvalue)) % (10 ** 4))
            fvalue = 1
        else:
            fname = "{}_{}".format(fname, fvalue)
            fvalue = 1
        if feature_to_index.get(fname, False):
            result.append((feature_to_index[fname], fvalue))
        
    return (result, click)

In [183]:
# get features
feature_to_index = dict()
index_to_feature = dict()
with open('dataset/features', 'r') as in_file:
    for feature in in_file:
        num = len(feature_to_index)
        feature_to_index[feature.strip()] = num
        index_to_feature[num] = feature.strip()
print(len(feature_to_index))

34795


In [174]:
# save features
with open('dataset/features', 'w') as out:
    for x, y in features_count.items():
        print(x, file=out)

In [201]:
train_path = 'dataset/train.gz'

train_dataset = []
with gzip.open(train_path, 'rt') as f:
    reader = csv.DictReader(f, delimiter=',')
    for num, row in enumerate(reader):
        if num % 1000000 == 0:
            print(num)
        if num == 100:
            break
        sample = prepare_sample(row)
        train_dataset.append(sample)
#         print(sample)

0


In [None]:
# learning model
features_count = len(feature_to_index)
deepFM = DeepFM(features_count)
X_train, Y_train = zip(*train_dataset)
X_train = np.array(X_train)
Y_train = np.array(Y_train)
deepFM.fit(X_train, Y_train)

print(deepFM.predict(X_train))

In [None]:
test_path = 'dataset/test.gz'

test_dataset = []
with gzip.open(test_path, 'rt') as f:
    reader = csv.DictReader(f, delimiter=',')
    for num, row in enumerate(reader):
        if num % 1000000 == 0:
            print(num)
        if num == 100:
            break
        sample = prepare_sample(row)
        test_dataset.append(sample)
#         print(sample)
X_test, Y_test = zip(*train_dataset)
deepFM.predict(X_test)