In [116]:
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_curve, accuracy_score, auc
np.set_printoptions(suppress=True)

In [27]:
path = "F:/for learn/Python/fm_python-master/"

In [28]:
train_data = pd.read_table(path + "diabetes_train.txt", header=None, delimiter=",").values
test_data = pd.read_table(path + "diabetes_test.txt", header=None, delimiter=",").values

In [29]:
feature_num = 8
X_train = train_data[:, :feature_num]
y_train = train_data[:, feature_num]
X_test = test_data[:, :feature_num]
y_test = test_data[:, feature_num]

In [51]:
ss = StandardScaler(with_mean=True, with_std=True)
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [151]:
# Plain SGD Logistic Regression
class logistic_regression():
    def __init__(self, max_iter=100, alpha=0.01):
        self.max_iter = max_iter
        self.alpha = alpha
        
    def sigmoid(self, x):
        sign = 1 if x > 0 else -1
        x = sign * 50 if abs(x) > 50 else x
        return np.asscalar(1 / (1 + np.exp(-x)))    
    
        
    def fit(self, dat, labels):
        dat = np.mat(np.c_[np.ones((dat.shape[0], 1)), dat])
        m, n = dat.shape  # m * n
        # 参数初始化
        W = np.mat(np.ones((n, 1)))  # n * 1

        i = 0
        while i < self.max_iter:
            dataIndex = np.arange(m).tolist()
            for j in range(m):
                randIndex = math.floor(np.random.randint(0, len(dataIndex)))
                x = dat[randIndex, :]  # 1 * n
                y = labels[randIndex]
                yHat = self.sigmoid(x * W)
                err = yHat - y
                #print("iter %d, sample %d, err: %.4f" % (i, randIndex, err))
                W -= self.alpha * err * x.T

                del dataIndex[randIndex]
            i += 1

        self.w0 = W[0, 0]
        self.W = W[1::]
        return self  
    
    def predict_proba(self, dat):
        n = self.W.shape[0]
        dat = np.mat(dat).reshape(-1, n)
        Z = dat * self.W + self.w0
        pos_proba = np.array([self.sigmoid(z) for z in Z]).reshape(-1, 1)
        neg_proba = 1 - pos_proba
        y_pred_proba = np.c_[neg_proba, pos_proba]
        return y_pred_proba
    
    def predict(self, dat):
        y_pred_proba = self.predict_proba(dat)
        y_pred = y_pred_proba[:, 1] >= y_pred_proba[:, 0]
        return y_pred.astype(int)

In [194]:
# User Defined FM with lr
class FMRegression():
    def __init__(self, max_iter=100, alpha=0.01, k=5):
        self.max_iter = max_iter
        self.alpha = alpha
        self.k = k
    
    def sigmoid(self, x):
        sign = 1 if x>0 else -1
        x = sign*50 if abs(x)>50 else x
        return np.asscalar(1 / (1 + np.exp(-x))) 
    
    def fit(self, dat, labels):
        dat = np.mat(dat) # m * n
        m, n = dat.shape
        # 参数初始化
        self.W = np.mat(np.ones((n, 1)))  # n * 1
        self.V = np.mat(np.random.normal(loc=0.0, scale=0.5, size=(n, self.k)))  # n * k
        self.w0 = 1.0

        i = 0
        while i < self.max_iter:
            dataIndex = np.arange(m).tolist()
            for j in range(m):
                randIndex = math.floor(np.random.randint(0, len(dataIndex)))
                x = dat[randIndex, :]  # 1 * n
                y = labels[randIndex]
                inter_1 = np.power(x * self.V, 2)  # 1 * k
                inter_2 = np.power(x, 2) * np.power(self.V, 2)  # 1 * k
                interaction = 0.5 * np.sum(inter_1 - inter_2, axis=1) 
                yHat = self.sigmoid(self.w0 + x * self.W + interaction)
                err = yHat - y
                if abs(err) <= 1e-8:
                    #print("iter: %d, sample: %d. Err too small" % (i, j))
                    continue
                self.w0 -= self.alpha * err
                self.W -= self.alpha * x.T * err 
                for s in range(n):
                    for l in range(self.k):
                        dv = x[0, s] * (np.asscalar(x * self.V[:, l]) - self.V[s, l] * x[0, s])
                        self.V[s, l] -= self.alpha * err * dv

                del dataIndex[randIndex]

            i += 1
            return self
    def predict_proba(self, dat):
        n = self.W.shape[0]
        dat = np.mat(dat).reshape(-1, n)  # m * n
        inter_1 = np.power(dat * self.V, 2)  # m * k
        inter_2 = np.power(dat, 2) * np.power(self.V, 2)  # m * k
        interaction = 0.5 * np.sum(inter_1 - inter_2, axis=1)  # m * 1        
        Z = dat * self.W + self.w0 + interaction
        pos_proba = np.array([self.sigmoid(z) for z in Z]).reshape(-1, 1)
        neg_proba = 1 - pos_proba
        y_pred_proba = np.c_[neg_proba, pos_proba]
        return y_pred_proba
    
    def predict(self, dat):
        y_pred_proba = self.predict_proba(dat)
        y_pred = y_pred_proba[:, 1] >= y_pred_proba[:, 0]
        return y_pred.astype(int)

In [196]:
lr = logistic_regression(max_iter=100, alpha=0.01)
fm_lr = FMRegression(max_iter=100, alpha=0.01, k=5)
sk_lr = LogisticRegression(max_iter=100, random_state=0)
sgd_lr = SGDClassifier(alpha=0.01, max_iter=100, loss='log', random_state=0)

In [127]:
def evaluation(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_test_proba = model.predict_proba(X_test)
    y_test_pred = model.predict(X_test)
    y_train_proba = model.predict_proba(X_train)
    y_train_pred = model.predict(X_train)
    
    fpr, tpr, thresh = roc_curve(y_true=y_test, y_score=y_test_proba[:, 1], pos_label=1)
    print("the test auc: ",auc(fpr, tpr))

    fpr, tpr, thresh = roc_curve(y_true=y_train, y_score=y_train_proba[:, 1], pos_label=1)
    print("the train auc: ",auc(fpr, tpr))

    print("the test acc:", accuracy_score(y_true=y_test, y_pred=y_test_pred))
    print("the train acc:", accuracy_score(y_true=y_train, y_pred=y_train_pred))

In [199]:
for model in [lr, fm_lr, sk_lr, sgd_lr]:
    print(model.__class__.__name__)
    evaluation(model, X_train_ss, X_test_ss, y_train, y_test)
    print("\n")

logistic_regression
the test auc:  0.874009711219
the train auc:  0.811441702951
the test acc: 0.824626865672
the train acc: 0.736


FMRegression
the test auc:  0.802517250192
the train auc:  0.768245905038
the test acc: 0.720149253731
the train acc: 0.69


LogisticRegression
the test auc:  0.875095834398
the train auc:  0.820530098832
the test acc: 0.809701492537
the train acc: 0.77


SGDClassifier
the test auc:  0.877012522361
the train auc:  0.820633768747
the test acc: 0.809701492537
the train acc: 0.768


