In [1]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm


In [None]:
def loadData(filename,path="ml-100k/"):
    """Read a tab-separated ratings file into FM-ready records.

    Each non-blank line must hold four tab-separated fields:
    user, movie id, rating, timestamp. The timestamp is parsed but unused.

    Parameters
    ----------
    filename : str
        Name of the ratings file.
    path : str
        Prefix concatenated directly in front of ``filename`` (so it should
        end with a path separator).

    Returns
    -------
    tuple
        ``(data, y, users, items)`` where ``data`` is a list of
        ``{"user_id": ..., "movie_id": ...}`` dicts (string values), ``y`` is
        a float ``np.ndarray`` of ratings, and ``users`` / ``items`` are the
        sets of distinct user and movie ids seen (as strings).

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four fields or its rating
        is not a valid float.
    """
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        for line in f:
            # Drop the line terminator so it does not end up inside a field.
            line = line.rstrip("\r\n")
            # Blank lines (e.g. a trailing empty line) would otherwise break
            # the 4-way unpack below.
            if not line.strip():
                continue
            (user,movieid,rating,ts)=line.split('\t')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)


In [None]:
# Train on the u1.base split and evaluate on its companion u1.test split.
# u.data is the complete rating set and already contains every u1.test row,
# so fitting on it would leak the test ratings into the model.
(train_data, y_train, train_users, train_items) = loadData("u1.base")
(test_data, y_test, test_users, test_items) = loadData("u1.test")

# Fit the feature mapping on the training records only, then reuse that same
# mapping for the test records so both matrices share one column layout.
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

# Build and train a Factorization Machine
fm = pylibfm.FM(num_factors=2, num_iter=100, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")

fm.fit(X_train,y_train)

In [None]:
# Evaluate the fitted model on the test ratings
from sklearn.metrics import mean_squared_error

preds = fm.predict(X_test)
test_mse = mean_squared_error(y_test, preds)
print("FM MSE: %.4f" % test_mse)

# Quick look at the training inputs: the raw targets and the feature dtype.
print(y_train)
print(X_train.dtype)

In [None]:
#Testing small data
from scipy import sparse
from sklearn.metrics import mean_squared_error

# Plain 2-D ndarray (np.matrix is discouraged by NumPy); one row per rating
# event, columns grouped as shown in the header comments below.
features = np.array([
#     Users  |     Movies     |    Movie Ratings   | Time | Last Movies Rated
#    A  B  C | TI  NH  SW  ST | TI   NH   SW   ST  |      | TI  NH  SW  ST
    [1, 0, 0,  1,  0,  0,  0,   0.3, 0.3, 0.3, 0,     13,   0,  0,  0,  0 ],
    [1, 0, 0,  0,  1,  0,  0,   0.3, 0.3, 0.3, 0,     14,   1,  0,  0,  0 ],
    [1, 0, 0,  0,  0,  1,  0,   0.3, 0.3, 0.3, 0,     16,   0,  1,  0,  0 ],
    [0, 1, 0,  0,  0,  1,  0,   0,   0,   0.5, 0.5,   5,    0,  0,  0,  0 ],
    [0, 1, 0,  0,  0,  0,  1,   0,   0,   0.5, 0.5,   8,    0,  0,  1,  0 ],
    [0, 0, 1,  1,  0,  0,  0,   0.5, 0,   0.5, 0,     9,    0,  0,  0,  0 ],
    [0, 0, 1,  0,  0,  1,  0,   0.5, 0,   0.5, 0,     12,   1,  0,  0,  0 ]
])
target = [5., 3., 1., 4., 5., 1., 5.]

# Convert straight to a float CSR matrix; a separate dense astype() pass is
# unnecessary because csr_matrix handles the dtype conversion itself.
X_train = sparse.csr_matrix(features,dtype=float)
y_train = target
fm = pylibfm.FM(num_factors=100, num_iter=10000, verbose=False, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")

fm.fit(X_train,y_train)
#predict:
# Predictions are made on the same rows the model was fit on, so the MSE
# below is an in-sample fit check rather than a generalization estimate.
preds = fm.predict(X_train)
print("FM MSE: %.4f" % mean_squared_error(y_train,preds))
print(y_train)
print(preds)


In [None]:
# Toy records mixing string-valued fields (user, item) with a numeric one (age).
train = [
    {"user": user, "item": item, "age": age}
    for user, item, age in (
        ("1", "5", 19),
        ("2", "43", 33),
        ("3", "20", 55),
        ("4", "10", 20),
    )
]

# Vectorize the records and report the dtype of the resulting matrix.
v = DictVectorizer()
X = v.fit_transform(train)
print(X.dtype)

In [2]:
def unison_shuffled_copies(a, b, seed=None):
    """Shuffle ``a`` and ``b`` with one shared random row permutation.

    Parameters
    ----------
    a : object with ``.shape`` that supports integer-array row indexing
        (e.g. an ndarray).
    b : sequence with ``len()`` that supports integer-array indexing.
    seed : int or None, optional
        When given, the permutation is drawn from a private
        ``np.random.RandomState(seed)`` so the shuffle is reproducible and
        the global RNG is left untouched. When ``None`` (default) the
        permutation comes from NumPy's global RNG, as before.

    Returns
    -------
    tuple
        ``(a[p], b[p])`` for the same permutation ``p``, so row ``i`` of the
        first result still corresponds to element ``i`` of the second.

    Raises
    ------
    AssertionError
        If ``a`` and ``b`` do not have the same number of rows.
    """
    n_rows = a.shape[0]
    assert n_rows == len(b), "a and b must have the same number of rows"
    if seed is None:
        # Default path: global RNG, so np.random.seed() still controls it.
        p = np.random.permutation(n_rows)
    else:
        p = np.random.RandomState(seed).permutation(n_rows)
    return a[p], b[p]

In [3]:
from scipy import sparse

# Pre-built sparse feature matrix, loaded from disk.
X_train = sparse.load_npz("ml_latest_small_formated.npz")

# Targets come from the second-to-last column of the ratings CSV
# (presumably the rating column -- confirm against the file's header row).
filename = "ml-latest-small/ratings.csv"
data = np.genfromtxt(filename, skip_header=1, delimiter=",")
y_train = data[:, -2]

# Shuffle features and targets with one shared permutation, then keep the
# first 10000 shuffled rows as the training batch.
X_train, y_train = unison_shuffled_copies(X_train, y_train)
X_train_batch = X_train[:10000, :]
y_train_batch = y_train[:10000]
print(y_train_batch.shape)
print(X_train_batch.shape)

fm = pylibfm.FM(
    num_factors=10,
    num_iter=10,
    verbose=True,
    task="regression",
    initial_learning_rate=0.001,
    learning_rate_schedule="optimal",
)

fm.fit(X_train_batch, y_train_batch)

(10000,)
(10000, 29783)
Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.67263
-- Epoch 2
Training MSE: 0.50478
-- Epoch 3
Training MSE: 0.48412
-- Epoch 4
Training MSE: 0.46780
-- Epoch 5
Training MSE: 0.45479
-- Epoch 6
Training MSE: 0.44300
-- Epoch 7
Training MSE: 0.43258
-- Epoch 8
Training MSE: 0.42296
-- Epoch 9
Training MSE: 0.41442
-- Epoch 10
Training MSE: 0.40652


In [4]:
from sklearn.metrics import mean_squared_error

# Evaluate on a 1000-row window of the shuffled data that lies outside the
# first 10000 rows used for fitting.
eval_rows = slice(60000, 61000)
X_test = X_train[eval_rows]
y_test = y_train[eval_rows]
preds = fm.predict(X_test)

eval_mse = mean_squared_error(y_test, preds)
print("FM MSE: %.4f" % eval_mse)

# First ten true ratings next to the first ten predictions.
print(y_test[:10])
print(preds[:10])

FM MSE: 0.9416
[3.  4.5 4.5 3.5 4.  2.  5.  4.5 4.5 4. ]
[3.67679184 3.69814232 3.28588304 3.46021847 3.27443678 3.27258315
 3.4754188  3.44172138 3.81021882 3.51080085]
