In [20]:
import torch
from torch.autograd import Variable

In [21]:
import pandas as pd
import math
from tqdm import tqdm

In [22]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
	    # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors,
                                               sparse=True)
	    # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors,
                                               sparse=True)

    def forward(self, user, item):
    	# matrix multiplication
        return (self.user_factors(user)*self.item_factors(item)).sum(1)

    def predict(self, user, item):
        return self.forward(user, item)

In [15]:
articles_df = pd.read_csv('data/shared_articles.csv')
# Use only available pages
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
interactions_df = pd.read_csv('data/users_interactions.csv')
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [17]:
def smooth_user_preference(x):
    """normalized values
    """
    return math.log(1+x, 2)

In [31]:
interactions_df

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0
...,...,...,...,...,...,...,...,...,...
72307,1485190425,LIKE,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR,2.0
72308,1485190425,VIEW,-5813211845057621660,102305705598210278,5527770709392883642,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,1.0
72309,1485190072,VIEW,-1999468346928419252,-9196668942822132778,-8300596454915870873,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,1.0
72310,1485190434,VIEW,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR,1.0


In [18]:

tmp = interactions_df.groupby(['personId', 'contentId']).size()
users_interactions_count_df = tmp.groupby('personId').size()
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5]
users_with_enough_interactions_df = users_with_enough_interactions_df.reset_index()[['personId']]
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df, 
    how = 'right',
    left_on = 'personId',
    right_on = 'personId'
)
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()

In [19]:
interactions_full_df

Unnamed: 0,personId,contentId,eventStrength
0,-9223121837663643404,-8949113594875411859,1.000000
1,-9223121837663643404,-8377626164558006982,1.000000
2,-9223121837663643404,-8208801367848627943,1.000000
3,-9223121837663643404,-8187220755213888616,1.000000
4,-9223121837663643404,-7423191370472335463,3.169925
...,...,...,...
39101,9210530975708218054,8477804012624580461,3.247928
39102,9210530975708218054,8526042588044002101,1.000000
39103,9210530975708218054,8856169137131817223,1.000000
39104,9210530975708218054,8869347744613364434,1.000000


In [23]:
train_size = int(0.7 * len(interactions_full_df))
interactions_train_df = interactions_full_df[:train_size]
interactions_test_df = interactions_full_df[train_size:]

#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(
    index='personId', 
    columns='contentId', 
    values='eventStrength'
).fillna(0)

In [25]:
users_items_pivot_matrix_df

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9184137057748005562,-9176143510534135851,-9172673334835262304,-9171475473795142532,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3472075810981614387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3494915559963121377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3499125955852759846,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3508383192344282071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix = torch.FloatTensor(users_items_pivot_matrix)

n_users = len(users_items_pivot_matrix)
n_items = len(users_items_pivot_matrix[0])

model = MatrixFactorization(n_users, n_items, n_factors=20)
loss_fn = torch.nn.MSELoss() 
optimizer = torch.optim.SGD(model.parameters(),
                            lr=1e-6)




for user, item in zip(users_items_pivot_matrix, users_items_pivot_matrix[0]):
    # get user, item and rating data
    # rating = Variable(torch.FloatTensor([ratings[user, item]]))


    user = Variable(torch.LongTensor([456]))
    print(user)
    # item = Variable(torch.LongTensor([int(item)]))
    # print(user)
    # predict
    # prediction = model(user, item)
    # loss = loss_fn(prediction, rating)    
    # loss.backward()    
    # optimizer.step()

tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor([456])
tensor

In [30]:
model.user_factors

Embedding(771, 20, sparse=True)

In [29]:
users_items_pivot_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 2., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])