## Dataset source
https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop

In [1]:
import torch

In [30]:
import pandas as pd
import math
from tqdm import tqdm
# from sklearn.model_selection import train_test_split

In [3]:
articles_df = pd.read_csv('data/shared_articles.csv')
# Use only available pages
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']

In [4]:
interactions_df = pd.read_csv('data/users_interactions.csv')

In [5]:
# personId and contentId are in the format of numpy.int64
interactions_df

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,
...,...,...,...,...,...,...,...,...
72307,1485190425,LIKE,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR
72308,1485190425,VIEW,-5813211845057621660,102305705598210278,5527770709392883642,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
72309,1485190072,VIEW,-1999468346928419252,-9196668942822132778,-8300596454915870873,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR
72310,1485190434,VIEW,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR


In [6]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [7]:
interactions_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0


In [8]:
tmp = interactions_df.groupby(['personId', 'contentId']).size()
users_interactions_count_df = tmp.groupby('personId').size()

print('# users: %d' % len(users_interactions_count_df))

# users: 1895


In [9]:
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5]
users_with_enough_interactions_df = users_with_enough_interactions_df.reset_index()[['personId']]

print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users with at least 5 interactions: 1140


In [10]:
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df, 
    how = 'right',
    left_on = 'personId',
    right_on = 'personId'
)
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions from users with at least 5 interactions: 69868


In [11]:
def smooth_user_preference(x):
    """normalized values
    """
    return math.log(1+x, 2)

In [12]:
interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \
                    .apply(smooth_user_preference).reset_index()

print('# of unique user/item interactions: %d' % len(interactions_full_df))

# of unique user/item interactions: 39106


In [13]:
# interactions_train_df, interactions_test_df = train_test_split(
#     interactions_full_df,
#     stratify=interactions_full_df['personId'], 
#     test_size=0.20,
#     random_state=42
# )

train_size = int(0.7 * len(interactions_full_df))
interactions_train_df = interactions_full_df[:train_size]
interactions_test_df = interactions_full_df[train_size:]

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 27374
# interactions on Test set: 11732


In [14]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(
    index='personId', 
    columns='contentId', 
    values='eventStrength'
).fillna(0)

In [15]:
users_items_pivot_matrix_df

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9184137057748005562,-9176143510534135851,-9172673334835262304,-9171475473795142532,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3472075810981614387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3494915559963121377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3499125955852759846,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3508383192344282071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
users_items_pivot_matrix = torch.FloatTensor(users_items_pivot_matrix)

In [18]:
users_items_pivot_matrix

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 2., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [20]:
n_users = len(users_items_pivot_matrix)
n_items = len(users_items_pivot_matrix[0])
n_factors = 15

In [21]:
class MF_model(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors):
        super().__init__()
        self.user_factors = torch.nn.Parameter(
            torch.rand(n_users, n_factors, requires_grad=True)
        )
        self.item_factors = torch.nn.Parameter(
            torch.rand(n_factors, n_items, requires_grad=True)
        )

    def forward(self):
        return torch.mm(self.user_factors, self.item_factors)

In [22]:
model = MF_model(n_users, n_items, n_factors)
loss_f = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # learning rate

In [25]:

# model = model.cuda()
# users_items_pivot_matrix = users_items_pivot_matrix.cuda()

In [31]:
for epoch in tqdm(range(100000)):
    optimizer.zero_grad()
    output = model()
    loss = loss_f(output, users_items_pivot_matrix)
    if epoch % 10000 == 0:
        print(f"Epoch: {epoch}, Loss: {loss}")
    loss.backward()
    optimizer.step()

  0%|          | 41/100000 [00:00<08:01, 207.76it/s]

Epoch: 0, Loss: 12.509073257446289


 10%|█         | 10041/100000 [00:41<06:07, 244.96it/s]

Epoch: 10000, Loss: 9.961349487304688


 20%|██        | 20048/100000 [01:23<05:31, 241.46it/s]

Epoch: 20000, Loss: 8.050006866455078


 30%|███       | 30031/100000 [02:05<04:49, 241.46it/s]

Epoch: 30000, Loss: 6.583823204040527


 40%|████      | 40048/100000 [02:46<04:03, 245.97it/s]

Epoch: 40000, Loss: 5.438353538513184


 50%|█████     | 50036/100000 [03:27<03:24, 244.30it/s]

Epoch: 50000, Loss: 4.529758930206299


 60%|██████    | 60036/100000 [04:08<02:42, 245.93it/s]

Epoch: 60000, Loss: 3.799837589263916


 70%|███████   | 70036/100000 [04:49<02:03, 243.11it/s]

Epoch: 70000, Loss: 3.2071495056152344


 80%|████████  | 80038/100000 [05:30<01:27, 229.06it/s]

Epoch: 80000, Loss: 2.7215118408203125


 90%|█████████ | 90030/100000 [06:12<00:41, 240.17it/s]

Epoch: 90000, Loss: 2.3205196857452393


100%|██████████| 100000/100000 [06:54<00:00, 241.13it/s]


In [32]:
print(model.user_factors.shape)


torch.Size([771, 15])

In [28]:
model.item_factors.shape

torch.Size([15, 2926])