In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import math
from util import id2cat, get_cat2id
from tqdm import tqdm
# from sklearn.model_selection import train_test_split

In [2]:
USER_KEY = "personId"
ITEM_KEY = "contentId"
RATE_KEY = "eventStrength"

# preprocess
def smooth_user_preference(x):
    """normalized values
    """
    return math.log(1+x, 2)

interactions_df = pd.read_csv('data/users_interactions.csv')
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interactions_df[RATE_KEY] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

tmp = interactions_df.groupby([USER_KEY, ITEM_KEY]).size()
users_interactions_count_df = tmp.groupby(USER_KEY).size()
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5]
users_with_enough_interactions_df = users_with_enough_interactions_df.reset_index()[[USER_KEY]]
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df, 
    how = 'right',
    left_on = USER_KEY,
    right_on = USER_KEY
)
interactions_full_df = interactions_from_selected_users_df \
                    .groupby([USER_KEY, ITEM_KEY])[RATE_KEY].sum() \
                    .apply(smooth_user_preference).reset_index()

u2idx, u_cat = get_cat2id(interactions_full_df[USER_KEY])
i2idx, i_cat = get_cat2id(interactions_full_df[ITEM_KEY])
interactions_full_df[USER_KEY] = u_cat
interactions_full_df[ITEM_KEY] = i_cat

train_size = int(0.7 * len(interactions_full_df))
interactions_train_df = interactions_full_df[:train_size]
interactions_test_df = interactions_full_df[train_size:]

In [3]:
interactions_train_df

Unnamed: 0,personId,contentId,eventStrength
0,0,65,1.000000
1,0,160,1.000000
2,0,188,1.000000
3,0,196,1.000000
4,0,313,3.169925
...,...,...,...
27369,770,575,1.000000
27370,770,604,1.000000
27371,770,611,1.584963
27372,770,774,1.584963


In [4]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=1000):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # initializing our matrices with a positive number generally will yield better results
        self.user_emb.weight.data.uniform_(0, 0.5)
        self.item_emb.weight.data.uniform_(0, 0.5)

    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)  # taking the dot product

In [5]:
num_users = len(interactions_train_df[USER_KEY])
num_items = len(interactions_train_df[ITEM_KEY])
print(num_users)
print(num_items)
model = MF(num_users, num_items, emb_size=100)

# train_df, valid_df = train_test_split(dataset, test_size=0.2)
# resetting indices to avoid indexing errors
train_df = interactions_train_df.reset_index(drop=True)
test_df = interactions_test_df.reset_index(drop=True)

27374
27374


In [7]:
def train_epocs(model, epochs=100000, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for epoch in tqdm(range(epochs)):
        user_tensor = torch.LongTensor(train_df[USER_KEY].values)
        item_tensor = torch.LongTensor(train_df[ITEM_KEY].values)
        ratings = torch.FloatTensor(train_df[RATE_KEY].values)
        # print(torch.max(item_tensor))
        # print(torch.min(item_tensor))
        
        y_hat = model(user_tensor, item_tensor)
        
        loss = F.mse_loss(y_hat, ratings)
        if epoch % 10000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss}")

        optimizer.zero_grad()  # reset gradient
        loss.backward()
        optimizer.step()
        
        # print(loss.item())
        
    test(model)

def test(model):
    model.eval()
    user_tensor = torch.LongTensor(test_df[USER_KEY].values)
    item_tensor = torch.LongTensor(test_df[ITEM_KEY].values)
    ratings = torch.FloatTensor(test_df[RATE_KEY].values)
    y_hat = model(user_tensor, item_tensor)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [8]:
train_epocs(model)

  0%|          | 2/100000 [00:00<1:39:56, 16.67it/s]

Epoch: 0, Loss: 23.870031356811523


 10%|█         | 10002/100000 [08:17<1:13:57, 20.28it/s]

Epoch: 10000, Loss: 2.4001762383152325e-11


 20%|██        | 20002/100000 [16:36<1:05:02, 20.50it/s]

Epoch: 20000, Loss: 1.4133793513337878e-07


 30%|███       | 30004/100000 [24:48<56:37, 20.60it/s]

Epoch: 30000, Loss: 3.809713007285609e-07


 40%|████      | 40004/100000 [32:47<47:58, 20.84it/s]

Epoch: 40000, Loss: 6.9762813836860005e-06


 50%|█████     | 50004/100000 [40:47<39:56, 20.86it/s]

Epoch: 50000, Loss: 4.428964075486874e-06


 60%|██████    | 60003/100000 [48:44<31:48, 20.96it/s]

Epoch: 60000, Loss: 4.258533863321645e-06


 70%|███████   | 70004/100000 [56:41<23:53, 20.93it/s]

Epoch: 70000, Loss: 8.119158337649424e-06


 80%|████████  | 80003/100000 [1:04:38<16:08, 20.65it/s]

Epoch: 80000, Loss: 1.3935922424934688e-06


 90%|█████████ | 90002/100000 [1:12:38<07:58, 20.89it/s]

Epoch: 90000, Loss: 3.4493859857320786e-05


100%|██████████| 100000/100000 [1:20:37<00:00, 20.67it/s]

test loss 1.950 





In [16]:
user = torch.tensor([10])
items = torch.tensor(train_df[ITEM_KEY].unique().tolist())
predictions = model(user, items).tolist()
print(len(predictions))
print(len(items))

rdict = {
    "items": items.numpy().tolist(),
    "recStr": predictions
}

2890
2890


In [23]:
pd.DataFrame(rdict).sort_values(by=['recStr'], ascending=False)

Unnamed: 0,items,recStr
177,1253,1.997737
175,537,1.809013
180,2293,1.808277
176,1165,1.807465
178,1325,1.806759
...,...,...
418,2083,-1.754417
1830,1000,-1.792570
1332,2269,-2.098755
383,1773,-2.133947


In [5]:
import os 
filename = os.path.join("\result", "abc.file")
filepath = os.path.join("\data", filename)
filepath

'\\data/\result/abc.file'