In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim

In [2]:
df = pd.read_csv('./data/avazu/ctr_data.csv', usecols = [0, 1, 11], header = 0, names = ['id', 'click', 'device_id'])

num = pd.DataFrame([x for x in range(len(set(df['id'])))])
df['item_id'] = None
df['item_id'] = num

df['user_id'] = None
l = list(set(df['device_id']))
for i in l:
    for j in range(len(df['device_id'])):
        if df['device_id'][j] == i:
            df['user_id'][j] = l.index(i) + 1
            
df.to_csv('./data/avazu/ctr_data_new.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [3]:
file = pd.read_csv("./data/avazu/ctr_data_new.csv", header = 0)

# 参数 
total = 10000
ratio = 0.9
num_user = 1076
num_item = 10000

user_train = set()
user_test = set()
item_train = set()
item_test= set()

train_r = np.zeros((num_user, num_item))  
train_mr = train_r
test_r = train_r
test_mr = train_r

total_idx = np.random.permutation(total)
train_idx = total_idx[0: int(total * ratio)]
test_idx = total_idx[int(total * ratio):]

for i in train_idx:
    user_idx = int(file.iloc[i]['user_id'])
    item_idx = int(file.iloc[i]['item_id'])

    train_r[user_idx, item_idx] = int(file.iloc[i]['click'])
    train_mr[user_idx, item_idx] = 1
    
    user_train.add(user_idx)
    item_train.add(item_idx)
    
for i in test_idx:
    user_idx = int(file.iloc[i]['user_id'])
    item_idx = int(file.iloc[i]['item_id'])

    test_r[user_idx, item_idx] = int(file.iloc[i]['click'])
    test_mr[user_idx, item_idx] = 1
    
    user_test.add(user_idx)
    item_test.add(item_idx)

In [4]:
class Autorec(nn.Module):
    def __init__(self, num_user, num_item):
        super(Autorec, self).__init__()
        self.num_user = num_user
        self.num_item = num_item
        self.hidden_unit = 500
        self.lambda_value = float(1)
 
        self.encoder = nn.Sequential(
            nn.Linear(self.num_item, self.hidden_unit),
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden_unit, self.num_item),
        )
 
    def forward(self, torch_input):
        encoder = self.encoder(torch_input)
        decoder = self.decoder(encoder)
 
        return decoder
 
    def loss(self, decoder, input, opt, minput):
        temp2 = 0
        cost = 0
 
        cost += ((decoder - input) * minput).pow(2).sum()
        rmse = cost
 
        for i in opt.param_groups:
            for j in i['params']:
                if j.data.dim() == 2:
                    temp2 += torch.t(j.data).pow(2).sum() 
        cost += temp2 * self.lambda_value * 0.5
        
        return cost, rmse
 
    def train(self, epoch):
        rmse = 0
        cost_all = 0
        for step, (batch_x, batch_mask_x, batch_y) in enumerate(loader):
            batch_x = batch_x.type(torch.FloatTensor)
            batch_mx = batch_mask_x.type(torch.FloatTensor)
            decoder = rec(batch_x)
            loss, rmse = rec.loss(decoder = decoder, input = batch_x, opt = opt, minput = batch_mx)
            opt.zero_grad()
            loss.backward()
            opt.step()
            cost_all += loss
            rmse += rmse
 
        rmse = np.sqrt(rmse.detach().cpu().numpy() / (train_mr == 1).sum())
        print('epoch', epoch, 'rmse', rmse)
 
    def test(self, epoch):
        test_r_tensor = torch.from_numpy(test_r).type(torch.FloatTensor)
        test_mr_tensor = torch.from_numpy(test_mr).type(torch.FloatTensor)
        decoder = rec(test_r_tensor)
        unseen_user_test_list = list(user_test - user_train)
        unseen_item_test_list = list(item_test - item_train) 
 
        for user in unseen_user_test_list:
            for item in unseen_item_test_list:
                if test_mr[user, item] == 1:  
                    decoder[user, item] = 1
 
        mse = ((decoder - test_r_tensor) * test_mr_tensor).pow(2).sum()
        rmse = np.sqrt(mse.detach().cpu().numpy() / (test_mr == 1).sum())
 
        print('epoch', epoch, 'rmse', rmse)

In [5]:
rec = Autorec(num_user, num_item)
opt = optim.Adam(rec.parameters(), lr = 0.001, weight_decay = 0.001)
data =  Data.TensorDataset(torch.from_numpy(train_r), torch.from_numpy(train_mr), torch.from_numpy(train_r))
loader = Data.DataLoader(dataset = data, batch_size = 64, shuffle = True)

In [6]:
for epoch in range(0, 10):
    rec.train(epoch = epoch)
rec.test(epoch = epoch)

epoch 0 rmse 0.4574811887782245
epoch 1 rmse 0.46230865974476404
epoch 2 rmse 0.4676163896728011
epoch 3 rmse 0.4689568293697193
epoch 4 rmse 0.4696544659457765
epoch 5 rmse 0.4681763938326958
epoch 6 rmse 0.46793582939297085
epoch 7 rmse 0.4680805897242082
epoch 8 rmse 0.4677165952546077
epoch 9 rmse 0.46801152767753756
epoch 9 rmse 0.02899919732397037


In [7]:
def recommend(users, K = 10):
    df = pd.read_csv('./data/avazu/ctr_data_new.csv')
    test_r_tensor = torch.from_numpy(test_r).type(torch.FloatTensor)
    test_mr_tensor = torch.from_numpy(test_mr).type(torch.FloatTensor)
    decoder = rec(test_r_tensor)
    
    unseen_user = list(user_test - user_train)
    unseen_item = list(item_test - item_train)
    
    for user in unseen_user:
        for item in unseen_item:
            if test_mr[user, item] == 1: 
                decoder[user, item] = 0
                    
    rmse = decoder.pow(2).detach().cpu().numpy()
    
    for i in range(0, len(df['device_id'])):
        if df['device_id'][i] == users:
            idx = df['user_id'][i]
            break

    l = np.argsort(-rmse[idx])[: K]
    
    ans = []
    for i in l:
        for j in df['item_id']:
            if i == j:
                ans.append(df['id'][i])
            else:
                continue
                
    return ans

In [8]:
rec = recommend('a99f214a')
print(rec)

[10895497004782100000, 10415768676346500000, 10939160037323600000, 10263506990007800000, 10714771633334900000, 11258882264818400000, 11249673216976000000, 10006415976094800000, 10634154333675600000, 10834473020372700000]
