## 實作

### 數據集

我們使用ml-100k數據集。

該數據集提供了一些簡單的用戶與物品特徵，其中用戶特徵如下:
* 用戶ID
* 年齡
* 性別
* 職業
* zip code(可略)

In [None]:
# Unzip the dataset archive (IPython shell escape; -q keeps unzip quiet).
!unzip -q /content/drive/MyDrive/Datasets/recbyhand.zip

In [None]:
import os

# Root directory of all data files, given relative to the current working
# directory. The commented-out line below derives it from this file's own
# location instead.
# ROOT = os.path.split(os.path.realpath(__file__))[0]
ROOT = './data_set'
# Directory named 'model' under ROOT.
# NOTE(review): presumably where trained models are saved - nothing in this
# file reads or writes it, confirm against callers.
Model_Dir = os.path.join(ROOT,'model')

class Ml_100K():
    """File paths of the ml-100k dataset, all located under ROOT."""
    # Directory holding the index files (KG / RATING / RATING5 below).
    __BASE = os.path.join(ROOT, 'ml-100k')
    # Directory of the raw dataset files; the generated feature tables
    # (USER_DF / ITEM_DF / ITEM_DF_0) are stored in the same directory.
    ORGINAL_DIR = os.path.join(ROOT,'ml-100k-orginal')
    # CSV of per-user feature indices.
    USER_DF = os.path.join(ORGINAL_DIR,'user_df.csv')
    # CSV of per-item feature indices.
    ITEM_DF = os.path.join(ORGINAL_DIR,'item_df.csv')
    # Second per-item feature CSV.
    # NOTE(review): the "_0" suffix presumably means indices start at 0 - confirm.
    ITEM_DF_0 = os.path.join(ORGINAL_DIR, 'item_df_0.csv')

    # NOTE(review): presumably knowledge-graph triples - not read in this file.
    KG=os.path.join(__BASE,'kg_index.tsv')
    # Rating index files; not read in this file.
    RATING = os.path.join(__BASE,'rating_index.tsv')
    RATING5 = os.path.join(__BASE, 'rating_index_5.tsv')

class Ml_latest_small():
    """File paths of the ml-latest-small dataset, all located under ROOT."""
    # Directory holding every file listed below.
    __BASE = os.path.join(ROOT,'ml-latest-small')
    # NOTE(review): "ts" presumably stands for timestamp - not read in this file.
    RATING_TS = os.path.join(__BASE,'rating_index_ts.tsv')
    # NumPy file of sequences; not read in this file.
    SEQS = os.path.join(__BASE, 'seqs.npy')

    # NOTE(review): presumably the sequences with negative samples added - confirm.
    SEQS_NEG = os.path.join(__BASE, 'seqsWithNeg.npy')

In [None]:
import os
import numpy as np
# from data_set import filepaths as fp
import pandas as pd

# Locations of the raw ml-100k files, all inside Ml_100K.ORGINAL_DIR.
base_path = Ml_100K.ORGINAL_DIR
# Rating files: parsed below as tab-separated (user id, item id, rating, ...).
train_path = os.path.join(base_path,'ua.base')
test_path = os.path.join(base_path,'ua.test')
# User file: parsed below as '|'-separated id, age, gender, occupation, ...
user_path = os.path.join(base_path,'u.user')
# Item file: '|'-separated; field 2 is a '-'-separated date and fields from 5
# onwards are 0/1 flags.
item_path = os.path.join(base_path,'u.item')
# Occupation file: one occupation name per line.
occupation_path = os.path.join(base_path,'u.occupation')

In [None]:
def __read_age_index():
    """Return how many feature indices must be reserved for age buckets.

    Each line of ``user_path`` is '|'-separated with the age in field 1.
    Ages are bucketed by decade (``age // 10``) and the bucket number itself
    is used as the feature index, so the reserved range has to span
    ``0 .. max_bucket``.

    Returns:
        ``max_bucket + 1``, or 0 when the file holds no users. The previous
        implementation returned the number of *distinct* buckets, which is
        smaller than ``max_bucket + 1`` whenever a decade is missing and
        would make the highest bucket collide with the next feature's
        indices.
    """
    age_levels = set()
    with open(user_path, 'r') as f:
        for line in f:
            fields = line.strip().split('|')
            age_levels.add(int(fields[1]) // 10)
    # max + 1 (not len) so that gaps between buckets cannot cause collisions.
    return max(age_levels, default=-1) + 1

def __read_occupation_index(begin):
    """Assign consecutive feature indices to the occupations in ``occupation_path``.

    Args:
        begin: index given to the first occupation listed in the file.

    Returns:
        ``(occupations, next_index)`` where ``occupations`` maps each
        occupation name to its index and ``next_index`` is the first index
        after the ones handed out.
    """
    with open(occupation_path, 'r') as f:
        names = f.read().strip().split('\n')
    occupations = {name: begin + offset for offset, name in enumerate(names)}
    return occupations, begin + len(names)

def generate_user_df():
    """Build the per-user feature-index table and write it to ``Ml_100K.USER_DF``.

    Feature indices are laid out as: age buckets first (``age // 10``), then
    the two genders, then the occupations. Each row of the CSV holds the
    ``age``, ``gender`` and ``occupation`` index of one user, keyed by the
    user id read from ``user_path``.

    Returns:
        The first feature index that is still unused after the occupations.
    """
    next_index = __read_age_index()
    gender_dict = {'M': next_index, 'F': next_index + 1}
    occupation_dict, next_index = __read_occupation_index(next_index + 2)

    uids = []
    all_users = []
    with open(user_path, 'r') as f:
        for line in f:
            fields = line.strip().split('|')
            row = [
                int(fields[1]) // 10,
                gender_dict[fields[2]],
                occupation_dict[fields[3]],
            ]
            uids.append(fields[0])
            all_users.append(row)

    columns = ['age', 'gender', 'occupation']
    pd.DataFrame(all_users, index=uids, columns=columns).to_csv(Ml_100K.USER_DF)
    return next_index

def __get_year_index(begin):
    """Assign consecutive feature indices to the release years in ``item_path``.

    The year is the third '-'-separated part of field 2 of every
    '|'-separated line. Year ``0`` is always included as the placeholder for
    items whose date cannot be split into three parts.

    Args:
        begin: index given to the smallest year (the placeholder ``0``).

    Returns:
        ``(year_to_index, next_index)`` where ``year_to_index`` maps every
        year to its index and ``next_index`` is the first index after the
        year range. The previous implementation returned ``len(years)``
        without adding ``begin``, so for ``begin > 0`` the caller placed the
        following features on top of the year indices.
    """
    years = {0}
    with open(item_path, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            date_parts = line.strip().split('|')[2].split('-')
            if len(date_parts) > 2:
                years.add(int(date_parts[2]))
    years = sorted(years)
    print(years)
    year_to_index = {year: begin + offset for offset, year in enumerate(years)}
    return year_to_index, begin + len(years)

def generate_item_df(begin,out):
    """Build the per-item feature-index table and write it to ``out`` as CSV.

    Every item gets one release-year index followed by one index per flag
    that is set in fields 5.. of its ``item_path`` line. Because items have a
    different number of set flags, each row is brought to a common width by
    sampling its own indices with replacement (``np.random.choice``), so a
    row may repeat some indices and omit others.

    Args:
        begin: first feature index available for item features.
        out: path of the CSV file to write; rows are keyed by item id.

    Returns:
        An empty dict. Nothing is ever stored in it; it is kept only so the
        return value stays what existing callers receive.
    """
    items = {}
    # After this call `begin` is the first index available for the flags.
    years_dict, begin = __get_year_index(begin)
    all_items = []
    iids = []
    with open( item_path, 'r', encoding = 'ISO-8859-1' ) as f:
        for line in f:
            d = line.strip().split('|')
            iids.append(int(d[0]))
            year = d[2].split('-')
            # Items without a full date use the placeholder year 0. Look its
            # index up like any other year: the literal 0 used previously is
            # only correct when the year range itself starts at index 0.
            year_key = int(year[2]) if len(year) > 2 else 0
            item_index = [years_dict[year_key]]
            item_index.extend(
                begin + pos for pos, flag in enumerate(d[5:]) if int(flag) == 1
            )
            all_items.append( item_index )
    # Width of the table: the longest feature list of any item.
    max_n_neibour = max(map(len, all_items), default=0)
    n_all = [
        np.random.choice( item, size = max_n_neibour, replace = True )
        for item in all_items
    ]

    df = pd.DataFrame( n_all, index = iids )
    df.to_csv(out )

    return items

def get1or0(r):
    """Binarize a rating: 1.0 when it is greater than 3, otherwise 0.0."""
    if r > 3:
        return 1.0
    return 0.0


def __read_rating_data(path):
    """Read a tab-separated rating file into ``[user, item, label]`` triples.

    The first two fields of each line are parsed as integers; the third is
    parsed as an integer rating and binarized with ``get1or0``.
    """
    with open(path, 'r') as f:
        rows = (line.strip().split('\t') for line in f)
        return [
            [int(row[0]), int(row[1]), get1or0(int(row[2]))]
            for row in rows
        ]

def read_data_user_item_df():
    """Load rating triples plus the user table and the ``ITEM_DF_0`` item table.

    Returns:
        ``(train_triples, test_triples, user_df, item_df, n_user_features,
        n_item_features)``; each count is one more than the largest value
        found in the corresponding table.
    """
    users = pd.read_csv(Ml_100K.USER_DF, index_col=0)
    items = pd.read_csv(Ml_100K.ITEM_DF_0, index_col=0)
    train = __read_rating_data(train_path)
    test = __read_rating_data(test_path)
    n_user_features = max(users.max()) + 1
    n_item_features = max(items.max()) + 1
    return train, test, users, items, n_user_features, n_item_features


def read_data():
    """Load rating triples plus the user table and the ``ITEM_DF`` item table.

    Returns:
        ``(train_triples, test_triples, user_df, item_df, n_features)`` where
        ``n_features`` is one more than the largest value in the item table
        (the user table is not taken into account).
    """
    users = pd.read_csv(Ml_100K.USER_DF, index_col=0)
    items = pd.read_csv(Ml_100K.ITEM_DF, index_col=0)
    train = __read_rating_data(train_path)
    test = __read_rating_data(test_path)
    n_features = max(items.max()) + 1
    return train, test, users, items, n_features

In [None]:
if __name__ == '__main__':
    # Regenerate the item feature table with indices starting at 0 and write
    # it to ITEM_DF_0. generate_item_df returns an (always empty) dict, so
    # item_df holds no table until it is rebound by read_data() below.
    item_df = generate_item_df(0, Ml_100K.ITEM_DF_0)
    #print(item_df)

    # Load the rating triples and the stored feature tables. read_data() reads
    # USER_DF and ITEM_DF, not the ITEM_DF_0 file written above.
    train_triples, test_triples, user_df, item_df,lenitems = read_data()
    print(user_df)
    print(item_df)

[0, 1922, 1926, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998]
     age  gender  occupation
1      2       8          29
2      5       9          23
3      2       8          30
4      2       8          29
5      3       9          23
..   ...     ...         ...
939    2       9          28
940    3       8          10
941    2       8          28
942    4       9          20
943    2       8          28

[943 rows x 3 columns]
        0    1    2    3    4    5    6
1      77   99   76   77   77   77   99
2      74   99   74   88   99   74   74
3      88   99   88   99   88   99   88
4      80   99   99   73   99   99   9

### 建立模型

為了聚焦在二階特徵交互與注意力機制上，我們省略了零次項(全域偏置)與一次項(線性項)的計算；它們對說明AFM的核心概念並不重要。

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
# from chapter2 import dataloader4ml100kIndexs
from torch.utils.data import DataLoader
import torch
from torch import nn
from torch.nn import Parameter,init

class AFM( nn.Module ):
    """Attentional Factorization Machine (AFM) scoring user/item pairs.

    Every user and every item is described by a row of feature indices
    (looked up in ``user_df`` / ``item_df``); all indices share one embedding
    table. The model takes the element-wise product of every pair of feature
    embeddings, weights the pairs with a learned attention distribution,
    sums them and projects the result to a probability. The zeroth- and
    first-order FM terms are omitted.

    Args:
        n_features: size of the shared embedding table (largest feature
            index + 1).
        user_df: DataFrame indexed by user id; each row holds feature indices.
        item_df: DataFrame indexed by item id; each row holds feature indices.
        k: embedding dimension.
        t: hidden size of the attention network.
    """

    def __init__( self, n_features, user_df, item_df, k, t ):
        super( AFM, self ).__init__( )
        self.features = nn.Embedding( n_features, k, max_norm = 1 )
        self.attention_liner = nn.Linear( k, t )
        self.h = init.xavier_uniform_( Parameter( torch.empty( t, 1 ) ) )
        self.p = init.xavier_uniform_( Parameter( torch.empty( k, 1 ) ) )
        self.user_df = user_df
        self.item_df = item_df

    # FM aggregation
    def FMaggregator( self, feature_embs ):
        """Pool second-order interactions without attention.

        ``square_of_sum - sum_of_square`` equals twice the sum of all
        pair-wise element-wise products. ``forward`` does not call this
        method (it needs the individual pairs); it is kept for callers that
        want the unweighted FM term.
        """
        # feature_embs: [ batch_size, n_fields, k ]
        # [ batch_size, k ]
        square_of_sum = torch.sum( feature_embs, dim = 1 )**2
        # [ batch_size, k ]
        sum_of_square = torch.sum( feature_embs**2, dim = 1 )
        # [ batch_size, k ]
        output = square_of_sum - sum_of_square
        return output

    # Pair-wise interaction layer
    def pairwise_interactions( self, feature_embs ):
        """Return the element-wise product of every pair of feature embeddings.

        Memory grows with the number of pairs, ``n_fields * (n_fields - 1) / 2``.
        """
        # feature_embs: [ batch_size, n_fields, k ]
        n_fields = feature_embs.shape[1]
        # All index pairs ( a, b ) with a < b.
        pair_index = torch.triu_indices( n_fields, n_fields, offset = 1, device = feature_embs.device )
        left, right = pair_index[0], pair_index[1]
        # [ batch_size, n_pairs, k ]
        return feature_embs[ :, left ] * feature_embs[ :, right ]

    # Attention computation
    def attention( self, embs ):
        """Return attention weights over the pairs (they sum to 1 along dim 1)."""
        # embs: [ batch_size, n_pairs, k ]
        # [ batch_size, n_pairs, t ]
        embs = self.attention_liner( embs )
        # [ batch_size, n_pairs, t ]
        embs = torch.relu( embs )
        # [ batch_size, n_pairs, 1 ]
        embs = torch.matmul( embs, self.h )
        # Normalize across the pairs. The softmax must see more than one
        # score per sample: on a [ batch_size, 1 ] input it is constantly 1
        # and no gradient reaches attention_liner / h.
        # [ batch_size, n_pairs, 1 ]
        atts = torch.softmax( embs, dim=1 )
        return atts

    @staticmethod
    def _as_labels( ids ):
        """Convert a tensor / sequence / scalar of ids to a 1-D numpy array for ``.loc``."""
        if torch.is_tensor( ids ):
            ids = ids.detach().cpu().numpy()
        return np.asarray( ids ).reshape( -1 )

    # Concatenate the user features and the item features
    def __getAllFeatures( self, u, i ):
        # Build the indices on the device of the embedding table so the
        # lookup also works after the model has been moved.
        device = self.features.weight.device
        users = torch.tensor( self.user_df.loc[ self._as_labels( u ) ].values, dtype = torch.long, device = device )
        items = torch.tensor( self.item_df.loc[ self._as_labels( i ) ].values, dtype = torch.long, device = device )
        # [ batch_size, n_user_fields + n_item_fields ]
        return torch.cat( [ users, items ], dim = 1 )

    def forward( self, u, i ):
        """Return the predicted probability for every ( user, item ) pair, shape [ batch_size ]."""
        all_feature_index = self.__getAllFeatures( u, i )
        # [ batch_size, n_fields, k ]
        all_feature_embs = self.features( all_feature_index )
        # [ batch_size, n_pairs, k ]
        pair_embs = self.pairwise_interactions( all_feature_embs )
        # [ batch_size, n_pairs, 1 ]
        atts = self.attention( pair_embs )
        # Attention-weighted sum over the pairs: [ batch_size, k ]
        embs = torch.sum( atts * pair_embs, dim = 1 )
        # [ batch_size, 1 ]
        outs = torch.matmul( embs, self.p )
        # Drop only the trailing axis so a batch of one keeps shape [ 1 ].
        # [ batch_size ]
        outs = torch.squeeze( outs, dim = -1 )
        # [ batch_size ]
        return torch.sigmoid( outs )

### 定義評估函式

In [None]:
def doEva(net, test_triple, batch_size=4096):
    """Evaluate ``net`` on rating triples with a 0.5 decision threshold.

    Args:
        net: callable taking ``(users, items)`` index tensors and returning
            one probability per row.
        test_triple: sequence of ``[user, item, label]`` rows; labels may be
            floats (0.0 / 1.0).
        batch_size: number of rows scored per forward pass, so large
            evaluation sets do not have to fit through the model at once.

    Returns:
        ``(precision, recall, accuracy)`` as computed by scikit-learn.
    """
    # Convert each column explicitly: building a LongTensor straight from
    # rows that mix ints with float labels relies on a deprecated implicit
    # float -> int conversion.
    data = np.asarray(test_triple)
    users = torch.from_numpy(data[:, 0].astype(np.int64))
    items = torch.from_numpy(data[:, 1].astype(np.int64))
    r = data[:, 2].astype(np.int64)

    preds = []
    with torch.no_grad():
        for start in range(0, len(users), batch_size):
            stop = start + batch_size
            out = net(users[start:stop], items[start:stop])
            # reshape(-1) keeps a one-row chunk 1-D even if net returns a scalar.
            preds.append((out.reshape(-1) >= 0.5).long().cpu())
    y_pred = torch.cat(preds).numpy() if preds else np.empty(0, dtype=np.int64)

    precision = precision_score( r, y_pred )
    recall = recall_score( r, y_pred )
    acc = accuracy_score( r, y_pred )
    return precision,recall,acc

### 定義訓練函式

In [None]:
def train( epochs = 20, batchSize = 1024, lr = 0.02, k = 128, t= 64, eva_per_epochs = 1,need_eva=True, weight_decay = 0.3 ):
    """Train an AFM model on the data returned by ``read_data`` and return it.

    Args:
        epochs: number of passes over the training triples.
        batchSize: mini-batch size.
        lr: AdamW learning rate.
        k: embedding dimension passed to AFM.
        t: attention hidden size passed to AFM.
        eva_per_epochs: evaluate every this many epochs.
        need_eva: when False, skip evaluation entirely.
        weight_decay: AdamW weight decay (previously fixed at 0.3).

    Returns:
        The trained AFM network.
    """
    train_triples, test_triples, user_df, item_df,n_features= read_data( )

    net = AFM( n_features,user_df, item_df,  k, t  )

    criterion = torch.nn.BCELoss( )

    optimizer = torch.optim.AdamW( net.parameters(), lr=lr, weight_decay=weight_decay)

    # shuffle=True reshuffles every time the loader is iterated.
    loader = DataLoader( train_triples, batch_size = batchSize, shuffle = True )

    for e in range(epochs):
        total_loss = 0.0
        n_batches = 0
        for u,i,r in loader:
            # Labels are collated as float64; BCELoss needs them to match
            # the float32 model output.
            r = r.float()
            optimizer.zero_grad()
            logits = net( u, i )
            loss = criterion( logits, r )
            loss.backward( )
            optimizer.step( )
            # .item() stores a plain float, so no autograd history is kept
            # alive across batches.
            total_loss += loss.item()
            n_batches += 1
        # Average over the batches actually processed (including the last,
        # smaller one); max() guards against an empty training set.
        print('epoch {},avg_loss={:.4f}'.format(e,total_loss/max(n_batches, 1)))

        if need_eva and e % eva_per_epochs == 0:
            p, rec, acc = doEva( net, train_triples)
            print('train:p:{:.4f}, r:{:.4f}, acc:{:.4f}'.format( p, rec, acc ))
            p, rec, acc = doEva( net, test_triples )
            print('test:p:{:.4f}, r:{:.4f}, acc:{:.4f}'.format( p, rec, acc ))

    return net

### 進行訓練

In [None]:
if __name__ == '__main__':
    # Run training with the default hyper-parameters; train() prints the
    # average loss and the train/test metrics for each epoch.
    train()

epoch 0,avg_loss=0.6655


  d = torch.LongTensor(test_triple)


train:p:0.6275, r:0.7778, acc:0.6231
test:p:0.6352, r:0.7665, acc:0.6093


  d = torch.LongTensor(test_triple)


epoch 1,avg_loss=0.6579


  d = torch.LongTensor(test_triple)


train:p:0.6391, r:0.7415, acc:0.6268
test:p:0.6457, r:0.7254, acc:0.6099


  d = torch.LongTensor(test_triple)


epoch 2,avg_loss=0.6585


  d = torch.LongTensor(test_triple)


train:p:0.6588, r:0.6442, acc:0.6201
test:p:0.6594, r:0.6162, acc:0.5928


  d = torch.LongTensor(test_triple)


epoch 3,avg_loss=0.6569


  d = torch.LongTensor(test_triple)


train:p:0.6413, r:0.7234, acc:0.6246
test:p:0.6478, r:0.6806, acc:0.6001


  d = torch.LongTensor(test_triple)


epoch 4,avg_loss=0.6566


  d = torch.LongTensor(test_triple)


train:p:0.6542, r:0.6900, acc:0.6283
test:p:0.6544, r:0.6564, acc:0.5997


  d = torch.LongTensor(test_triple)


epoch 5,avg_loss=0.6562


  d = torch.LongTensor(test_triple)


train:p:0.6536, r:0.6573, acc:0.6192
test:p:0.6555, r:0.6213, acc:0.5910


  d = torch.LongTensor(test_triple)


epoch 6,avg_loss=0.6575


  d = torch.LongTensor(test_triple)


train:p:0.6616, r:0.6378, acc:0.6207
test:p:0.6610, r:0.6050, acc:0.5910


  d = torch.LongTensor(test_triple)


epoch 7,avg_loss=0.6558


  d = torch.LongTensor(test_triple)


train:p:0.6311, r:0.7855, acc:0.6288
test:p:0.6349, r:0.7597, acc:0.6073


  d = torch.LongTensor(test_triple)


epoch 8,avg_loss=0.6556


  d = torch.LongTensor(test_triple)


train:p:0.6549, r:0.6886, acc:0.6285
test:p:0.6536, r:0.6595, acc:0.5998


  d = torch.LongTensor(test_triple)


epoch 9,avg_loss=0.6561


  d = torch.LongTensor(test_triple)


train:p:0.6451, r:0.7220, acc:0.6280
test:p:0.6491, r:0.6846, acc:0.6024


  d = torch.LongTensor(test_triple)


epoch 10,avg_loss=0.6564


  d = torch.LongTensor(test_triple)


train:p:0.6663, r:0.6317, acc:0.6227
test:p:0.6645, r:0.6109, acc:0.5954


  d = torch.LongTensor(test_triple)


epoch 11,avg_loss=0.6563


  d = torch.LongTensor(test_triple)


train:p:0.6535, r:0.7020, acc:0.6307
test:p:0.6512, r:0.6800, acc:0.6032


  d = torch.LongTensor(test_triple)


epoch 12,avg_loss=0.6548


  d = torch.LongTensor(test_triple)


train:p:0.6426, r:0.7333, acc:0.6283
test:p:0.6494, r:0.6941, acc:0.6053


  d = torch.LongTensor(test_triple)


epoch 13,avg_loss=0.6545


  d = torch.LongTensor(test_triple)


train:p:0.6423, r:0.7089, acc:0.6221
test:p:0.6418, r:0.6945, acc:0.5980


  d = torch.LongTensor(test_triple)


epoch 14,avg_loss=0.6555


  d = torch.LongTensor(test_triple)


train:p:0.6656, r:0.6359, acc:0.6233
test:p:0.6659, r:0.6149, acc:0.5978


  d = torch.LongTensor(test_triple)


epoch 15,avg_loss=0.6569


  d = torch.LongTensor(test_triple)


train:p:0.6311, r:0.7726, acc:0.6259
test:p:0.6386, r:0.7468, acc:0.6081


  d = torch.LongTensor(test_triple)


epoch 16,avg_loss=0.6546


  d = torch.LongTensor(test_triple)


train:p:0.6474, r:0.6953, acc:0.6234


  d = torch.LongTensor(test_triple)


test:p:0.6544, r:0.6762, acc:0.6051
epoch 17,avg_loss=0.6550


  d = torch.LongTensor(test_triple)


train:p:0.6436, r:0.7463, acc:0.6325
test:p:0.6480, r:0.7082, acc:0.6076


  d = torch.LongTensor(test_triple)


epoch 18,avg_loss=0.6549


  d = torch.LongTensor(test_triple)


train:p:0.6479, r:0.7226, acc:0.6308
test:p:0.6524, r:0.6884, acc:0.6066


  d = torch.LongTensor(test_triple)


epoch 19,avg_loss=0.6547


  d = torch.LongTensor(test_triple)


train:p:0.6297, r:0.7963, acc:0.6298
test:p:0.6348, r:0.7837, acc:0.6130


  d = torch.LongTensor(test_triple)
