In [16]:
from client import Client
from server import Server
import os
import dgl
from time import time
import sys

# from ourparse import *
from scipy.sparse import lil_matrix, csr_matrix
from model import model
from rec_dataset import *
import torch.utils.data as dataloader
import copy
import random
from util import *
# print("use:", torch.device(args.device))

##### Set up the hyper parameters

In [2]:
# Global experiment configuration, read by the rest of the notebook.
# NOTE(review): 'path' is 'Data/' here while later cells read from './data/' and
# 'data/' (lower case) -- confirm which spelling is intended on case-sensitive
# filesystems.
hyperParams = {
        'fea_dim': 64,
        'in_dim': 64,
        'hidden_dim': 64,
        'out_dim': 64,
        'shared_num': 20,
        'path': 'Data/',
        'dataset': 'acm',  # lastfm
        # Use the second GPU only when it actually exists; fall back to the
        # first GPU, then to the CPU. (Previously "cuda:1" was selected on
        # single-GPU machines too, where that device ordinal is invalid.)
        'device': ("cuda:1" if torch.cuda.device_count() > 1
                   else "cuda:0" if torch.cuda.is_available() else 'cpu'),
        'num_heads': [2],
        'eps': 1.0,
        'num_sample': 0,
        'valid_step': 5,
        'nonlinearity': "relu",
        'log_dir': "./log/",
        'is_gcn': False,
        'is_attention': False,
        'hetero': True,
        'is_trans': False,
        'is_random_init': True,
        'is_graph': True,
        'local_train_num': 1,
        'agg_mode': "add",
        'agg_func': "ATTENTION",
        'lr': 0.01,
        'dropout': 0.0,
        'weight_decay': 0.0,
        'epochs': 10000,
        'batch_size': 32,
        'l2_reg': True,
        'grad_limit': 1.0,
        'clients_limit': 0.1,
        'items_limit': 60,
        'type': "ATTENTION",
        'p1': 1.0,
        'p2': 1.0
    }

##### Init the random seed

In [3]:
def setup_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), then configures cuDNN for repeatable behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device; it is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different convolution algorithms between runs,
    # so it has to be disabled as well for reproducible results.
    torch.backends.cudnn.benchmark = False

setup_seed(20211111)

##### Train/test split function

In [5]:
def train_test_split(p_vs_a, num_negatives=99):
    """Leave-one-out split of a sparse row-vs-column relation matrix.

    For every row, one of its non-zero columns is held out as the test item
    (rows with a single non-zero column are used for training only, rows with
    none contribute nothing). For each held-out item, ``num_negatives`` columns
    that are zero in that row are sampled as test negatives. A randomised
    counterpart of the matrix is built alongside.

    Args:
        p_vs_a: SciPy sparse matrix; it is deep-copied and never modified.
        num_negatives: Number of negative columns sampled per test row
            (defaults to 99, the value that used to be hard-coded).

    Returns:
        A tuple ``(p_vs_a_, p_vs_a_random, train_fed_id, train_id, test_id,
        test_negative_id)``:
          * ``p_vs_a_``: copy of the input with every held-out entry set to 0.
          * ``p_vs_a_random``: LIL matrix in which each row has as many randomly
            chosen columns set to 1 as the original row had non-zeros; the
            held-out test column of the row is cleared as well.
          * ``train_fed_id``: one list of training columns per row.
          * ``train_id``: flat list of ``[row, column]`` training pairs.
          * ``test_id``: ``[row, column]`` pair for each row with >= 2 non-zeros.
          * ``test_negative_id``: for each test pair, a list of sampled columns
            that are zero in that row.

    Raises:
        ValueError: If a test row has fewer than ``num_negatives`` zero columns.
    """
    train_id = []
    train_fed_id = []
    test_id = []
    test_negative_id = []
    p_vs_a_ = copy.deepcopy(p_vs_a)
    p_vs_a_random = copy.deepcopy(p_vs_a).tolil()
    p_num, a_num = p_vs_a_.shape
    # Loop-invariant helpers, hoisted out of the per-row loop.
    all_columns = range(a_num)
    all_column_set = set(all_columns)

    for i in range(p_num):  # each row
        cur_a = p_vs_a_[i].nonzero()[1]

        # Randomised row: same number of non-zeros, uniformly chosen columns.
        p_vs_a_random[i, :] = 0
        sample_a = random.sample(all_columns, len(cur_a))
        if sample_a:
            p_vs_a_random[i, sample_a] = 1

        if len(cur_a) == 0:
            train_fed_id.append([])
            continue
        if len(cur_a) == 1:
            # A single interaction cannot be split: keep it for training.
            train_id.append([i, cur_a[0]])
            train_fed_id.append(list(cur_a))
            continue

        # Keep all but one column for training; the remaining one is the test item.
        sample_train = random.sample(list(cur_a), len(cur_a) - 1)
        train_fed_id.append(sample_train)
        train_id.extend([i, j] for j in sample_train)
        cur_test_id = (set(cur_a) - set(sample_train)).pop()
        test_id.append([i, cur_test_id])
        p_vs_a_[i, cur_test_id] = 0
        p_vs_a_random[i, cur_test_id] = 0

        # Negatives are drawn from the columns this row never interacted with.
        test_negative_pool = list(all_column_set - set(cur_a))
        if num_negatives > len(test_negative_pool):
            raise ValueError(
                f"row {i} has only {len(test_negative_pool)} negative candidates, "
                f"cannot sample {num_negatives}"
            )
        test_negative_id.append(random.sample(test_negative_pool, num_negatives))

    return p_vs_a_, p_vs_a_random, train_fed_id, train_id, test_id, test_negative_id

##### Load original ACM dataset

In [None]:
# Read the raw ACM relation matrices from the MATLAB file.
data_path = './data/ACM/ACM.mat'
data = sio.loadmat(data_path)
# Shapes noted in the original run:
#   PvsL (12499, 73), PvsA (12499, 17431), PvsT (12499, 1903), PvsC (12499, 14)
# Every row of PvsC holds exactly one entry equal to 1; all others are 0.
p_vs_f, p_vs_a, p_vs_t, p_vs_c = (
    data[key] for key in ('PvsL', 'PvsA', 'PvsT', 'PvsC')
)

In [12]:
# NOTE: despite its name, this CSR matrix is built from p_vs_c (not p_vs_a).
p_vs_a_csr = p_vs_c.tocsr()

# Print the stored values of the first five rows, then the total number of
# stored entries.
for i in range(5):
    row_start, row_end = p_vs_a_csr.indptr[i], p_vs_a_csr.indptr[i + 1]
    non_zero_values = p_vs_a_csr.data[row_start:row_end]
    print(f"Row {i}: {non_zero_values}")
print(p_vs_a_csr.nnz)


Row 0: [1.]
Row 1: [1.]
Row 2: [1.]
Row 3: [1.]
Row 4: [1.]
12499


##### Load the MovieLens 100K (ML-100k) dataset

In [29]:
import os
import pandas as pd

# Location of the rating file:
#   path      -- the path of the dataset
#   dataset   -- the name of the dataset
#   file_name -- the name of the rating file
path = 'data/'
dataset = '100k/'
file_name = 'u.data'
dataset_file = os.path.join(path, dataset, file_name)

# Threshold handed to datasetFilter, which is not defined in this notebook
# (presumably provided by one of the star imports).
min_rates = 10

ratings = pd.read_csv(
    dataset_file,
    sep='\t',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',
)
ratings = datasetFilter(ratings, min_rates)

# Re-index users and items with consecutive integers, in order of first appearance.
user_id = ratings[['uid']].drop_duplicates()
user_id = user_id.assign(userId=np.arange(len(user_id)))

item_id = ratings[['mid']].drop_duplicates()
item_id = item_id.assign(itemId=np.arange(len(item_id))).sort_values(by='mid')
item_id.head()

Unnamed: 0,mid,itemId
24,1,24
172,2,147
302,3,233
48,4,47
78,5,75


In [30]:
# Attach the consecutive userId / itemId columns to every rating row.
ratings = (
    ratings
    .merge(user_id, on=['uid'], how='left')
    .merge(item_id, on=['mid'], how='left')
)

# Keep only the re-indexed columns, ordered by user.
kept_columns = ['userId', 'itemId', 'rating', 'timestamp']
ratings = ratings[kept_columns].sort_values(by='userId', ascending=True)
# Collapse every positive rating to 1.0.
ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
ratings.head()


Unnamed: 0,userId,itemId,rating,timestamp
0,0,0,1,881250949
14606,0,363,1,881252172
23189,0,1006,1,881252110
56628,0,432,1,881251955
78787,0,491,1,881250949


In [40]:

# Build the movie-by-genre table, with rows ordered by the re-indexed itemId.
item_file = 'u.item'
use_columns = [0] + list(range(-19, 0))  # first column (movie id) and the last 19 columns
column_names = ['mid','unknown', 'Action', 'Adventure',
                'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# Build the path the same way as for the rating file.
item_df = pd.read_csv(os.path.join(path, dataset, item_file), sep='|', header=None, encoding='latin1')
item_df = item_df.iloc[:, use_columns]
item_df.columns = column_names
# Inner merge: a movie without an itemId (e.g. one dropped from `ratings` by the
# earlier filtering step) must not survive as a NaN-itemId row, otherwise the rows
# of the genre matrix no longer line up with the item indices used for the ratings.
item_df = pd.merge(item_df, item_id, on='mid', how='inner').sort_values(by='itemId', ascending=True)
assert len(item_df) == len(item_id), "every re-indexed movie needs exactly one row in u.item"
# Drop 'mid' (first column) and 'itemId' (last column); the 19 genre flags remain.
item_df = item_df.iloc[:, 1:20]
item_matrix=item_df.values
item_df.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
241,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
301,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0
376,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
50,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1
345,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0


In [50]:
# Preview: positions of the genre flags equal to 1 for the first five movies.
for tags in item_matrix[:5]:
    print(np.flatnonzero(tags == 1))

# Sparse movie-by-genre matrix built from the dense genre table.
movie_vs_tag = csr_matrix(item_matrix)




[5]
[ 6 10 13 16]
[4 5]
[ 8 14 17 18]
[6 8]


In [None]:
# Convert userId / itemId into consecutive integer codes.
user_ids = ratings['userId'].astype('category').cat.codes
item_ids = ratings['itemId'].astype('category').cat.codes

# Create the sparse user-by-movie matrix from (values, (row indices, column indices)).
interaction_index = (user_ids, item_ids)
user_vs_movie = csr_matrix((ratings['rating'], interaction_index))

Row 0: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
Row 1: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Row 2: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Row 3: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [None]:
# Per-user genre counts: (user x movie) times (movie x genre), computed densely
# and converted back to a sparse matrix.
u_m_dense = user_vs_movie.todense()
m_t_dense = movie_vs_tag.todense()
u_t_dense = u_m_dense @ m_t_dense
user_vs_tag = csr_matrix(u_t_dense)


  (0, 1)	3
  (0, 2)	4
  (0, 4)	2
  (0, 5)	32
  (0, 7)	1
  (0, 8)	11
  (0, 9)	1
  (0, 14)	11
  (0, 15)	2
  (0, 17)	2
  (1, 1)	30
  (1, 2)	13
  (1, 3)	5
  (1, 4)	10
  (1, 5)	13
  (1, 6)	13
  (1, 8)	25
  (1, 9)	2
  (1, 10)	3
  (1, 11)	2
  (1, 12)	4
  (1, 13)	10
  (1, 14)	7
  (1, 15)	7
  (1, 16)	51
  :	:
  (941, 1)	5
  (941, 2)	1
  (941, 5)	2
  (941, 6)	5
  (941, 8)	15
  (941, 10)	2
  (941, 11)	3
  (941, 13)	6
  (941, 14)	6
  (941, 15)	2
  (941, 16)	13
  (941, 17)	2
  (942, 1)	10
  (942, 2)	7
  (942, 3)	3
  (942, 4)	2
  (942, 5)	7
  (942, 6)	1
  (942, 8)	5
  (942, 12)	1
  (942, 13)	1
  (942, 14)	1
  (942, 15)	8
  (942, 16)	7
  (942, 17)	1


In [58]:
# Persist the three relation matrices (user_vs_tag still holds raw counts here).
sio.savemat(
    "data/100k/100k_non_normal.mat",
    {
        "user_vs_movie": user_vs_movie,
        "movie_vs_tag": movie_vs_tag,
        "user_vs_tag": user_vs_tag,
    },
)

In [61]:
# Reload the saved matrices from disk and show the dense user-by-tag counts.
data = sio.loadmat("data/100k/100k_non_normal.mat")
user_vs_movie, movie_vs_tag, user_vs_tag = (
    data[name] for name in ('user_vs_movie', 'movie_vs_tag', 'user_vs_tag')
)
user_vs_tag.todense()

matrix([[ 0,  3,  4, ...,  0,  2,  0],
        [ 0, 30, 13, ..., 51,  2,  3],
        [ 0, 66, 35, ..., 23, 15,  4],
        ...,
        [ 0, 18,  9, ..., 10,  9,  0],
        [ 0,  5,  1, ..., 13,  2,  0],
        [ 0, 10,  7, ...,  7,  1,  0]])

In [89]:
# Binarise the user-by-tag counts row by row: within each row, the mean of the
# entries >= 1 is the threshold; entries below it become 0, the others become 1.
user_vs_tag_dense = user_vs_tag.todense()
# Plain-ndarray view of the dense matrix (shares its memory), so the row-wise
# writes below end up in user_vs_tag_dense.
tag_counts = np.asarray(user_vs_tag_dense)
for user_row in tag_counts:
    positive_counts = user_row[user_row >= 1]
    if positive_counts.size == 0:
        continue  # no entry >= 1 in this row: leave it untouched
    row_mean = positive_counts.mean()
    user_row[...] = user_row >= row_mean
proc_user_vs_tag = csr_matrix(user_vs_tag_dense)

# Row/column coordinates and values of all non-zero entries.
rows, cols = proc_user_vs_tag.nonzero()
values = proc_user_vs_tag.data

# Print the coordinates and value of every non-zero entry.
for r, c, v in zip(rows, cols, values):
    print(f"Row: {r}, Column: {c}, Value: {v}")

Row: 0, Column: 5, Value: 1
Row: 0, Column: 8, Value: 1
Row: 0, Column: 14, Value: 1
Row: 1, Column: 1, Value: 1
Row: 1, Column: 2, Value: 1
Row: 1, Column: 5, Value: 1
Row: 1, Column: 6, Value: 1
Row: 1, Column: 8, Value: 1
Row: 1, Column: 16, Value: 1
Row: 2, Column: 1, Value: 1
Row: 2, Column: 2, Value: 1
Row: 2, Column: 5, Value: 1
Row: 2, Column: 15, Value: 1
Row: 2, Column: 16, Value: 1
Row: 3, Column: 1, Value: 1
Row: 3, Column: 5, Value: 1
Row: 3, Column: 8, Value: 1
Row: 3, Column: 14, Value: 1
Row: 3, Column: 16, Value: 1
Row: 4, Column: 1, Value: 1
Row: 4, Column: 5, Value: 1
Row: 4, Column: 8, Value: 1
Row: 4, Column: 14, Value: 1
Row: 4, Column: 16, Value: 1
Row: 5, Column: 1, Value: 1
Row: 5, Column: 2, Value: 1
Row: 5, Column: 4, Value: 1
Row: 5, Column: 5, Value: 1
Row: 5, Column: 8, Value: 1
Row: 5, Column: 14, Value: 1
Row: 5, Column: 16, Value: 1
Row: 5, Column: 17, Value: 1
Row: 6, Column: 1, Value: 1
Row: 6, Column: 2, Value: 1
Row: 6, Column: 5, Value: 1
Row: 6, C

In [90]:
# Persist the matrices again, this time with the mean-thresholded user-by-tag matrix.
sio.savemat(
    "data/100k/100k_mean.mat",
    {
        "user_vs_movie": user_vs_movie,
        "movie_vs_tag": movie_vs_tag,
        "user_vs_tag": proc_user_vs_tag,
    },
)