# 用户社交数据（user_friends.csv）处理
（只取训练集和测试集中出现的用户ID）

数据来源于Kaggle竞赛：Event Recommendation Engine Challenge。竞赛任务是根据以下信息：
- events they’ve responded to in the past
- user demographic information
- what events they’ve seen and clicked on in our app

预测用户对某个活动是否感兴趣。

竞赛官网：
https://www.kaggle.com/c/event-recommendation-engine-challenge/data

user_friends.csv文件：共2维特征
user：用户ID
friends：以空格隔开的用户好友ID列表

# 导入工具包

In [43]:
import pandas as pd

import numpy as np
import scipy.sparse as ss
import scipy.io as sio

#保存数据
import pickle

from sklearn.preprocessing import normalize

总的用户数目超过训练集和测试集中的用户，
为节省处理时间和内存，先处理train和test，得到竞赛需要用到的活动和用户，
然后对在训练集和测试集中出现过的事件和用户建立新的ID索引。
需要先运行user_event.ipynb，
得到用户索引文件：PE_userIndex.pkl

# 读取之前算好的测试集和训练集中出现过的用户

In [44]:
# Load the index of users that appear in the training and test sets.
# userIndex is looked up by user ID and yields that user's matrix index.
# PE_userIndex.pkl is generated by an earlier notebook; pickle must only be
# used on files you produced yourself.
with open("PE_userIndex.pkl", 'rb') as userIndexFile:
    userIndex = pickle.load(userIndexFile)
n_users = len(userIndex)

print("number of users in train & test :%d" % n_users)

number of users in train & test :3391


# 读取之前用户-活动分数矩阵，将朋友参加活动的影响扩展到用户

In [45]:
# User-event relation (score) matrix, read from Matrix Market format.
userEventScores = sio.mmread("PE_userEventScores")

# Per-user event sets, intended for propagating the events of a user's
# friends to that user. The context manager closes the file after loading.
with open("PE_eventsForUser.pkl", 'rb') as eventsForUserFile:
    eventsForUser = pickle.load(eventsForUserFile)

In [46]:
# Inspect the freshly loaded user -> events mapping and its container type.
print(eventsForUser)
print(type(eventsForUser))

defaultdict(<class 'set'>, {1185: {5184, 2596, 6926, 3575, 3483, 8414}, 384: {1534, 1738, 5678, 8369, 7221, 3255, 4670}, 1487: {5345, 11010, 3587, 7184, 3701, 2296}, 2891: {1928, 7408, 12981, 6233, 8730, 8121}, 2905: {7592, 12506, 12045, 6323, 7542, 5306}, 2845: {4800, 3233, 1926, 423, 12263, 9063, 9649, 977, 9622, 7702, 5976, 10486, 2331, 8408}, 593: {12640, 1698, 9287, 4589, 3995, 3771}, 228: {11904, 8101, 9063, 2569, 3025, 13367}, 2294: {2784, 5134, 5299, 8758, 1944, 11898, 13308}, 269: {1440, 9608, 2859, 12812, 1523, 10781}, 2777: {768, 3650, 12186, 5435, 4636, 1183}, 2401: {3270, 362, 3153, 1589, 12028, 7869}, 824: {3013, 1190, 9100, 4557, 5051, 3581}, 2558: {9985, 6668, 6926, 11920, 146, 11930, 6828, 2866, 310, 4156, 5184, 10182, 10955, 7755, 10450, 7381, 5718, 12634, 9063, 2164, 3575}, 2926: {3043, 2820, 13004, 6290, 12855, 8729}, 3093: {5184, 9063, 6668, 1965, 6926, 3575, 5435, 4156, 3133}, 3166: {1058, 9031, 6857, 5171, 8213}, 3016: {33, 3463, 12107, 3980, 3918, 509}, 709: {11

In [47]:
# Show only the container type (the full dump is too long to read).
print(type(eventsForUser))

<class 'collections.defaultdict'>


# user_friends.csv

In [48]:
# Read user_friends.csv and build two friend-based features.
#
# Two intuitions motivate these features:
#   1) a user with more friends may be more outgoing and therefore more
#      likely to attend events;
#   2) if a user's friends attend an event, the user may follow them.

# numFriends[i]: number of friends listed for the user with index i.
numFriends = np.zeros((n_users))
# userFriends[i, j]: accumulated influence score between users i and j.
userFriends = ss.dok_matrix((n_users, n_users))

# Convert once to CSR so that extracting a row per friend inside the loop is
# cheap; userEventScores itself is left untouched.
userEventScoresCsr = ss.csr_matrix(userEventScores)

# The context manager closes the file even if parsing raises.
with open("user_friends.csv", 'r') as fin:
    # columns: user, friends
    fin.readline()                # skip header

    for line in fin:  # one line per user
        cols = line.strip().split(",")
        user = cols[0]

        # Only users present in the train/test user index are relevant.
        if user not in userIndex:
            continue

        # split() with no argument returns [] for an empty friends field,
        # whereas split(" ") returns [""] and would count a phantom friend.
        friends = cols[1].split() if len(cols) > 1 else []
        i = userIndex[user]       # index of this user
        numFriends[i] = len(friends)

        for friend in friends:
            # Skip friends that do not appear in the train/test sets.
            if friend not in userIndex:
                continue
            j = userIndex[friend]   # index of the friend

            # the objective of this score is to infer the degree to
            # and direction in which this friend will influence the
            # user's decision, so we sum the user/event score for
            # this user across all training events.
            #
            # NOTE: this deliberately rebinds eventsForUser (previously the
            # mapping loaded from PE_eventsForUser.pkl) to the friend's
            # dense score row; the inspection cells further down in this
            # file print that row.
            eventsForUser = userEventScoresCsr.getrow(j).todense()

            # Mean score of the friend over all events (row sum divided by
            # the number of event columns).
            score = eventsForUser.sum() / np.shape(eventsForUser)[1]
            userFriends[i, j] += score
            userFriends[j, i] += score


# Normalise the friend counts so that they sum to 1. Skip the division when
# no friends were found at all, which would otherwise produce NaNs.
sumNumFriends = numFriends.sum(axis=0)
if sumNumFriends > 0:
    numFriends = numFriends / sumNumFriends
sio.mmwrite("UF_numFriends", np.matrix(numFriends))

# L2-normalise each column (axis=0) of the friend-influence matrix and save it.
userFriends = normalize(userFriends, norm="l2", axis=0, copy=False)
sio.mmwrite("UF_userFriends", userFriends)


numFriends是每个用户拥有的朋友数目，并除以所有用户朋友数目之和做了归一化  
userFriends是一个3391x3391的矩阵，里面对应的是两个用户之间的score（类似于用户之间的关系）；score是朋友在userEventScores中对所有活动打分的平均值（该行之和除以活动总数），最后按列做了L2归一化  
它表示用户的每个朋友参加活动的分数对该用户的影响

In [57]:
# Display the friend-count vector after it was divided by sumNumFriends.
numFriends

array([2.43341801e-04, 2.26189956e-04, 1.01571082e-04, ...,
       4.40588019e-04, 5.06515423e-05, 6.35154261e-05])

In [58]:
# One entry per user in the train/test user index (length n_users).
numFriends.shape

(3391,)

***测试集和训练集加起来刚好是这么多个users***

In [55]:
# Total of the raw friend counts, i.e. the divisor used to normalise numFriends.
print(sumNumFriends)

3731377.0


In [56]:
# The sum is a NumPy scalar, not a Python float.
print(type(sumNumFriends))

<class 'numpy.float64'>


In [50]:
# After the friends loop, eventsForUser no longer holds the pickled mapping:
# the loop rebound it to the dense score row of the last friend processed.
print(eventsForUser)
print(type(eventsForUser))

[[0. 0. 0. ... 0. 0. 0.]]
<class 'numpy.matrixlib.defmatrix.matrix'>


In [51]:
# Row sum and shape of that last friend's score row; the second shape entry
# is the divisor used when computing the per-friend score.
print(eventsForUser.sum())
print(np.shape(eventsForUser))

1.0
(1, 13418)


In [52]:
# Non-zero entries of the friend-influence matrix after column-wise L2 normalisation.
print(userFriends)

  (2750, 0)	1.0
  (1822, 3)	0.816496580927726
  (2920, 3)	0.408248290463863
  (463, 3)	0.408248290463863
  (543, 5)	0.8944271909999159
  (808, 5)	0.4472135954999579
  (1868, 11)	1.0
  (2573, 12)	1.0
  (3233, 16)	1.0
  (381, 17)	0.7071067811865475
  (3289, 17)	0.7071067811865475
  (1063, 18)	1.0
  (1047, 19)	1.0
  (1059, 20)	0.5547001962252291
  (1460, 20)	0.8320502943378437
  (1109, 22)	0.485071250072666
  (751, 22)	0.485071250072666
  (1231, 22)	0.7276068751089989
  (101, 23)	1.0
  (282, 28)	1.0
  (278, 29)	1.0
  (695, 30)	1.0
  (2860, 31)	0.7999999999999999
  (1486, 31)	0.6
  (490, 34)	1.0
  :	:
  (1655, 3376)	0.38729833462074165
  (1727, 3376)	0.12909944487358058
  (1983, 3376)	0.12909944487358058
  (2024, 3376)	0.25819888974716115
  (2123, 3376)	0.38729833462074165
  (1773, 3376)	0.25819888974716115
  (768, 3376)	0.25819888974716115
  (1530, 3376)	0.25819888974716115
  (903, 3376)	0.25819888974716115
  (1740, 3376)	0.38729833462074165
  (1446, 3376)	0.38729833462074165
  (1288, 337

In [59]:
# Square matrix: n_users x n_users.
userFriends.shape

(3391, 3391)

In [53]:
# Non-zero entries of the user-event score matrix read from PE_userEventScores.
print(userEventScores)

  (1185, 3575)	1.0
  (384, 4670)	1.0
  (384, 1738)	1.0
  (384, 7221)	1.0
  (1487, 5345)	1.0
  (2891, 12981)	1.0
  (2905, 7542)	1.0
  (2845, 8408)	1.0
  (2845, 423)	1.0
  (2845, 9649)	1.0
  (2845, 7702)	1.0
  (593, 12640)	1.0
  (593, 4589)	1.0
  (228, 3025)	1.0
  (2294, 8758)	1.0
  (2294, 1944)	1.0
  (2294, 5299)	1.0
  (269, 9608)	1.0
  (2777, 5435)	1.0
  (2401, 3153)	1.0
  (824, 3581)	1.0
  (2558, 2164)	1.0
  (2558, 5184)	1.0
  (2558, 3575)	1.0
  (2558, 10955)	1.0
  :	:
  (1429, 11384)	1.0
  (3011, 13316)	1.0
  (303, 911)	1.0
  (767, 13333)	1.0
  (779, 5816)	1.0
  (3048, 268)	1.0
  (3048, 10125)	1.0
  (3048, 5054)	1.0
  (3048, 8008)	1.0
  (3048, 12377)	1.0
  (1605, 4858)	1.0
  (1886, 7338)	1.0
  (1886, 11874)	1.0
  (1886, 3002)	1.0
  (1886, 5809)	1.0
  (1886, 12158)	1.0
  (1886, 5805)	1.0
  (435, 3445)	1.0
  (1127, 6305)	1.0
  (1127, 67)	1.0
  (1127, 4793)	1.0
  (2730, 4484)	1.0
  (2730, 10883)	1.0
  (2039, 12931)	1.0
  (3372, 1715)	1.0
