In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the MovieLens ratings and discard the timestamp column in one chain.
data = pd.read_csv('./movie_lens/ratings.csv').drop(columns='timestamp')
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Show the column names, dtypes and non-null counts of the ratings frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
userId     100836 non-null int64
movieId    100836 non-null int64
rating     100836 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


#### 接下来的两个数据，仅用来查看数据，不参加操作

In [3]:
# Per-user statistics: how many movies each user has rated.
user_number = data['movieId'].groupby(data['userId']).count()
user_number.describe()

count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
Name: movieId, dtype: float64

In [4]:
# Per-movie statistics: how many users have rated each movie.
movie_number = data['userId'].groupby(data['movieId']).count()
movie_number.describe()

count    9724.000000
mean       10.369807
std        22.401005
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
Name: userId, dtype: float64

#### 做一个数据集用来后期使用，类型为：
```
userId_1 userId_2 movieIds
```

`抽出所有的共同评分的数据信息`

In [5]:
# Build the shared-movie file common.txt.
#
# For every ordered pair of distinct users (j, i) with ids 1..20, one CSV row
# "j,i,<movie ids>" is written, where <movie ids> are the movies rated by both
# users, each followed by a single space, in the order they appear in user j's
# ratings. Pairs without shared movies get an empty third field.
USER_IDS = range(1, 21)  # only the first 20 users are compared

# Filter the ratings once per user instead of once per pair / per movie.
movies_by_user = {
    user_id: data.loc[data['userId'] == user_id, 'movieId'].tolist()
    for user_id in USER_IDS
}

# `with` guarantees the file is closed even if the loop raises.
with open('common.txt', 'w', encoding='utf8') as f:
    f.write('userId_1,userId_2,movieIds\n')
    for j in USER_IDS:          # j: the comparing user (userId_1)
        for i in USER_IDS:      # i: the compared user (userId_2)
            if j == i:
                continue
            rated_by_i = set(movies_by_user[i])  # O(1) membership tests
            f.write(str(j) + ',' + str(i) + ',')
            for m_id in movies_by_user[j]:
                # Write only the movies that user i has rated as well.
                if m_id in rated_by_i:
                    f.write(str(m_id) + ' ')
            f.write('\n')

#### 导入做好的数据集

In [12]:
# Load the shared-movie table; the file is comma-separated with a header row.
wangluo = pd.read_csv('common.txt')
wangluo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 3 columns):
userId_1    380 non-null int64
userId_2    380 non-null int64
movieIds    348 non-null object
dtypes: int64(2), object(1)
memory usage: 9.0+ KB


In [13]:
# Replace missing values with 0 so that later steps can test for 0
# instead of handling NaN.
wangluo = wangluo.fillna(value=0)
wangluo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 3 columns):
userId_1    380 non-null int64
userId_2    380 non-null int64
movieIds    380 non-null object
dtypes: int64(2), object(1)
memory usage: 9.0+ KB


#### 获取两个用户的评分
主要变量及命名：

wangluo:信任网络数据（未加工）

index:用户号及共同项目数数据

rating:两用户对同项目的评分数据

In [14]:
# Number of rows (user pairs) in the shared-movie table.
len(wangluo)

380

In [15]:
# Build two parallel structures from the shared-movie table `wangluo`:
#   index  - one [userId_1, userId_2, shared_movie_count] entry per row
#   rating - one [rating_1, rating_2] entry per shared movie, appended in the
#            same order as the rows, so `index` tells how many consecutive
#            `rating` entries belong to each pair
# The movie ids are stored as one space-separated string and must be split
# before they can be iterated.
index = []
rating = []

# One dictionary lookup per (userId, movieId) replaces re-filtering `data`
# for every shared movie.
rating_by_user_movie = dict(zip(
    zip(data['userId'].tolist(), data['movieId'].tolist()),
    data['rating'].tolist(),
))

length = len(wangluo)
for num in range(length):
    row = wangluo.iloc[num]
    # Columns are read by position: user 1, user 2, shared movie ids.
    userid_1 = int(row.iloc[0])
    userid_2 = int(row.iloc[1])
    shared = row.iloc[2]
    # No shared movies: the field is 0 after fillna(0), or NaN if it was
    # not filled.
    if pd.isna(shared) or shared == 0:
        index.append([userid_1, userid_2, 0])
        continue
    movie_common = str(shared).split()
    index.append([userid_1, userid_2, len(movie_common)])
    # Record both users' ratings for every shared movie.
    for movie_id in movie_common:
        movie_key = int(movie_id)
        # The rating column is float64; keep it as float, because int() would
        # truncate any fractional rating (e.g. 4.5 -> 4) and distort the
        # rating differences used for the trust value.
        rating_1 = float(rating_by_user_movie[(userid_1, movie_key)])
        rating_2 = float(rating_by_user_movie[(userid_2, movie_key)])
        rating.append([rating_1, rating_2])

In [9]:
# `movie_common` is whatever the loop that builds `index`/`rating` assigned
# last, i.e. the shared-movie list of the last pair that had any.
len(movie_common)

2

#### 进行信任值计算

计算信任值的数学表示

$$T = 1 - \frac{\sum_{m \in M} \lvert r_{1,m} - r_{2,m} \rvert}{5 \cdot \lvert M \rvert}$$

其中 $M$ 为两用户的共同评分项目集合（即 `movie_common`），$r_{1,m}$、$r_{2,m}$ 分别为两用户对项目 $m$ 的评分，$5$ 为评分上限。

In [16]:
# This function returns nothing; it writes its results to a file (xinren.txt by default).
def xinren(index, rating, max_rating=5, path='xinren.txt'):
    '''
    Compute a trust value for every user pair and write the results as CSV.

    For a pair that shares n > 0 movies the trust value is
        credit = round(1 - sum(|r1 - r2|) / (max_rating * n), 4)
    where the sum runs over the pair's n rating entries. Pairs with no shared
    movies are not written to the file.

    index:      list of [user_1, user_2, shared_count] entries, e.g. [1, 2, 2]
    rating:     flat list of [rating_1, rating_2] entries; each index entry
                consumes its next `shared_count` items, in order
    max_rating: normalising constant for one rating difference (default 5)
    path:       output file, overwritten on every call (default 'xinren.txt')

    Raises IndexError if `rating` holds fewer entries than `index` requires.
    '''
    cursor = 0  # position of the next unread entry in `rating`

    # `with` closes the file even if a malformed entry raises.
    with open(path, 'w', encoding='utf8') as f:
        f.write('user_1,user_2,credit\n')
        for pair in index:
            user_1, user_2, count = pair[0], pair[1], pair[2]
            if count == 0:
                continue
            # Every pair with shared movies consumes exactly `count` entries.
            # Skipping a pair here (for instance because its count equals the
            # previous pair's) would leave the cursor behind and score all
            # later pairs against the wrong ratings.
            chunk = rating[cursor:cursor + count]
            if len(chunk) < count:
                raise IndexError(
                    'rating has too few entries for pair (%s, %s)' % (user_1, user_2)
                )
            cursor += count
            diff_sum = sum(abs(entry[0] - entry[1]) for entry in chunk)
            credit = round(1 - diff_sum / (max_rating * count), 4)
            f.write(str(user_1) + ',' + str(user_2) + ',' + str(credit) + '\n')
# print(index)
# print('\n计算得到的信任值为：\n')
# print(T)

# Compute the trust values for the user pairs in `index` and write them to xinren.txt.
xinren(index,rating)

[[1, 2, 2], [1, 3, 7], [1, 4, 45], [1, 5, 13], [1, 6, 33], [1, 7, 26], [1, 8, 15], [1, 9, 5], [1, 10, 6], [1, 11, 16], [1, 12, 2], [1, 13, 6], [1, 14, 13], [1, 15, 25], [1, 16, 26], [1, 17, 38], [1, 18, 70], [1, 19, 123], [1, 20, 33], [2, 1, 2], [2, 3, 0], [2, 4, 1], [2, 5, 1], [2, 6, 2], [2, 7, 3], [2, 8, 1], [2, 9, 0], [2, 10, 5], [2, 11, 2], [2, 12, 0], [2, 13, 1], [2, 14, 1], [2, 15, 10], [2, 16, 5], [2, 17, 6], [2, 18, 20], [2, 19, 2], [2, 20, 1], [3, 1, 7], [3, 2, 0], [3, 4, 1], [3, 5, 1], [3, 6, 3], [3, 7, 0], [3, 8, 1], [3, 9, 0], [3, 10, 0], [3, 11, 0], [3, 12, 0], [3, 13, 0], [3, 14, 1], [3, 15, 3], [3, 16, 3], [3, 17, 4], [3, 18, 6], [3, 19, 5], [3, 20, 5], [4, 1, 45], [4, 2, 1], [4, 3, 1], [4, 5, 12], [4, 6, 27], [4, 7, 22], [4, 8, 9], [4, 9, 1], [4, 10, 9], [4, 11, 8], [4, 12, 5], [4, 13, 7], [4, 14, 8], [4, 15, 13], [4, 16, 27], [4, 17, 23], [4, 18, 42], [4, 19, 75], [4, 20, 24], [5, 1, 13], [5, 2, 1], [5, 3, 1], [5, 4, 12], [5, 6, 36], [5, 7, 9], [5, 8, 19], [5, 9, 0], [