In [2]:
import pandas as pd
import numpy as np
import os
import math
import pickle
from itertools import permutations
from sklearn.model_selection import train_test_split

In [35]:
#导入数据
X_train=pd.read_csv('train.csv')
X_test=pd.read_csv('test.csv')


In [36]:

X_train_uid = np.array(X_train['uid'])
X_train_iid = np.array(X_train['iid'])
Y_train_score = np.array(X_train['score']).astype("float32")
X_test_uid = np.array(X_test['uid'])
X_test_iid = np.array(X_test['iid'])
X_test.head


<bound method NDFrame.head of            uid    iid
0            0  12960
1            1  12726
2            1  11463
3            1  10739
4            1   3441
5            1    301
6            1  13291
7            1   2814
8            1   2857
9            2  12860
10           2  11091
11           2  13057
12           3   8992
13           3  11082
14           3   2665
15           3  12570
16           3  13410
17           3  12714
18           3  14649
19           3   2635
20           4  14339
21           4  13000
22           5   1326
23           5   2308
24           5   1934
25           5   2405
26           5  13509
27           5  12362
28           5   7636
29           5   5155
...        ...    ...
546166  223267  12181
546167  223267   3569
546168  223277  11865
546169  223686  12983
546170  223842   1801
546171  223842   1418
546172  223842    146
546173  223842   3033
546174  223842    282
546175  223842   2883
546176  223842   2161
546177  223842  10018
54

In [37]:
def generate_user_item_matrix(train):
        users = train.uid.unique()
        products = train.iid.unique()
        uid_iid_mat = np.zeros((users.shape[0],products.shape[0]), dtype=np.int8)
        uid_iid_mat = pd.DataFrame(uid_iid_mat, index=users, columns=products)
        train = train.drop_duplicates()
        for index, row in train.iterrows():  # 获取每行的index、row
            uid_iid_mat.loc[row['uid'], row['iid']] = row['score'] # 把结果返回给data
        return uid_iid_mat


def cosine_sim(rate_mat, i, j):
    a = rate_mat[:, i]
    b = rate_mat[:, j]
    m = np.dot(a, b)
    n = np.sqrt(np.dot(a, a) * np.dot(b, b))
    return m/float(n)


def cosine_sim_s(rate_mat, i, j):
    a = rate_mat[:, i]
    b = rate_mat[:, j]
    intersection = a * b
    if intersection[intersection != 0].size == 0:
        return 0.0

    c = a[a != 0]  # 评价物品i的所有用户评分
    d = b[b != 0]
    p = np.mean(c)  # 物品i的所有用户评分均值
    q = np.mean(d)

    m = np.dot(a[intersection != 0] - p, b[intersection != 0] - q)
    n = np.sqrt(np.dot(c - p, c - p) * np.dot(d - q, d - q))
    if n == 0:
        return 0.0
    return m / float(n)


def pearson(rate_mat, i, j):
    a = rate_mat[:, i]
    b = rate_mat[:, j]
    intersection = a * b
    if intersection[intersection != 0].size == 0:
        return 0.0

    c = a[intersection != 0]  # 评价物品i的公共用户评分
    d = b[intersection != 0]
    p = np.mean(a[a != 0])  # 物品i的所有用户评分均值
    q = np.mean(b[b != 0])

    m = np.dot(c - p, d - q)
    n = np.sqrt(np.dot(c - p, c - p) * np.dot(d - q, d - q))
    if n == 0:
        return 0.0
    return m / float(n)


def get_rate_cos(rate_mat, n_iid, function):
        shapes = [n_iid, n_iid]
        rate_cos = np.zeros(shapes)
        for i in range(shapes[0]):
            for j in range(shapes[1]):
                if i == j:
                    rate_cos[i, j] = 1
                elif rate_cos[j, i] != 0:
                    rate_cos[i, j] = rate_cos[j, i]
                else:
                    rate_cos[i, j] = eval(function)(np.array(rate_mat), i, j)
        iid_index = rate_mat.columns
        rate_cos = pd.DataFrame(rate_cos, index=iid_index, columns=iid_index)
        return rate_cos


def recommendation_s(uid, iid, iid_iid_sim, rate_mat, k=10):
    score = 0
    weight = 0
    iid_sim = iid_iid_sim.loc[iid,:].values #商品iid对应所有商品的相似度
    uid_action = rate_mat.loc[uid,:].values #用户uid对应所有商品的行为评分
    iid_action = rate_mat.loc[:,iid].values #物品iid得到的所有用户的评分
    sim_indexs = np.argsort(iid_sim)[-(k+1):-1] #最相似的k个物品的index

    iid_i_mean = np.sum(iid_action)/iid_action[iid_action != 0].size
    for j in sim_indexs:
        if uid_action[j] != 0:
            iid_j_action = rate_mat.values[:,j]
            iid_j_mean = np.sum(iid_j_action)/iid_j_action[iid_j_action != 0].size
            score += iid_sim[j] * (uid_action[j] - iid_j_mean)
            weight += abs(iid_sim[j])
    print(iid_i_mean, score, weight)
    if weight == 0:
        return iid_i_mean
    else:
        return iid_i_mean + score/float(weight)


def pred(num, k, iid_index, iid_iid_sim, rate_mat):
    result = np.zeros(num[0])
    count = 0
    for i in range(num[0]):
        a = X_test.iloc[i,0] #取uid
        b = X_test.iloc[i,1] #iid
        if b not in iid_index:
            result[i] = 3
            count = count + 1
        else:
            result[i] = recommendation_s(a, b, iid_iid_sim,rate_mat, k)
    return result


if __name__ == '__main__':
    X_train['iid'] = X_train['iid'].apply(str)
    X_train['uid'] = X_train['uid'].apply(str)
    rate_mat = generate_user_item_matrix(X_train).fillna(0)
    n_iid = rate_mat.shape[1]
    rate_cos = get_rate_cos(rate_mat, n_iid, 'cosine_sim').fillna(0)
    iid_index = rate_mat.columns
    # 开始预测
    num = X_test.shape
    result = pred(num, 5, iid_index, rate_cos, rate_mat)
    Y_test_score = pd.DataFrame(np.array(result), columns=['score'])
    # 把data中score写入to.csv文件中。
    Y_test_score.to_csv('score_0201.csv', index=False, columns=['score'])
    

KeyboardInterrupt: 

### 基于矩阵分解的解法
* SVD
* NMF
* RSVD
* SVD++
* SVDfeature
* libmf
* libfm

In [2]:
from surprise import SVD, SVDpp, BaselineOnly, NMF
from surprise import Dataset, Reader
from surprise import evaluate, print_perf
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split
import pandas as pd
import numpy as np

X_trainrow=pd.read_csv('train.csv')
X_testrow=pd.read_csv('test.csv')

In [3]:

X_train=X_trainrow.fillna(3.0)
X_test=X_testrow
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(X_train[['uid','iid','score']],reader)
algo=SVD()
trainset,testset=train_test_split(data,test_size=0.25)
algo.fit(trainset)
predictions = algo.test(testset)


NameError: name 'accuracy' is not defined

In [4]:
from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 0.8552


0.8551983702516954

In [7]:

def pred():
    num=X_test.shape
    result=np.zeros(num[0])
    for i in range(num[0]):
        a=X_test.iloc[i,0]
        b=X_test.iloc[i,1]
        result[i]=algo.predict(a,b).est
    return result

result=pred()
Y_test_score=pd.DataFrame(np.array(result),columns=['score'])
Y_test_score.to_csv('score_0302.csv',index=False,columns=['score'])

In [8]:
Y_test_score.head

<bound method NDFrame.head of            score
0       3.374214
1       3.033342
2       2.840727
3       2.828560
4       3.020992
5       3.369343
6       2.693649
7       2.655401
8       3.358118
9       3.208779
10      4.349189
11      3.712055
12      4.410074
13      4.734612
14      5.000000
15      4.193359
16      5.000000
17      2.044354
18      3.402113
19      4.039068
20      2.912757
21      3.637723
22      4.367030
23      4.317493
24      4.470900
25      4.600774
26      5.000000
27      4.346885
28      4.352058
29      5.000000
...          ...
546166  3.842212
546167  3.534322
546168  3.336277
546169  3.456218
546170  3.868203
546171  3.842456
546172  3.426389
546173  3.581624
546174  3.184718
546175  3.584050
546176  3.764626
546177  4.001624
546178  3.860460
546179  4.005994
546180  3.015817
546181  3.631522
546182  3.689690
546183  4.162986
546184  2.966850
546185  3.669949
546186  3.402559
546187  4.425407
546188  3.276732
546189  3.572484
546190  3.078473
5

### 基于聚类的推荐

In [80]:
import pandas as pd
import numpy as np

In [82]:
train = pd.read_csv('train.csv')

test = pd.read_csv('test.csv')

In [92]:
rate_rank = train.groupby('uid').mean().loc[:,['score']].iloc[:,-1]

In [124]:
rate_rank.head

<bound method NDFrame.head of        group
uid         
0          5
1          6
2          7
3          6
4          6
5          9
6          6
7          7
8          6
9          6
10         6
11         5
12         6
13         7
14         3
15         5
16         6
17         7
18         6
19         7
20         6
21         6
22         6
23         7
24         6
25         6
26         8
27         5
28         5
29         7
...      ...
10743      7
10744      6
10745      6
10748      7
10749      5
10750      7
10751      6
10752      7
10753      7
10754      7
10755      7
10756      8
10758      7
10759      7
10760      7
10761      7
10762      7
10763      7
10764      6
10766      7
10767      7
10768      7
10769      7
10770      7
10771      5
10772      8
10773      6
10774      7
10775      8
10776      6

[9225 rows x 1 columns]>

In [93]:
rate_rank = pd.DataFrame(np.int32((rate_rank*2).values), index=rate_rank.index, columns=['group'])

In [86]:
rate_rank.head

<bound method NDFrame.head of        group
uid         
0          5
1          6
2          7
3          6
4          6
5          9
6          6
7          7
8          6
9          6
10         6
11         5
12         6
13         7
14         3
15         5
16         6
17         7
18         6
19         7
20         6
21         6
22         6
23         7
24         6
25         6
26         8
27         5
28         5
29         7
...      ...
10743      7
10744      6
10745      6
10748      7
10749      5
10750      7
10751      6
10752      7
10753      7
10754      7
10755      7
10756      8
10758      7
10759      7
10760      7
10761      7
10762      7
10763      7
10764      6
10766      7
10767      7
10768      7
10769      7
10770      7
10771      5
10772      8
10773      6
10774      7
10775      8
10776      6

[9225 rows x 1 columns]>

In [87]:
train.shape

(3159974, 4)

In [90]:
rate_rank_des = rate_rank.reset_index()

In [94]:
rate_rank

Unnamed: 0_level_0,group
uid,Unnamed: 1_level_1
0,5
1,6
2,7
3,6
4,6
5,9
6,6
7,7
8,6
9,6


In [97]:
train_plus = pd.merge(train, rate_rank_des, how='left', on='uid')

In [98]:
train_plus.head

<bound method NDFrame.head of            uid    iid  score    time  group
0            0      0    2.0    19.0     10
1            0      8    4.0   273.0     10
2            0     13    1.0   587.0     10
3            0     18    3.0    15.0     10
4            0     34    3.0    17.0     10
5            0     38    4.0    37.0     10
6            0     44    5.0   245.0     10
7            0     59    2.0   308.0     10
8            0    115    5.0   177.0     10
9            0    124    1.0    37.0     10
10           0    164    3.0    43.0     10
11           0    170    3.0   177.0     10
12           0    196    3.0   598.0     10
13           0    301    3.0   308.0     10
14           0    313    4.0   417.0     10
15           0    314    2.0   273.0     10
16           0    322    4.0   417.0     10
17           0    353    4.0    37.0     10
18           0    355    1.0    89.0     10
19           0    356    2.0   177.0     10
20           0    410    3.0   177.0     10
21

In [99]:
test_plus = pd.merge(X_test, rate_rank_des, how='left', on='uid')

In [100]:
res = train_plus.groupby(['iid','group']).mean().reset_index().loc[:, ['iid', 'group', 'score']]
result = pd.merge(test_plus, res, how='left', on=['iid','group']).fillna(3.0)
result.to_csv('score_01.csv', index=False, columns=['score'])

In [101]:
users=train.uid.unique()

In [103]:
users.shape

(9225,)

In [104]:
items=train.iid.unique()

In [106]:
items.shape

(14009,)