In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# 多行输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# 协同过滤

In [2]:
from fastai import *
from fastai.collab import *
from fastai.tabular import *

## 获取数据

In [3]:
path = untar_data(URLs.ML_SAMPLE)
path

  with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)


PosixPath('/home/lab/.fastai/data/movie_lens_sample')

In [4]:
# 数据集包含的文件
path.ls()

[PosixPath('/home/lab/.fastai/data/movie_lens_sample/ratings.csv')]

In [5]:
ratings = pd.read_csv(path/'ratings.csv')
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234
5,580,1196,4.0,1220561546
6,544,2918,5.0,1435787004
7,213,1200,3.0,1462634054
8,176,2571,4.5,1340714691
9,481,4886,4.5,1437002227


In [6]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,6031.0,6031.0,6031.0,6031.0
mean,350.269773,1892.725419,3.807826,1127659000.0
std,194.409989,4483.290497,0.961857,159180100.0
min,15.0,1.0,0.5,853892800.0
25%,176.0,457.0,3.0,976424300.0
50%,358.0,1089.0,4.0,1111489000.0
75%,518.0,2028.0,4.5,1232810000.0
max,665.0,58559.0,5.0,1473804000.0


In [7]:
ratings.userId.unique().shape
ratings.movieId.unique().shape

(100,)

(100,)

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6031 entries, 0 to 6030
Data columns (total 4 columns):
userId       6031 non-null int64
movieId      6031 non-null int64
rating       6031 non-null float64
timestamp    6031 non-null int64
dtypes: float64(1), int64(3)
memory usage: 188.5 KB


In [9]:
# 创建数据束
data = CollabDataBunch.from_df(ratings, seed=42)
data

TabularDataBunch;

Train: LabelList (4825 items)
x: CollabList
userId 73; movieId 1097; ,userId 561; movieId 924; ,userId 157; movieId 260; ,userId 358; movieId 1210; ,userId 130; movieId 316; 
y: FloatList
4.0,3.5,3.5,5.0,2.0
Path: .;

Valid: LabelList (1206 items)
x: CollabList
userId 306; movieId 2628; ,userId 605; movieId 3793; ,userId 313; movieId 4886; ,userId 468; movieId 1136; ,userId 380; movieId 539; 
y: FloatList
3.0,2.0,4.5,4.0,1.5
Path: .;

Test: None

## 创建学习器

In [10]:
doc(collab_learner)

In [11]:
y_range = [0, 5]
learn = collab_learner(data, n_factors=50, y_range=y_range)

In [12]:
learn

CollabLearner(data=TabularDataBunch;

Train: LabelList (4825 items)
x: CollabList
userId 73; movieId 1097; ,userId 561; movieId 924; ,userId 157; movieId 260; ,userId 358; movieId 1210; ,userId 130; movieId 316; 
y: FloatList
4.0,3.5,3.5,5.0,2.0
Path: .;

Valid: LabelList (1206 items)
x: CollabList
userId 306; movieId 2628; ,userId 605; movieId 3793; ,userId 313; movieId 4886; ,userId 468; movieId 1136; ,userId 380; movieId 539; 
y: FloatList
3.0,2.0,4.5,4.0,1.5
Path: .;

Test: None, model=EmbeddingDotBias(
  (u_weight): Embedding(101, 50)
  (i_weight): Embedding(101, 50)
  (u_bias): Embedding(101, 1)
  (i_bias): Embedding(101, 1)
), opt_func=functools.partial(<class 'torch.optim.adam.Adam'>, betas=(0.9, 0.99)), loss_func=FlattenedLoss of MSELoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(<class 'fastai.basic_train.Recorder'>, add_time=True)], callbacks=[], layer_groups=[Sequential(
  (0): Em

## stage1

In [13]:
learn.fit_one_cycle(4, 5e-3)

epoch,train_loss,valid_loss,time
0,2.292531,1.674721,00:00
1,1.018198,0.687767,00:00
2,0.722456,0.660675,00:00
3,0.609247,0.654943,00:00


## 基本原理

1. 为用户和电影分别建立 Embedding 矩阵，如每个用户用5个数字表示，每个电影也用5个数字表示
2. 对某个用户评分的预测，也就是用户所表示的(1,5)向量乘以表示电影的(5,1)向量
3. 为了得到 Embedding 矩阵，我们使用线性模型，并用MSE或者RMSE作为损失函数，使用SGD训练
4. 使用 y_range 是为了使用先验知识，对预测结果进行限制，也相当于加了一个小的神经网络

```python
def trunc_normal_(x:Tensor, mean:float=0., std:float=1.) -> Tensor:
    "Truncated normal initialization."
    # From https://discuss.pytorch.org/t/implementing-truncated-normal-initializer/4778/12
    return x.normal_().fmod_(2).mul_(std).add_(mean)

def embedding(ni:int,nf:int) -> nn.Module:
    "Create an embedding layer."
    emb = nn.Embedding(ni, nf)
    # See https://arxiv.org/abs/1711.09160
    with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)
    return emb

class EmbeddingDotBias(nn.Module):
    "Base dot model for collaborative filtering."
    def __init__(self, n_factors:int, n_users:int, n_items:int, y_range:Tuple[float,float]=None):
        super().__init__()
        self.y_range = y_range
        (self.u_weight, self.i_weight, self.u_bias, self.i_bias) = [embedding(*o) for o in [
            (n_users, n_factors), (n_items, n_factors), (n_users,1), (n_items,1)
        ]]

    def forward(self, users:LongTensor, items:LongTensor) -> Tensor:
        dot = self.u_weight(users)* self.i_weight(items)
        res = dot.sum(1) + self.u_bias(users).squeeze() + self.i_bias(items).squeeze()
        if self.y_range is None: return res
        return torch.sigmoid(res) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]
```

In [14]:
data.valid_ds[:10]

LabelList (10 items)
x: CollabList
userId             306
movieId           2628
rating               3
timestamp    956082889
Name: 1870, dtype: object,userId             605
movieId           3793
rating               2
timestamp    980174184
Name: 2584, dtype: object,userId              313
movieId            4886
rating              4.5
timestamp    1168878351
Name: 1328, dtype: object,userId              468
movieId            1136
rating                4
timestamp    1296193302
Name: 5083, dtype: object,userId              380
movieId             539
rating              1.5
timestamp    1115007209
Name: 3413, dtype: object
y: FloatList
3.0,2.0,4.5,4.0,1.5
Path: .

In [15]:
u, v = data.train_ds.x.classes.values()
u, v
len(u), len(v)

(array(['#na#', '15', '17', '19', ..., '652', '654', '664', '665'], dtype='<U21'),
 array(['#na#', '1', '10', '32', ..., '6539', '7153', '8961', '58559'], dtype='<U21'))

(101, 101)

In [16]:
data.train_ds.x.classes['userId']

array(['#na#', '15', '17', '19', ..., '652', '654', '664', '665'], dtype='<U21')

In [17]:
sorted(ratings.userId.unique())

[15,
 17,
 19,
 23,
 30,
 48,
 56,
 73,
 77,
 78,
 88,
 95,
 102,
 105,
 111,
 119,
 128,
 130,
 134,
 150,
 157,
 165,
 176,
 187,
 195,
 199,
 212,
 213,
 220,
 232,
 239,
 242,
 243,
 247,
 262,
 268,
 285,
 292,
 294,
 299,
 306,
 311,
 312,
 313,
 346,
 353,
 355,
 358,
 380,
 382,
 384,
 387,
 388,
 402,
 405,
 407,
 423,
 427,
 430,
 431,
 439,
 452,
 457,
 460,
 461,
 463,
 468,
 472,
 475,
 480,
 481,
 505,
 509,
 514,
 518,
 529,
 534,
 537,
 544,
 547,
 561,
 564,
 574,
 575,
 577,
 580,
 585,
 587,
 596,
 598,
 605,
 607,
 608,
 615,
 624,
 648,
 652,
 654,
 664,
 665]

In [18]:
help(sorted)

Help on built-in function sorted in module builtins:

sorted(iterable, /, *, key=None, reverse=False)
    Return a new list containing all items from the iterable in ascending order.
    
    A custom key function can be supplied to customize the sort order, and the
    reverse flag can be set to request the result in descending order.



In [19]:
help(learn.get_preds)

Help on method get_preds in module fastai.basic_train:

get_preds(ds_type:fastai.basic_data.DatasetType=<DatasetType.Valid: 2>, with_loss:bool=False, n_batch:Union[int, NoneType]=None, pbar:Union[fastprogress.fastprogress.MasterBar, fastprogress.fastprogress.ProgressBar, NoneType]=None) -> List[torch.Tensor] method of fastai.collab.CollabLearner instance
    Return predictions and targets on `ds_type` dataset.



In [20]:
learn.get_preds() # pred target

[tensor([3.4290, 3.1171, 4.0019,  ..., 4.0484, 4.3907, 3.9312]),
 tensor([3.0000, 2.0000, 4.5000,  ..., 4.0000, 4.5000, 4.0000])]

In [26]:
learn.get_preds(DatasetType.Valid)

[tensor([3.4290, 3.1171, 4.0019,  ..., 4.0484, 4.3907, 3.9312]),
 tensor([3.0000, 2.0000, 4.5000,  ..., 4.0000, 4.5000, 4.0000])]

In [22]:
data

TabularDataBunch;

Train: LabelList (4825 items)
x: CollabList
userId 73; movieId 1097; ,userId 561; movieId 924; ,userId 157; movieId 260; ,userId 358; movieId 1210; ,userId 130; movieId 316; 
y: FloatList
4.0,3.5,3.5,5.0,2.0
Path: .;

Valid: LabelList (1206 items)
x: CollabList
userId 306; movieId 2628; ,userId 605; movieId 3793; ,userId 313; movieId 4886; ,userId 468; movieId 1136; ,userId 380; movieId 539; 
y: FloatList
3.0,2.0,4.5,4.0,1.5
Path: .;

Test: None