In [1]:
import pandas as pd
import numpy as np

In [2]:
user_cols = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
movie_cols = ['MovieID', 'Title', 'Genres']
rating_cols=['UserID','MovieID', 'Rating', 'ts']

user_df = pd.read_csv('ml-1m/users.dat', sep='::',
                      header=None, names=user_cols, engine='python')
movie_df = pd.read_csv('ml-1m/movies.dat', sep='::',
                       header=None, names=movie_cols, engine='python')
rating_df=pd.read_csv('ml-1m/ratings.dat', sep='::',
                       header=None, names=rating_cols, engine='python')

In [3]:
# user_df.sample(5)
# user_df.info()
# user_df.describe()

# movie_df.sample(5)
# movie_df.info()
# movie_df.describe()    # 看出MovieID有断裂情况，跟README描述相符

# rating_df.sample(5)
# rating_df.info()
# rating_df.describe()

## Encode
将数据编码。

### User
邮编zc还不知道怎么用，这里仅使用前四个特征。

In [4]:
user_df.drop(['Zip-code'], axis=1, inplace=True)

gender2id = {gender: idx
             for idx, gender in enumerate(sorted(user_df.loc[:, 'Gender'].unique()))}
age2id = {age: idx
          for idx, age in enumerate(sorted(user_df.loc[:, 'Age'].unique()))}
occupation2id = {occu: idx
                 for idx, occu in enumerate(sorted(user_df.loc[:, 'Occupation'].unique()))}

user_df.loc[:, 'Gender'] = user_df.loc[:, 'Gender'].map(gender2id)
user_df.loc[:, 'Age'] = user_df.loc[:, 'Age'].map(age2id)
user_df.loc[:, 'Occupation'] = user_df.loc[:, 'Occupation'].map(occupation2id)
user_df.loc[:, 'UserID'] -= 1    # 从1开始变成从0开始

user_df.loc[:, 'Age_Gender'] = user_df.loc[:, 'Age'].map(
    str)+user_df.loc[:, 'Gender'].map(str)
agegen2id = {agegen: idx
             for idx, agegen in enumerate(sorted(user_df.loc[:, 'Age_Gender'].unique()))}
user_df.loc[:, 'Age_Gender'] = user_df.loc[:, 'Age_Gender'].map(agegen2id)

user_df.drop(['Age', 'Gender'], axis=1, inplace=True)

user_df.sample(5)

Unnamed: 0,UserID,Occupation,Age_Gender
2519,2519,2,12
5343,5343,1,10
2028,2028,12,4
1923,1923,6,12
1721,1721,0,7


### Movie
对于movie数据，设定是提取出'Year'特征，还要对其他特征进行编码。需要注意的是'Genres'特征有多个取值，需要编码成等长列表；而'Title'特征有两种编码方式，一种是ID编码，另一种是词编码。如果使用ID编码就'MovieID'互相冗余，所以使用词编码。还注意到'Title'中存在副标题的数据，这里删除副标题，只使用首标题。

In [5]:
# year_pat='\([\d]{4}\)'
# second_title_pat='\(\D+\)'
movie_df.loc[:, 'Year'] = movie_df.loc[:, 'Title'].str.extract('({})'.format(
    '\([\d]{4}\)'), expand=False).str.replace('[\(\)]', '').astype('int32')    # 提取年份
movie_df.loc[:, 'Title'] = movie_df.loc[:, 'Title'].str.replace(
    '\ \([\d]{4}\)', '')    # 删除年份
movie_df.loc[:, 'Title'] = movie_df.loc[:, 'Title'].str.replace(
    '\ \(\D+\)', '')    # 删除第二title
movie_df.loc[:, 'Title'] = movie_df.loc[:, 'Title'].str.replace(
    ', (The|An|A)$', '')    # 删掉末尾的冠词

mid2id = {mid: idx
          for idx, mid in enumerate(sorted(movie_df.loc[:, 'MovieID'].unique()))}

# 类别集
genre_set = set()
for genre in movie_df.loc[:, 'Genres'].str.split('|'):
    genre_set.update(genre)
genre_set.add('<PAD>')
# 类别转id
genre2id = {genre: idx for idx, genre in enumerate(genre_set)}
# 多类别转list
genres2list = {genres: [genre2id[genre] for genre in genres.split('|')]
               for idx, genres in enumerate(movie_df.loc[:, 'Genres'].unique())}
# 规整化处理
genres_max = 5    # 设定最大允许的类别数
for genres in genres2list.keys():
    genres2list[genres] = genres2list[genres][:genres_max]    # 超出长度做截断
    for pad_num in range(genres_max-len(genres2list[genres])):    # 需要填充的数量
        genres2list[genres].append(genre2id['<PAD>'])

# 单词集
word_set = set()
for word in movie_df.loc[:, 'Title'].str.split():
    word_set.update(word)
word_set.add('<PAD>')
word2id = {word: idx for idx, word in enumerate(word_set)}
title2list = {title: [word2id[word] for word in title.split()]
              for idx, title in enumerate(movie_df.loc[:, 'Title'].unique())}
# 规整化处理
title_max = 8
for title in title2list.keys():
    title2list[title] = title2list[title][:title_max]    # 超出长度做截断
    for pad_num in range(title_max-len(title2list[title])):
        title2list[title].append(word2id['<PAD>'])

year2id = {year: idx
           for idx, year in enumerate(sorted(movie_df.loc[:, 'Year'].unique()))}

movie_df.loc[:, 'MovieID'] = movie_df.loc[:, 'MovieID'].map(mid2id)
movie_df.loc[:, 'Title'] = movie_df.loc[:, 'Title'].map(title2list)
movie_df.loc[:, 'Genres'] = movie_df.loc[:, 'Genres'].map(genres2list)
movie_df.loc[:, 'Year'] = movie_df.loc[:, 'Year'].map(year2id)

movie_df.sample(5)

Unnamed: 0,MovieID,Title,Genres,Year
3235,3235,"[1145, 1387, 176, 176, 176, 176, 176, 176]","[0, 14, 16, 16, 16]",58
103,103,"[2703, 3136, 1407, 260, 176, 176, 176, 176]","[14, 7, 16, 16, 16]",75
2323,2323,"[2623, 316, 176, 176, 176, 176, 176, 176]","[4, 14, 16, 16, 16]",78
3483,3483,"[2607, 176, 176, 176, 176, 176, 176, 176]","[4, 16, 16, 16, 16]",60
351,351,"[2258, 176, 176, 176, 176, 176, 176, 176]","[9, 4, 16, 16, 16]",74


In [17]:
title2list['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb']

[2432, 2878, 4252, 1142, 416, 2468, 1746, 940, 2037, 2215, 1944, 747, 306]

In [15]:
# len(word2id)
# len(genre2id)
# len(year2id)

81

### Merge
根据ratings表将所有数据合并起来。

In [6]:
rating_df.loc[:, 'UserID'] -= 1
rating_df.loc[:, 'MovieID'] = rating_df.loc[:, 'MovieID'].map(mid2id)
rating_df.drop(['ts'], axis=1, inplace=True)

data = pd.merge(pd.merge(user_df, rating_df), movie_df)
data = data[['UserID', 'Age_Gender', 'Occupation',
             'MovieID', 'Title', 'Genres', 'Year', 'Rating']]
data = data.sample(frac=1)     # shuffle

data.sample(5)

Unnamed: 0,UserID,Age_Gender,Occupation,MovieID,Title,Genres,Year,Rating
8023,5579,9,7,1179,"[1458, 960, 176, 176, 176, 176, 176, 176]","[2, 8, 4, 7, 16]",67,5
16380,2859,7,17,1022,"[1990, 3136, 2187, 176, 176, 176, 176, 176]","[12, 16, 16, 16, 16]",45,5
3672,3148,3,4,3339,"[540, 1503, 176, 176, 176, 176, 176, 176]","[14, 16, 16, 16, 16]",80,5
368999,3096,4,5,1257,"[1136, 1230, 792, 176, 176, 176, 176, 176]","[2, 14, 7, 16, 16]",70,5
572983,1178,11,1,3293,"[3270, 3014, 3899, 176, 176, 176, 176, 176]","[4, 0, 14, 16, 16]",55,4


## 保存

In [7]:
n_samples = len(data)
train_ratio = 0.8
cut_idx = int(n_samples*train_ratio)
train_df, test_df = data[:cut_idx], data[cut_idx:]
np.save('train.npy', train_df.values)
np.save('test.npy', test_df.values)

## 打包数据类

In [None]:
class MLData:
    def __init__(self, path, batch_size=32, shuffle=True):
        self._data = list()
        self._target = list()
        self._n_samples = 0
        self._n_features = 0

        self._idx = 0    # mini-batch的游标
        self._batch_size = batch_size

        self._load(path)

        if shuffle:
            self._shuffle_data()

        print(self._data.shape, self._target.shape)

    def _load(self, path):
        tmp = np.load(path, allow_pickle=True)
        self._data = tmp[:, :-1]
        self._target = tmp[:, -1]

        self._n_samples, self.n_features = self._data.shape[0], self._data.shape[1]

    def _shuffle_data(self):
        '''
        打乱数据
        '''
        idxs = np.random.permutation(self._n_samples)
        self._data = self._data[idxs]
        self._target = self._target[idxs]

    def next_batch(self):
        '''
        生成mini-batch
        '''
        while self._idx < self._n_samples:
            yield self._data[self._idx: (self._idx+self._batch_size)], self._target[self._idx: (self._idx+self._batch_size)]
            self._idx += self._batch_size

        self._idx = 0
        self._shuffle_data()

    @property
    def u_id(self):
        return np.array(self._data[:, 0], dtype=np.int32)

    @property
    def u_occu(self):
        return np.array(self._data[:, 2], dtype=np.int32)

    @property
    def u_age_gender(self):
        return np.array(self._data[:, 1],dtype=np.int32)

    @property
    def m_id(self):
        return np.array(self._data[:, 3], dtype=np.int32)

    @property
    def m_title(self):
        return self._data[:, 4]

    @property
    def m_genres(self):
        return self._data[:, 5]

    @property
    def m_year(self):
        return np.array(self._data[:, 6], dtype=np.int32)