In [1]:
import re
import pandas as pd
import numpy as np

# Load columns 0, 1 and 3 of the tab-separated interaction file and name them
# 'u', 'i' and 't' (presumably user id, item id and timestamp -- confirm
# against the dataset README).
df = pd.read_csv(
    '../data/ml-100k/u.data',
    sep='\t',
    names=['u', 'i', 't'],
    usecols=[0, 1, 3],
)
df

Unnamed: 0,u,i,t
0,196,242,881250949
1,186,302,891717742
2,22,377,878887116
3,244,51,880606923
4,166,346,886397596
...,...,...,...
99995,880,476,880175444
99996,716,204,879795543
99997,276,1090,874795795
99998,13,225,882399156


In [2]:
# Keep only rows whose 'u' value occurs at least 5 times, then -- on the
# already user-filtered frame -- only rows whose 'i' value occurs at least
# 5 times. This is a single pass: the second filter is not fed back into the
# first.
user_counts = df['u'].value_counts()
df = df[df['u'].isin(user_counts[user_counts >= 5].index)]
item_counts = df['i'].value_counts()
df = df[df['i'].isin(item_counts[item_counts >= 5].index)]
df

Unnamed: 0,u,i,t
0,196,242,881250949
1,186,302,891717742
2,22,377,878887116
3,244,51,880606923
4,166,346,886397596
...,...,...,...
99995,880,476,880175444
99996,716,204,879795543
99997,276,1090,874795795
99998,13,225,882399156


In [3]:
# Order the interactions by the 't' column, ascending.
# NOTE(review): sort_values defaults to a non-stable sort, so rows sharing the
# same 't' are not guaranteed to keep their file order; the id assignment in
# the next cell depends on this order.
df = df.sort_values('t')
df

Unnamed: 0,u,i,t
214,259,255,874724710
83965,259,286,874724727
43027,259,298,874724754
21396,259,185,874724781
82655,259,173,874724843
...,...,...,...
52134,729,333,893286638
79208,729,272,893286638
73008,729,313,893286638
46574,729,328,893286638


In [4]:
# Assign contiguous 0-based ids to raw user and item ids in order of first
# appearance in the (sorted) interaction frame.
u2id, i2id = {}, {}
for raw_user, raw_item in zip(df['u'], df['i']):
    # setdefault only inserts when the raw id has not been seen yet, so the
    # next free id is always the current size of the mapping.
    u2id.setdefault(raw_user, len(u2id))
    i2id.setdefault(raw_item, len(i2id))
u_index, i_index = len(u2id), len(i2id)
print('# Users:', u_index)
print('# Items:', i_index)

# Users: 943
# Items: 1349


In [5]:
# Replace the raw ids in both columns with their contiguous ids.
for col, mapping in (('u', u2id), ('i', i2id)):
    df[col] = df[col].map(mapping)
df

Unnamed: 0,u,i,t
214,0,0,874724710
83965,0,1,874724727
43027,0,2,874724754
21396,0,3,874724781
82655,0,4,874724843
...,...,...,...
52134,942,966,893286638
79208,942,1291,893286638
73008,942,1286,893286638
46574,942,299,893286638


In [6]:
# Discard the original row labels and renumber the rows 0..n-1 in their
# current order.
df = df.reset_index(drop=True)
df

Unnamed: 0,u,i,t
0,0,0,874724710
1,0,1,874724727
2,0,2,874724754
3,0,3,874724781
4,0,4,874724843
...,...,...,...
99282,942,966,893286638
99283,942,1291,893286638
99284,942,1286,893286638
99285,942,299,893286638


In [7]:
# Write the remapped interactions as CSV with neither a header row nor an
# index column.
df.to_csv('../data/ml-100k.csv', header=None, index=False)

### The following cells process the user attributes

In [8]:
# Load the first four '|'-separated columns of the user file as 'u', 'a', 'g'
# and 'o' (presumably user id, age, gender and occupation -- confirm against
# the dataset README).
u_df = pd.read_table(
    '../data/ml-100k/u.user',
    sep='|',
    names=['u', 'a', 'g', 'o'],
    usecols=[0, 1, 2, 3],
)
u_df

Unnamed: 0,u,a,g,o
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other
...,...,...,...,...
938,939,26,F,student
939,940,32,M,administrator
940,941,20,M,student
941,942,48,F,librarian


In [9]:
# Keep only the user rows whose raw id is a key of u2id, i.e. users that
# appear in the filtered interaction frame.
u_df = u_df[u_df['u'].isin(u2id.keys())]
u_df

Unnamed: 0,u,a,g,o
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other
...,...,...,...,...
938,939,26,F,student
939,940,32,M,administrator
940,941,20,M,student
941,942,48,F,librarian


In [10]:
# Renumber the rows 0..n-1 after filtering.
u_df = u_df.reset_index(drop=True)
u_df

Unnamed: 0,u,a,g,o
0,1,24,M,technician
1,2,53,F,other
2,3,23,M,writer
3,4,24,M,technician
4,5,33,F,other
...,...,...,...,...
938,939,26,F,student
939,940,32,M,administrator
940,941,20,M,student
941,942,48,F,librarian


In [11]:
# Replace each raw user id with its contiguous id from u2id.
u_df['u'] = u_df['u'].map(u2id)
u_df

Unnamed: 0,u,a,g,o
0,25,24,M,technician
1,722,53,F,other
2,750,23,M,writer
3,921,24,M,technician
4,74,33,F,other
...,...,...,...,...
938,387,26,F,student
939,593,32,M,administrator
940,31,20,M,student
941,845,48,F,librarian


In [12]:
# Order the rows by the remapped user id.
u_df = u_df.sort_values('u')
u_df

Unnamed: 0,u,a,g,o
258,0,21,M,student
850,1,18,M,other
711,2,22,F,student
118,3,32,M,programmer
639,4,20,M,student
...,...,...,...,...
586,938,26,M,other
246,939,28,M,engineer
188,940,32,M,artist
682,941,42,M,librarian


In [13]:
# Renumber the rows 0..n-1 in the sorted order.
u_df = u_df.reset_index(drop=True)
u_df

Unnamed: 0,u,a,g,o
0,0,21,M,student
1,1,18,M,other
2,2,22,F,student
3,3,32,M,programmer
4,4,20,M,student
...,...,...,...,...
938,938,26,M,other
939,939,28,M,engineer
940,940,32,M,artist
941,941,42,M,librarian


In [14]:
# Bucket the 'a' column by integer division by 10 (e.g. 24 -> 2, 53 -> 5).
u_df['a'] = u_df['a'] // 10
u_df

Unnamed: 0,u,a,g,o
0,0,2,M,student
1,1,1,M,other
2,2,2,F,student
3,3,3,M,programmer
4,4,2,M,student
...,...,...,...,...
938,938,2,M,other
939,939,2,M,engineer
940,940,3,M,artist
941,941,4,M,librarian


In [15]:
# Encode the 'g' column as integers: 'F' -> 0, 'M' -> 1. Any other value
# becomes NaN, as Series.map does for keys missing from the dict.
g2id = {'F': 0, 'M': 1}
u_df['g'] = u_df['g'].map(g2id)
u_df

Unnamed: 0,u,a,g,o
0,0,2,1,student
1,1,1,1,other
2,2,2,0,student
3,3,3,1,programmer
4,4,2,1,student
...,...,...,...,...
938,938,2,1,other
939,939,2,1,engineer
940,940,3,1,artist
941,941,4,1,librarian


In [16]:
# Build the occupation vocabulary: each non-empty line of the occupation file
# gets the next 0-based id, in file order.
o2id = dict()
index = 0
with open('../data/ml-100k/u.occupation') as f:
    for line in f:
        # Strip all surrounding whitespace (not only '\n') so CRLF line
        # endings do not leave a trailing '\r' in the key.
        name = line.strip()
        # Skip blank lines (e.g. a trailing newline at end of file) and
        # repeated names so that ids stay contiguous and no '' class is created.
        if not name or name in o2id:
            continue
        o2id[name] = index
        index += 1
o2id

{'administrator': 0,
 'artist': 1,
 'doctor': 2,
 'educator': 3,
 'engineer': 4,
 'entertainment': 5,
 'executive': 6,
 'healthcare': 7,
 'homemaker': 8,
 'lawyer': 9,
 'librarian': 10,
 'marketing': 11,
 'none': 12,
 'other': 13,
 'programmer': 14,
 'retired': 15,
 'salesman': 16,
 'scientist': 17,
 'student': 18,
 'technician': 19,
 'writer': 20}

In [17]:
# Replace each occupation name with its id from o2id.
u_df['o'] = u_df['o'].map(o2id)
u_df

Unnamed: 0,u,a,g,o
0,0,2,1,18
1,1,1,1,13
2,2,2,0,18
3,3,3,1,14
4,4,2,1,18
...,...,...,...,...
938,938,2,1,13
939,939,2,1,4
940,940,3,1,1
941,941,4,1,10


In [18]:
# Flatten each encoded user attribute column to a plain list of labels and
# work out how many one-hot slots each attribute needs.
#
# The slot count is (largest label + 1) rather than nunique(): the labels are
# used directly as indices into an identity matrix, so if any intermediate
# label value happened to be absent from the data, nunique() would be smaller
# than the largest label and the one-hot lookup would go out of bounds.
a_list = u_df['a'].values.tolist()
a_classes = int(u_df['a'].max()) + 1
g_list = u_df['g'].values.tolist()
g_classes = int(u_df['g'].max()) + 1
o_list = u_df['o'].values.tolist()
o_classes = int(u_df['o'].max()) + 1
print(len(a_list), len(g_list), len(o_list))
print(a_classes, g_classes, o_classes)

943 943 943
8 2 21


In [19]:
def onehot(labels, num_classes):
    """One-hot encode a sequence of integer class labels.

    Parameters
    ----------
    labels : sequence of int
        Class labels, each expected to lie in ``[0, num_classes)``.
    num_classes : int
        Width of each one-hot row.

    Returns
    -------
    numpy.ndarray
        Float array with one row per label; row ``k`` is all zeros except for
        a 1.0 at column ``labels[k]``. An empty ``labels`` yields an array of
        shape ``(0, num_classes)`` so the result can still be stacked
        column-wise with other encodings.

    Raises
    ------
    IndexError
        If the labels are not integers or fall outside ``[0, num_classes)``.
        Negative labels are rejected explicitly because plain indexing would
        silently wrap them around to the last classes.
    """
    indices = np.asarray(labels)
    if indices.size == 0:
        return np.zeros((0, num_classes))
    if not np.issubdtype(indices.dtype, np.integer):
        raise IndexError(
            'labels must be integers, got dtype {}'.format(indices.dtype))
    if indices.min() < 0 or indices.max() >= num_classes:
        raise IndexError(
            'labels must lie in [0, {}), got range [{}, {}]'.format(
                num_classes, indices.min(), indices.max()))
    # Build the identity matrix once and pick all rows in a single indexing
    # operation instead of rebuilding it for every label.
    return np.eye(num_classes)[indices]

In [20]:
# One-hot encode the three user attribute label lists with their slot counts.
a_onehots, g_onehots, o_onehots = [
    onehot(label_list, n_slots)
    for label_list, n_slots in (
        (a_list, a_classes),
        (g_list, g_classes),
        (o_list, o_classes),
    )
]
print(a_onehots.shape, g_onehots.shape, o_onehots.shape)

(943, 8) (943, 2) (943, 21)


In [21]:
# Concatenate the three encodings column-wise into one feature row per user.
user_blocks = (a_onehots, g_onehots, o_onehots)
user_array = np.hstack(user_blocks)
user_array.shape

(943, 31)

In [22]:
# Persist the user feature matrix in NumPy's .npy format.
np.save('../data/ml-100k_user_attribute.npy', user_array)

### The following cells process the item attributes

In [23]:
# Load columns 0 and 2 of the '|'-separated item file as 'i' and 't', plus
# columns 5..23 as the 19 flag columns 'c0'..'c18'.
names = ['i', 't']
names.extend('c{}'.format(k) for k in range(19))
usecols = [0, 2, *range(5, 24)]
i_df = pd.read_table(
    '../data/ml-100k/u.item',
    sep='|',
    names=names,
    usecols=usecols,
    encoding='ISO-8859-1',
)
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,1,01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,06-Feb-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,01-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,01-Jan-1994,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Keep only the item rows whose raw id is a key of i2id, i.e. items that
# appear in the filtered interaction frame.
i_df = i_df[i_df['i'].isin(i2id.keys())]
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,1,01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1591,1592,30-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1596,1597,01-Jan-1992,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1597,1598,14-Mar-1997,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1614,1615,02-May-1997,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [25]:
# Renumber the rows 0..n-1 after filtering.
i_df = i_df.reset_index(drop=True)
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,1,01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1592,30-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1345,1597,01-Jan-1992,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1346,1598,14-Mar-1997,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1347,1615,02-May-1997,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [26]:
# Replace each raw item id with its contiguous id from i2id.
i_df['i'] = i_df['i'].map(i2id)
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,266,01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,207,01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,332,01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,68,01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,487,01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1304,30-Jan-1998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1345,860,01-Jan-1992,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1346,1173,14-Mar-1997,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1347,1084,02-May-1997,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [27]:
# Order the rows by the remapped item id.
i_df = i_df.sort_values('i')
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
253,0,20-Jun-1997,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
284,1,15-Nov-1996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
296,2,27-Jun-1997,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
183,3,01-Jan-1960,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
171,4,01-Jan-1987,0,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,1344,10-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1320,1345,17-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
885,1346,27-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
886,1347,27-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
# Renumber the rows 0..n-1 in the sorted order, so that row k describes item k.
i_df = i_df.reset_index(drop=True)
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,0,20-Jun-1997,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,15-Nov-1996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,2,27-Jun-1997,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,3,01-Jan-1960,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,4,01-Jan-1987,0,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1344,10-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1345,1345,17-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1346,1346,27-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1347,1347,27-Mar-1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [29]:
# Reduce the 't' column from strings such as '01-Jan-1995' to their last
# '-'-separated field (the year, still as a string).
#
# Rows that are missing or do not match the pattern get FALLBACK_YEAR. Per the
# original author's note, 1996 is the most frequent year in the data, so it is
# used as the estimate for the missing value.
#
# This is done in one vectorised pass: str.extract yields NaN for missing or
# non-matching rows, which replaces the previous row-by-row loop whose bare
# `except:` would also have hidden unrelated errors.
FALLBACK_YEAR = '1996'
i_df['t'] = (
    i_df['t']
    .str.extract(r'(.*)-(.*)-(.*)', expand=True)[2]
    .fillna(FALLBACK_YEAR)
)
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,0,1997,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1996,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,2,1997,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,3,1960,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,4,1987,0,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1344,1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1345,1345,1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1346,1346,1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1347,1347,1998,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
# Convert the year strings to integers, shift them so the earliest year is 0,
# and bucket the offsets by integer division by 10.
years = i_df['t'].astype(int)
i_df['t'] = (years - years.min()) // 10
i_df

Unnamed: 0,i,t,c0,c1,c2,c3,c4,c5,c6,c7,...,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,0,7,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,2,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,3,3,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
4,4,6,0,1,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1344,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1345,1345,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1346,1346,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1347,1347,7,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [31]:
# Flatten the bucketed 't' column to a plain list of labels and work out how
# many one-hot slots it needs.
#
# The slot count is (largest label + 1) rather than nunique(): the labels are
# used directly as indices into an identity matrix, so a bucket with no items
# would make nunique() smaller than the largest label and push the one-hot
# lookup out of bounds.
t_list = i_df['t'].values.tolist()
t_classes = int(i_df['t'].max()) + 1
print(len(t_list), t_classes)

1349 8


In [32]:
# One-hot encode the bucketed 't' labels, one row per item.
t_onehots = onehot(t_list, t_classes)
print(t_onehots.shape)

(1349, 8)


In [33]:
# Collect the 19 flag columns 'c0'..'c18' into a 2-D array, one row per item.
flag_columns = ['c' + str(k) for k in range(19)]
g_multihots = np.array(i_df[flag_columns].values.tolist())
g_multihots.shape

(1349, 19)

In [34]:
# Concatenate the 't' one-hot block and the flag block column-wise into one
# feature row per item.
item_blocks = (t_onehots, g_multihots)
item_array = np.hstack(item_blocks)
item_array.shape

(1349, 27)

In [35]:
# Persist the item feature matrix in NumPy's .npy format.
np.save('../data/ml-100k_item_attribute.npy', item_array)