In [1]:
import re
import pandas as pd
import numpy as np

# Load the interaction log. Only file columns 0, 1 and 3 are read (column 2
# is skipped) and they are named u (user), i (item) and t (timestamp).
df = pd.read_table(
    '../data/ml-1m/ratings.dat',
    sep='::',
    engine='python',  # multi-character separators need the python parser
    usecols=[0, 1, 3],
    names=['u', 'i', 't'],
)
df

Unnamed: 0,u,i,t
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [2]:
# Keep users with at least 5 interactions, then -- on the already
# user-filtered frame -- keep items with at least 5 interactions.
user_counts = df['u'].value_counts()
df = df[df['u'].isin(user_counts[user_counts >= 5].index)]
item_counts = df['i'].value_counts()
df = df[df['i'].isin(item_counts[item_counts >= 5].index)]
df

Unnamed: 0,u,i,t
0,1,1193,978300760
1,1,661,978302109
2,1,914,978301968
3,1,3408,978300275
4,1,2355,978824291
...,...,...,...
1000204,6040,1091,956716541
1000205,6040,1094,956704887
1000206,6040,562,956704746
1000207,6040,1096,956715648


In [3]:
# Order the interactions chronologically (ascending timestamp).
df = df.sort_values(by='t', ascending=True)
df

Unnamed: 0,u,i,t
1000138,6040,858,956703932
999873,6040,593,956703954
1000153,6040,2384,956703954
1000192,6040,2019,956703977
1000007,6040,1961,956703977
...,...,...,...
825793,4958,2399,1046454338
825438,4958,1407,1046454443
825724,4958,3264,1046454548
825731,4958,2634,1046454548


In [4]:
# Give users and items contiguous 0-based ids, numbered in order of first
# appearance in df.
u2id, i2id = {}, {}
for raw_user, raw_item in zip(df['u'], df['i']):
    # setdefault only inserts unseen keys, so len(...) is the next free id.
    u2id.setdefault(raw_user, len(u2id))
    i2id.setdefault(raw_item, len(i2id))
u_index, i_index = len(u2id), len(i2id)
print('# Users:', u_index)
print('# Items:', i_index)

# Users: 6040
# Items: 3416


In [5]:
# Swap the raw user / item ids for their contiguous ids.
df = df.assign(u=df['u'].map(u2id), i=df['i'].map(i2id))
df

Unnamed: 0,u,i,t
1000138,0,0,956703932
999873,0,1,956703954
1000153,0,2,956703954
1000192,0,3,956703977
1000007,0,4,956703977
...,...,...,...
825793,1081,1864,1046454338
825438,1081,1027,1046454443
825724,1081,1141,1046454548
825731,1081,2185,1046454548


In [6]:
# Discard the old row labels and renumber the rows 0..n-1 in their current order.
df = df.set_axis(pd.RangeIndex(len(df)), axis=0)
df

Unnamed: 0,u,i,t
0,0,0,956703932
1,0,1,956703954
2,0,2,956703954
3,0,3,956703977
4,0,4,956703977
...,...,...,...
999606,1081,1864,1046454338
999607,1081,1027,1046454443
999608,1081,1141,1046454548
999609,1081,2185,1046454548


In [7]:
# Write the (u, i, t) rows as a CSV with neither a header line nor the row index.
df.to_csv('../data/ml-1m.csv', index=False, header=None)

### The following cells process the user attributes

In [8]:
# Load the first four fields of the user table as u (user id), g (gender,
# F/M), a and o (presumably the age and occupation codes -- confirm against
# the dataset README).
u_df = pd.read_table(
    '../data/ml-1m/users.dat',
    sep='::',
    engine='python',  # multi-character separators need the python parser
    usecols=[0, 1, 2, 3],
    names=['u', 'g', 'a', 'o'],
)
u_df

Unnamed: 0,u,g,a,o
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20
...,...,...,...,...
6035,6036,F,25,15
6036,6037,F,45,1
6037,6038,F,56,1
6038,6039,F,45,0


In [9]:
# Keep only the users that have an entry in u2id.
known_user_mask = u_df['u'].isin(u2id.keys())
u_df = u_df[known_user_mask]
u_df

Unnamed: 0,u,g,a,o
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20
...,...,...,...,...
6035,6036,F,25,15
6036,6037,F,45,1
6037,6038,F,56,1
6038,6039,F,45,0


In [10]:
# Discard the old row labels and renumber the rows 0..n-1 in their current order.
u_df = u_df.set_axis(pd.RangeIndex(len(u_df)), axis=0)
u_df

Unnamed: 0,u,g,a,o
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20
...,...,...,...,...
6035,6036,F,25,15
6036,6037,F,45,1
6037,6038,F,56,1
6038,6039,F,45,0


In [11]:
# Translate the raw user ids into the contiguous ids stored in u2id.
u_df = u_df.assign(u=u_df['u'].map(u2id))
u_df

Unnamed: 0,u,g,a,o
0,6033,F,1,10
1,6032,M,56,16
2,6031,M,25,15
3,6030,M,45,7
4,6029,M,25,20
...,...,...,...,...
6035,4,F,25,15
6036,3,F,45,1
6037,2,F,56,1
6038,1,F,45,0


In [12]:
# Order the rows by the new user id so that row position lines up with the id.
u_df = u_df.sort_values(by='u', ascending=True)
u_df

Unnamed: 0,u,g,a,o
6039,0,M,25,6
6038,1,F,45,0
6037,2,F,56,1
6036,3,F,45,1
6035,4,F,25,15
...,...,...,...,...
158,6035,F,45,0
1264,6036,F,18,20
348,6037,M,1,10
2909,6038,M,1,19


In [13]:
# Discard the old row labels and renumber the rows 0..n-1 in their current order.
u_df = u_df.set_axis(pd.RangeIndex(len(u_df)), axis=0)
u_df

Unnamed: 0,u,g,a,o
0,0,M,25,6
1,1,F,45,0
2,2,F,56,1
3,3,F,45,1
4,4,F,25,15
...,...,...,...,...
6035,6035,F,45,0
6036,6036,F,18,20
6037,6037,M,1,10
6038,6038,M,1,19


In [14]:
# Collapse the 'a' codes into buckets of ten via integer division.
u_df['a'] = u_df['a'].floordiv(10)
u_df

Unnamed: 0,u,g,a,o
0,0,M,2,6
1,1,F,4,0
2,2,F,5,1
3,3,F,4,1
4,4,F,2,15
...,...,...,...,...
6035,6035,F,4,0
6036,6036,F,1,20
6037,6037,M,0,10
6038,6038,M,0,19


In [15]:
# Encode gender as an integer label: F -> 0, M -> 1.
gender_codes = {'F': 0, 'M': 1}
u_df['g'] = u_df['g'].map(gender_codes)
u_df

Unnamed: 0,u,g,a,o
0,0,1,2,6
1,1,0,4,0
2,2,0,5,1
3,3,0,4,1
4,4,0,2,15
...,...,...,...,...
6035,6035,0,4,0
6036,6036,0,1,20
6037,6037,1,0,10
6038,6038,1,0,19


In [16]:
# Pull each categorical user attribute out as a plain list of integer labels
# and work out how many one-hot slots it needs.
#
# The slot count is (largest label + 1), not nunique(): the one-hot encoder
# indexes by label value, so if the observed labels ever had a gap
# (e.g. {0, 1, 3}) nunique() would be too small and the encoding would raise
# IndexError. For gap-free labels both definitions give the same number.
a_list = u_df['a'].values.tolist()
a_classes = int(max(a_list)) + 1 if a_list else 0
g_list = u_df['g'].values.tolist()
g_classes = int(max(g_list)) + 1 if g_list else 0
o_list = u_df['o'].values.tolist()
o_classes = int(max(o_list)) + 1 if o_list else 0
print(len(a_list), len(g_list), len(o_list))
print(a_classes, g_classes, o_classes)

6040 6040 6040
6 2 21


In [17]:
def onehot(labels, num_classes):
    """Return one-hot rows for a sequence of integer class labels.

    Parameters
    ----------
    labels : sequence of int
        Class indices; each must lie in ``[0, num_classes)``.
    num_classes : int
        Width of every one-hot row.

    Returns
    -------
    numpy.ndarray
        Float array with one row per label, shape ``(len(labels), num_classes)``;
        row ``k`` is all zeros except for a 1.0 at column ``labels[k]``.
        Empty input yields an array of shape ``(0, num_classes)``.

    Raises
    ------
    IndexError
        If any label is outside ``[0, num_classes)``. Negative labels are
        rejected explicitly because plain indexing would silently wrap them
        around to the last classes.
    """
    indices = np.asarray(labels)
    if indices.size == 0:
        return np.zeros((0, num_classes))
    if indices.min() < 0 or indices.max() >= num_classes:
        raise IndexError(
            'labels must lie in [0, %d); got range [%s, %s]'
            % (num_classes, indices.min(), indices.max())
        )
    # Build the identity matrix once and pick one row per label, instead of
    # rebuilding a num_classes x num_classes matrix for every label.
    return np.eye(num_classes)[indices]

In [18]:
# One-hot encode every user attribute with its own class count.
a_onehots, g_onehots, o_onehots = (
    onehot(labels, n_classes)
    for labels, n_classes in (
        (a_list, a_classes),
        (g_list, g_classes),
        (o_list, o_classes),
    )
)
print(a_onehots.shape, g_onehots.shape, o_onehots.shape)

(6040, 6) (6040, 2) (6040, 21)


In [19]:
# Concatenate the three one-hot blocks side by side: one feature row per user.
user_attribute_blocks = [a_onehots, g_onehots, o_onehots]
user_array = np.hstack(user_attribute_blocks)
user_array.shape

(6040, 29)

In [20]:
# Persist the user feature matrix in NumPy's .npy format.
np.save('../data/ml-1m_user_attribute.npy', arr=user_array)

### The following cells process the item attributes

In [21]:
# Load the movie table as i (movie id), t (title, which ends with the year in
# parentheses) and g ('|'-separated genre names).
i_df = pd.read_table(
    '../data/ml-1m/movies.dat',
    sep='::',
    engine='python',  # multi-character separators need the python parser
    encoding='ISO-8859-1',
    names=['i', 't', 'g'],
)
i_df

Unnamed: 0,i,t,g
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [22]:
# Keep only the movies that have an entry in i2id.
known_item_mask = i_df['i'].isin(i2id.keys())
i_df = i_df[known_item_mask]
i_df

Unnamed: 0,i,t,g
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [23]:
# Discard the old row labels and renumber the rows 0..n-1 in their current order.
i_df = i_df.set_axis(pd.RangeIndex(len(i_df)), axis=0)
i_df

Unnamed: 0,i,t,g
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3411,3948,Meet the Parents (2000),Comedy
3412,3949,Requiem for a Dream (2000),Drama
3413,3950,Tigerland (2000),Drama
3414,3951,Two Family House (2000),Drama


In [24]:
# Translate the raw movie ids into the contiguous ids stored in i2id.
i_df = i_df.assign(i=i_df['i'].map(i2id))
i_df

Unnamed: 0,i,t,g
0,654,Toy Story (1995),Animation|Children's|Comedy
1,1052,Jumanji (1995),Adventure|Children's|Fantasy
2,728,Grumpier Old Men (1995),Comedy|Romance
3,567,Waiting to Exhale (1995),Comedy|Drama
4,697,Father of the Bride Part II (1995),Comedy
...,...,...,...
3411,3372,Meet the Parents (2000),Comedy
3412,3391,Requiem for a Dream (2000),Drama
3413,3394,Tigerland (2000),Drama
3414,3399,Two Family House (2000),Drama


In [25]:
# Order the rows by the new item id so that row position lines up with the id.
i_df = i_df.sort_values(by='i', ascending=True)
i_df

Unnamed: 0,i,t,g
720,0,"Godfather, The (1972)",Action|Crime|Drama
549,1,"Silence of the Lambs, The (1991)",Drama|Thriller
2020,2,Babe: Pig in the City (1998),Children's|Comedy
1688,3,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama
1630,4,Rain Man (1988),Drama
...,...,...,...
1599,3411,Cavalcade (1933),Drama
2796,3412,I Am Cuba (Soy Cuba/Ya Kuba) (1964),Drama
3369,3413,"Specials, The (2000)",Comedy
3333,3414,About Adam (2000),Comedy


In [26]:
# Discard the old row labels and renumber the rows 0..n-1 in their current order.
i_df = i_df.set_axis(pd.RangeIndex(len(i_df)), axis=0)
i_df

Unnamed: 0,i,t,g
0,0,"Godfather, The (1972)",Action|Crime|Drama
1,1,"Silence of the Lambs, The (1991)",Drama|Thriller
2,2,Babe: Pig in the City (1998),Children's|Comedy
3,3,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama
4,4,Rain Man (1988),Drama
...,...,...,...
3411,3411,Cavalcade (1933),Drama
3412,3412,I Am Cuba (Soy Cuba/Ya Kuba) (1964),Drama
3413,3413,"Specials, The (2000)",Comedy
3414,3414,About Adam (2000),Comedy


In [27]:
# Replace every title with its release year.
# The year is read as the last run of non-parenthesis characters in the
# title, i.e. the content of the trailing "(YYYY)" group in titles such as
# 'Toy Story (1995)'.
# Mapping over the whole column at once (rather than writing row by row
# through .loc[i, 't']) does not depend on the index being exactly 0..n-1 and
# leaves a proper integer column behind instead of ints stored in a string
# column.
i_df['t'] = i_df['t'].map(lambda title: int(re.findall(r'[^()]+', title)[-1]))
i_df

Unnamed: 0,i,t,g
0,0,1972,Action|Crime|Drama
1,1,1991,Drama|Thriller
2,2,1998,Children's|Comedy
3,3,1954,Action|Drama
4,4,1988,Drama
...,...,...,...
3411,3411,1933,Drama
3412,3412,1964,Drama
3413,3413,2000,Comedy
3414,3414,2000,Comedy


In [28]:
# Turn the release year into a bucket index: years elapsed since the earliest
# release in the table, grouped in spans of ten.
release_years = i_df['t'].astype(int)
i_df['t'] = (release_years - release_years.min()) // 10
i_df

Unnamed: 0,i,t,g
0,0,5,Action|Crime|Drama
1,1,7,Drama|Thriller
2,2,7,Children's|Comedy
3,3,3,Action|Drama
4,4,6,Drama
...,...,...,...
3411,3411,1,Drama
3412,3412,4,Drama
3413,3413,8,Comedy
3414,3414,8,Comedy


In [29]:
# Year-bucket label of every item, plus the number of one-hot slots required.
# The slot count is (largest bucket + 1), not nunique(): if some intermediate
# bucket has no item at all, nunique() would be smaller than the largest label
# and the one-hot encoding would raise IndexError. With no empty bucket both
# definitions agree.
t_list = i_df['t'].values.tolist()
t_classes = int(max(t_list)) + 1 if t_list else 0
print(len(t_list), t_classes)

3416 9


In [30]:
# One-hot encode the year bucket of every item.
t_onehots = onehot(labels=t_list, num_classes=t_classes)
print(t_onehots.shape)

(3416, 9)


In [31]:
# Genre name -> column index of the multi-hot genre vector; the index is the
# position of the name in this list.
g2d = {
    genre: column
    for column, genre in enumerate([
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ])
}

In [32]:
# Build the multi-hot genre matrix: one row per item (in i_df row order), one
# column per genre in g2d, with a 1 wherever the item lists that genre.
#
# Rows are walked positionally rather than through .loc[i, 'g'], so the result
# does not depend on the index being exactly 0..n-1. The loop variables are
# deliberately not called `g_list` or `_`: `g_list` already holds the user
# gender labels, and `_` is IPython's "last output" variable.
g_multihots = np.zeros((len(i_df), len(g2d)), dtype=int)
for item_row, genre_field in enumerate(i_df['g']):
    for genre_name in genre_field.split('|'):
        # An unknown genre name raises KeyError here rather than being dropped.
        g_multihots[item_row, g2d[genre_name]] = 1
print(g_multihots.shape)

(3416, 18)


In [33]:
# Concatenate the year-bucket one-hots and the genre multi-hots side by side:
# one feature row per item.
item_attribute_blocks = [t_onehots, g_multihots]
item_array = np.hstack(item_attribute_blocks)
item_array.shape

(3416, 27)

In [34]:
# Persist the item feature matrix in NumPy's .npy format.
np.save('../data/ml-1m_item_attribute.npy', arr=item_array)