In [1]:
import numpy as np
import tensorflow as tf

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import json
import pickle
import pandas as pd
from sklearn.externals import joblib

%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Get data

In [6]:
# Scratch directory for the matrices dumped at the end of the notebook.
# -p makes the cell re-runnable: plain mkdir errors out when ./tmp already exists.
! mkdir -p ./tmp

In [7]:
# Fetch the MovieLens 100k archive over HTTPS rather than plain HTTP.
# -O writes to a fixed file name, overwriting any previous download.
! wget https://files.grouplens.org/datasets/movielens/ml-100k.zip -O ./ml-100k.zip

In [8]:
# Extract the downloaded archive; -o overwrites existing files without prompting,
# so the cell can be re-run.
! unzip -o ./ml-100k.zip

In [9]:
# Print the README shipped with the dataset (summary and usage licence).
! cat ./ml-100k/README

SUMMARY & USAGE LICENSE

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
 
This data set consists of:
	* 100,000 ratings (1-5) from 943 users on 1682 movies. 
	* Each user has rated at least 20 movies. 
        * Simple demographic info for the users (age, gender, occupation, zip)

The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th, 
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any resear

In [2]:
# Remove broken symbols: write a copy of u.item to u.item2 with every byte
# sequence that is not valid UTF-8 dropped (iconv -c omits characters it
# cannot convert). The item cells below read u.item2.
# NOTE(review): u.item is presumably not UTF-8 encoded, so -c discards those
# characters instead of converting them - confirm that this loss is acceptable.
! iconv -f utf-8 -t utf-8 -c ml-100k/u.item >  ml-100k/u.item2

# user part

In [3]:
# Peek at the first user records: pipe-separated fields, no header row.
! head -3 ./ml-100k/u.user

1|24|M|technician|85711
2|53|F|other|94043
3|23|M|writer|32067


In [4]:
# Column names for u.user, in file order (the file has no header row).
user_columns = ['user id', 'age', 'gender', 'occupation', 'zip code']
df_user = pd.read_csv('./ml-100k/u.user', sep='|', names=user_columns)

# Keep only the first character of the zip code as a coarse location feature,
# then discard the full zip code.
df_user['living_area'] = df_user['zip code'].map(lambda zip_code: zip_code[0])
df_user = df_user.drop('zip code', axis=1)
df_user.head()

Unnamed: 0,user id,age,gender,occupation,living_area
0,1,24,M,technician,8
1,2,53,F,other,9
2,3,23,M,writer,3
3,4,24,M,technician,4
4,5,33,F,other,1


In [5]:
# Bucket ages: round(..., -1) snaps each age to a multiple of ten
# (nearest multiple, not rounded down).
df_user['age'] = [int(round(int(str(age)), -1)) for age in df_user['age'].values]

In [6]:
# Number of distinct values per user feature, one feature per three-line group.
for column in ['age', 'gender', 'occupation', 'living_area']:
    print(column, df_user[column].nunique(), '----', sep='\n')

age
7
----
gender
2
----
occupation
21
----
living_area
19
----


In [7]:
features_list = ['age', 'gender', 'occupation', 'living_area']
# A single encoder is refit on every column; it is reused for the item features later on.
le = LabelEncoder()

s_users = []          # number of distinct categories per user feature
encoded_columns = []  # one integer-coded array per user feature
for feature in features_list:
    encoded_columns.append(le.fit_transform(df_user[feature].values))
    # classes_ reflects only the column that was just fitted
    s_users.append(len(le.classes_))

# Stack the per-feature arrays and transpose to one row per user.
users_mat = np.array(encoded_columns).T
print(users_mat.shape)

(943, 4)


In [8]:
# Map each user id to its row of encoded features (rows of users_mat are in
# the same order as df_user).
# Built with zip() instead of a loop variable named `id`, which would shadow
# the builtin id() for every later cell of the notebook.
users = dict(zip(df_user['user id'].values, users_mat))

# item part

In [9]:
# Peek at the first cleaned item records: pipe-separated, no header row,
# ending in a run of 0/1 flag columns.
! head -3 ./ml-100k/u.item2

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0
3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0


In [10]:
# u.item2 has no header row: five descriptive columns followed by 19 flag columns g0..g18.
item_columns = ['id', 'title', 'release_date', 'video_release_date', 'url']
genre_columns = ['g{}'.format(n) for n in range(19)]
df_item = pd.read_csv('./ml-100k/u.item2', sep='|', names=item_columns + genre_columns)

# The year is the text after the last '-' of release_date; a missing date
# turns into the string 'nan'.
df_item['year'] = df_item['release_date'].map(lambda release: str(release).rsplit('-', 1)[-1])

In [11]:
# Missing release years (the string 'nan') get the placeholder year 1600, so
# they end up in a bucket of their own.
years = ['1600' if year == 'nan' else year for year in map(str, df_item['year'].values)]
# round(..., -1) snaps each year to a multiple of ten (nearest multiple, not rounded down).
df_item['decade'] = [int(round(int(year), -1)) for year in years]

In [12]:
# Number of distinct decade buckets.
for column in ['decade']:
    print(column, df_item[column].nunique(), '----', sep='\n')

decade
10
----


In [19]:
features_list = ['decade'] + ['g{}'.format(n) for n in range(19)]

s_item = []           # number of distinct categories per item feature
encoded_columns = []  # one integer-coded array per item feature
for feature in features_list:
    # `le` is the LabelEncoder created for the user features; fit_transform refits it here
    encoded_columns.append(le.fit_transform(df_item[feature].values))
    s_item.append(len(le.classes_))

# Stack the per-feature arrays and transpose to one row per item.
items_mat = np.array(encoded_columns).T
print(items_mat.shape)

(1682, 20)


In [20]:
# Map each item id to its row of encoded features (rows of items_mat are in
# the same order as df_item).
# Built with zip() instead of a loop variable named `id`, which would shadow
# the builtin id() for every later cell of the notebook.
items = dict(zip(df_item['id'].values, items_mat))

# ratings part

In [21]:
# Peek at the first rating records: tab-separated, no header row.
! head -3 ./ml-100k/u.data

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116


In [23]:
# Column names for u.data, in file order (tab-separated, no header row).
rating_columns = ['user id', 'item id', 'rating', 'timestamp']
df_data = pd.read_csv('./ml-100k/u.data', sep='\t', names=rating_columns)

In [24]:
# Binary target: True only for ratings above 4.5, i.e. the top rating on the 1-5 scale.
df_data['target'] = df_data['rating'] > 4.5
# (user id, item id) pairs as a plain 2-D array. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0.
data = df_data[['user id', 'item id']].values
target = df_data['target'].values
# Share of positive samples (mean of a boolean array).
print('Mean target: {}'.format(np.mean(target)))

Mean target: 0.21201


In [25]:
# Row indices of the positive samples and of all remaining (negative) samples.
is_positive = (target == True)
positive_idx = np.where(is_positive)[0]
negative_idx = np.where(~is_positive)[0]

In [28]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Half of the positive samples go to training, the other half to test.
pos_idx_tr, pos_idx_te = train_test_split(positive_idx, random_state=42, test_size=0.5)
# Training negatives are sampled to the same count as the training positives,
# so the training set is balanced; the rest are returned as the test part.
neg_idx_tr, neg_idx_te = train_test_split(negative_idx, random_state=42, train_size=len(pos_idx_tr))

In [30]:
def build_matrix(pos_idx, neg_idx, random_state=None):
    """Assemble a shuffled categorical feature matrix and its binary labels.

    Reads the notebook-level objects ``data`` (rows of ``[user id, item id]``),
    ``users`` and ``items`` (id -> array of encoded features).

    Parameters
    ----------
    pos_idx, neg_idx : sequence of int
        Row indices into ``data`` of the positive and the negative samples.
    random_state : int or None, optional
        Seed for the row shuffle. ``None`` (the default) draws the permutation
        from NumPy's global random state, exactly as before.

    Returns
    -------
    X : ndarray, one row per sample
        Columns in order: user features, user id, item id, item features.
        Encoded features are shifted by +1 so that every value is 1-based.
    Y : ndarray of float, one entry per sample
        1 for positive samples, 0 for negative samples.
    """
    sample_idx = list(pos_idx) + list(neg_idx)

    rows_user = []
    rows_item = []
    rows_pair = []
    for idx in sample_idx:
        u, i = data[idx]
        # encoded features are 0-based; shift them so values are 1-based
        rows_user.append(users[u] + 1)
        rows_item.append(items[i] + 1)
        # u and i are already 1-based ids
        rows_pair.append(data[idx])

    # np.hstack needs a real sequence of arrays: a lazy map() object is
    # rejected by current NumPy on Python 3.
    X = np.hstack([np.array(rows) for rows in (rows_user, rows_pair, rows_item)])

    # positives come first in sample_idx, so they occupy the leading labels
    Y = np.zeros(len(sample_idx))
    Y[:len(pos_idx)] = 1

    # shuffle features and labels with the same permutation
    if random_state is None:
        perm = np.random.permutation(X.shape[0])
    else:
        perm = np.random.RandomState(random_state).permutation(X.shape[0])
    return X[perm], Y[perm]

In [31]:
# Number of distinct user and item ids in the dataset (943 users, 1682 movies per the README).
n_users = 943
n_items = 1682

# build_matrix shuffles its rows with NumPy's global random state; seed it so
# that re-running the notebook produces identical matrices.
np.random.seed(42)

X_tr, Y_tr = build_matrix(pos_idx_tr, neg_idx_tr)
X_te, Y_te = build_matrix(pos_idx_te, neg_idx_te)

# sizes of categorical features, in the column order build_matrix produces:
# user features, user id, item id, item features
s_features = s_users + [n_users, n_items] + s_item
assert X_tr.shape[1] == len(s_features), 'one size is expected per feature column'

In [35]:
print('X_tr shape: ', X_tr.shape)
print('X_te shape: ', X_te.shape)
print('Num of features: ', len(s_features))
# Multiply as Python integers (object dtype). A product of fixed-width NumPy
# integers wraps around silently on overflow, and this product is on the
# order of 1e18, which leaves little headroom in a 64-bit integer.
print('Size of feature space: ', np.prod(np.array(s_features, dtype=object)))
print('Sizes of features: ', s_features)

X_tr shape:  (21200, 26)
X_te shape:  (78800, 26)
Num of features:  26
Size of feature space:  2914558821173035008
Sizes of features:  [61, 2, 21, 19, 943, 1682, 72, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [36]:
# dump to disk
# Each file stores a (feature matrix, labels, feature sizes) tuple.
# NOTE(review): 'train_categotical.jl' looks like a misspelling of
# 'train_categorical.jl' (compare the test file name). It is left unchanged
# because the code that loads this file is not visible here.
joblib.dump((X_tr, Y_tr, s_features), './tmp/train_categotical.jl')
joblib.dump((X_te, Y_te, s_features), './tmp/test_categorical.jl')

['./tmp/test_categorical.jl']