In [2]:
%%sh
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
unzip ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


--2021-04-17 12:02:58--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’

     0K .......... .......... .......... .......... ..........  1%  637K 7s
    50K .......... .......... .......... .......... ..........  2% 1.14M 6s
   100K .......... .......... .......... .......... ..........  3% 72.6M 4s
   150K .......... .......... .......... .......... ..........  4%  271M 3s
   200K .......... .......... .......... .......... ..........  5% 1.15M 3s
   250K .......... .......... .......... .......... ..........  6% 66.4M 2s
   300K .......... .......... .......... .......... ..........  7%  175M 2s
   350K .......... .......... .......... .......... ..........  8% 1.16M 2s
   400K .......... .......... ......

In [1]:
%cd ml-100k
!shuf ua.base -o ua.base.shuffled
!head -5 ua.base.shuffled

/root/Chap-No-4(Factorization machine Algorithm)/ml-100k
683	312	3	893284183
933	166	3	874854062
222	8	1	878182307
621	559	5	874964915
13	519	5	882140355


In [3]:
num_users = 943
num_movies = 1682
num_features = num_users+num_movies
num_ratings_train = 90570
num_ratings_test = 9430

In [4]:
import csv
import numpy as np
from scipy.sparse import lil_matrix

In [6]:
def loadDataset(filename, lines, columns):
    X = lil_matrix((lines, columns)).astype('float32')
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(num_users)+int(movieId)-1] = 1
            Y.append(int(rating))
            line=line+1
    Y=np.array(Y).astype('float32')
    return X,Y

In [7]:
X_train, Y_train = loadDataset('ua.base.shuffled',num_ratings_train,num_features)
X_test, Y_test = loadDataset('ua.test',num_ratings_test,num_features)

In [8]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


In [9]:
import io, boto3
import sagemaker.amazon.common as smac

In [11]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)

    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)

In [13]:
import sagemaker
bucket = sagemaker.Session().default_bucket()
prefix = 'fm-movielens'
train_key = 'train.protobuf'
train_prefix = '{}/{}'.format(prefix, 'train')
test_key = 'test.protobuf'
test_prefix = '{}/{}'.format(prefix, 'test')
output_prefix = 's3://{}/{}/output'.format(bucket,prefix)

In [15]:
train_data = writeDatasetToProtobuf(X_train, Y_train,bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(X_test, Y_test,bucket, test_prefix, test_key) 

In [22]:
from sagemaker import image_uris
region=boto3.Session().region_name
container=image_uris.retrieve('factorization-machines',region)

In [23]:
fm=sagemaker.estimator.Estimator(
 container,
 role=sagemaker.get_execution_role(),
 instance_count=1,
 instance_type='ml.m5.xlarge',
 output_path=output_prefix)

In [24]:
fm.set_hyperparameters(
 feature_dim=num_features,
 predictor_type='regressor',
 num_factors=64,
 epochs=10)

In [25]:
fm.fit({'train': train_data, 'test': test_data})

2021-04-17 13:03:35 Starting - Starting the training job...
2021-04-17 13:03:59 Starting - Launching requested ML instancesProfilerReport-1618664615: InProgress
......
2021-04-17 13:04:59 Starting - Preparing the instances for training...
2021-04-17 13:05:36 Downloading - Downloading input data...
2021-04-17 13:05:59 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[04/17/2021 13:06:20 INFO 140634086573888] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linea

In [None]:
endpoint_name = 'fm-movielens-100k'
fm_predictor = fm.deploy(
 endpoint_name=endpoint_name,
 instance_type='ml.t2.medium',
 initial_instance_count=1)

----------------