Rename the userId and movieId columns to USER_ID and ITEM_ID respectively.
Add an EVENT_TYPE column set the value for every record to watch. If you're using Microsoft Excel, you can set the EVENT_TYPE for every record by entering watch in the first cell in the column and then double-clicking the bottom-right corner of the cell. Your header should be the following:

In [1]:
# Imports
import numpy as np
import pandas as pd
import datetime

### rating data (aws personalize interaction data)

In [2]:
interactions_data = pd.read_csv('./ml-latest-small/ratings.csv')
pd.set_option('display.max_rows', 5)
interactions_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
...,...,...,...,...
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [3]:
# remove the items with rating less than 3
interactions_data = interactions_data[interactions_data['rating'] > 3]               
# remove the Rating column 
interactions_data = interactions_data[['userId', 'movieId', 'timestamp']]
# Rename the userId and movieId columns to USER_ID and ITEM_ID 
interactions_data.rename(columns = {'userId':'USER_ID', 'movieId':'ITEM_ID', 'timestamp':'TIMESTAMP'}, inplace = True)
# `Add an EVENT_TYPE column set the value for every record to watch`
interactions_data['EVENT_TYPE']='watch'
interactions_data.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE
0,1,1,964982703,watch
1,1,3,964981247,watch
2,1,6,964982224,watch
3,1,47,964983815,watch
4,1,50,964982931,watch


In [32]:
# save modified rating data
interactions_data.to_csv('./data_prepared/interactions.csv', index=False)

### movie data (aws personalize item data)
data about each movie

In [5]:
items_data = pd.read_csv('./ml-latest-small/movies.csv')
items_data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# seperate year data out of title
items_data['year'] = items_data['title'].str.extract('.*\((.*)\).*',expand = False)
items_data.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [17]:
# Because the actual creation timestamp for each movie is unknown, the following adds a modern date as the creation timestamp.
ts= datetime.datetime(2023, 11, 5, 0, 0).strftime('%s')
items_data["CREATION_TIMESTAMP"] = ts
items_data.head()

Unnamed: 0,movieId,title,genres,year,CREATION_TIMESTAMP
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,1699110000
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,1699110000
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,1699110000
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,1699110000
4,5,Father of the Bride Part II (1995),Comedy,1995,1699110000


In [18]:
# removing the title
items_data.drop(columns="title", inplace = True)

# renaming the columns to match schema
items_data.rename(columns = { 'movieId':'ITEM_ID', 'genres':'GENRES','year':'YEAR'}, inplace = True)
items_data.head()

Unnamed: 0,ITEM_ID,GENRES,YEAR,CREATION_TIMESTAMP
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995,1699110000
1,2,Adventure|Children|Fantasy,1995,1699110000
2,3,Comedy|Romance,1995,1699110000
3,4,Comedy|Drama|Romance,1995,1699110000
4,5,Comedy,1995,1699110000


In [33]:
# save modified item data
items_data.to_csv('./data_prepared/items.csv', index=False)

### fake user data (aws personalize User Metadata)
data about each user

In [20]:
# get user ids from the interaction dataset
user_ids = interactions_data['USER_ID'].unique()
user_data = pd.DataFrame()
user_data["USER_ID"]=user_ids
user_data.head()

Unnamed: 0,USER_ID
0,1
1,2
2,3
3,4
4,5


In [21]:
possible_genders = ['female', 'male']
random = np.random.choice(possible_genders, len(user_data.index), p=[0.5, 0.5])
user_data["GENDER"] = random
user_data.head()

Unnamed: 0,USER_ID,GENDER
0,1,male
1,2,male
2,3,female
3,4,male
4,5,female


In [30]:
# random age between 5 and 95
age = np.random.randint(5, 95, len(user_data.index))
user_data["AGE"] = age
user_data.head()

Unnamed: 0,USER_ID,GENDER,AGE
0,1,male,23
1,2,male,34
2,3,female,51
3,4,male,47
4,5,female,48


In [34]:
# save user  data
user_data.to_csv('./data_prepared/users.csv', index=False)