In [1]:
from psutil import virtual_memory
from tensorflow.python.client import device_lib
dev_sec = device_lib.list_local_devices()
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  print('You are using a high-RAM runtime!')
  print(dev_sec[-1].physical_device_desc)

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!
device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [2]:
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
!unzip ml-100k.zip
!ls

--2020-11-04 01:26:05--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2020-11-04 01:26:06 (12.3 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
import math
import random
from collections import deque

In [4]:
rating_data = pd.read_csv("ml-100k/ua.base", sep='\t',names="userId,movieId,rating,timestamp".split(",")) 
# need to sort the timestamp ascending?
rating_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [5]:
user_list = rating_data['userId']
user_list = user_list.unique()
movie_rated_by_user = rating_data.groupby('movieId')['userId'].count().reset_index(name='userIdCount')
movie_rated_by_user = movie_rated_by_user.set_index('movieId')
movie_rated_by_user

Unnamed: 0_level_0,userIdCount
movieId,Unnamed: 1_level_1
1,392
2,121
3,85
4,198
5,79
...,...
1678,1
1679,1
1680,1
1681,1


In [6]:
item_column_name = "movieId,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western"
item = pd.read_csv("ml-100k/u.item",sep='|',names=item_column_name.split(","),encoding='latin-1')
col_to_removed = ['movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
clear_item = item.drop(col_to_removed, axis=1).set_index('movieId')

# the number of the kinds of topics in the whole recommendation system.
N = len(clear_item.columns)

# sum up the genre for each movieId
genre_num = clear_item.sum(axis = 1)

# group table by index and creates a dict with lists of clear_item as values
df_dict = dict(list(clear_item.groupby(clear_item.index)))

# Gather all the genres that related to all movies respectively
movieid = []
genre_list = []

for u, v in df_dict.items():
    check = v.columns[(v == 1).any()]
    if len(check) > 0:
      movieid.append(u)
      genre_list.append(check.to_list())

d = {'movieId' : movieid, 'Genres' : genre_list}

# compile in DataFrame
genre_list_by_movieid = pd.DataFrame(d)
genre_list_by_movieid.set_index('movieId', inplace=True)
genre_list_by_movieid

Unnamed: 0_level_0,Genres
movieId,Unnamed: 1_level_1
1,"[Animation, Children, Comedy]"
2,"[Action, Adventure, Thriller]"
3,[Thriller]
4,"[Action, Comedy, Drama]"
5,"[Crime, Drama, Thriller]"
...,...
1678,[Drama]
1679,"[Romance, Thriller]"
1680,"[Drama, Romance]"
1681,[Comedy]


In [7]:
clear_item

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
rating_genre = rating_data.join(clear_item, on='movieId')
rating_genre

Unnamed: 0,userId,movieId,rating,timestamp,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,5,874965758,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2,3,876893171,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,3,4,878542960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,4,3,876893119,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,1,5,3,889751712,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,943,1047,2,875502146,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
90566,943,1074,4,888640250,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
90567,943,1188,3,888640250,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
90568,943,1228,3,888640275,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
user_data = pd.read_csv("ml-100k/u.user", sep='|',names="userId,age,gender,occupation,zipcode".split(",")).set_index('userId') 
user_data

Unnamed: 0_level_0,age,gender,occupation,zipcode
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
...,...,...,...,...
939,26,F,student,33319
940,32,M,administrator,02215
941,20,M,student,97229
942,48,F,librarian,78209


In [10]:
raw = rating_genre.join(user_data, on='userId')
raw.head()

Unnamed: 0,userId,movieId,rating,timestamp,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,zipcode
0,1,1,5,874965758,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24,M,technician,85711
1,1,2,3,876893171,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,M,technician,85711
2,1,3,4,878542960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,M,technician,85711
3,1,4,3,876893119,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,24,M,technician,85711
4,1,5,3,889751712,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,24,M,technician,85711


In [13]:
# All categorical columns
object_cols = [col for col in raw.columns if raw[col].dtype == "object"]
object_cols

['gender', 'occupation', 'zipcode']

In [14]:
pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 26.3MB/s eta 0:00:01[K     |████████▏                       | 20kB 3.1MB/s eta 0:00:01[K     |████████████▏                   | 30kB 4.1MB/s eta 0:00:01[K     |████████████████▎               | 40kB 4.3MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 3.6MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 4.0MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 4.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [15]:
import category_encoders as ce
encoder = LabelEncoder()

  import pandas.util.testing as tm


In [16]:
encoder = ce.BackwardDifferenceEncoder(cols=['gender', 'occupation'])
encoded_raw = encoder.fit_transform(raw)
encoded_raw

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,intercept,userId,movieId,rating,timestamp,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode
0,1,1,1,5,874965758,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
1,1,1,2,3,876893171,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
2,1,1,3,4,878542960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
3,1,1,4,3,876893119,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
4,1,1,5,3,889751712,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,1,943,1047,2,875502146,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90566,1,943,1074,4,888640250,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90567,1,943,1188,3,888640250,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841
90568,1,943,1228,3,888640275,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841


In [17]:
scaler = MinMaxScaler() 
encoded_raw['scaled_age'] = scaler.fit_transform(encoded_raw[['age']])
encoded_raw

Unnamed: 0,intercept,userId,movieId,rating,timestamp,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,zipcode,scaled_age
0,1,1,1,5,874965758,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
1,1,1,2,3,876893171,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
2,1,1,3,4,878542960,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
3,1,1,4,3,876893119,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
4,1,1,5,3,889751712,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,24,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,85711,0.257576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,1,943,1047,2,875502146,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90566,1,943,1074,4,888640250,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90567,1,943,1188,3,888640250,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273
90568,1,943,1228,3,888640275,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,77841,0.227273


In [18]:
drop_col = ['userId','intercept','movieId','timestamp','age','zipcode']
train_data = encoded_raw.drop(drop_col, axis=1)
train_data

Unnamed: 0,rating,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,gender_0,occupation_0,occupation_1,occupation_2,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,scaled_age
0,5,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
1,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
3,3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
4,3,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,-0.5,-0.952381,-0.904762,-0.857143,-0.809524,-0.761905,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.257576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90565,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.227273
90566,4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.227273
90567,3,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.227273
90568,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.5,0.047619,0.095238,0.142857,0.190476,0.238095,-0.714286,-0.666667,-0.619048,-0.571429,-0.52381,-0.47619,-0.428571,-0.380952,-0.333333,-0.285714,-0.238095,-0.190476,-0.142857,-0.095238,-0.047619,0.227273


# **Extra**

In [None]:
def truncat_user(user_id):
    temp = raw[raw['userId'] == user_id]
    n = len(temp)
    print('Extract done...\n',n)
    return temp

In [None]:
# Further group numerical predictors such as "age" or "hours-per-week" to gain more insight!
# Group the "age" column
age_group = []
for age in income_df["age"]:
    if age < 25:
        age_group.append("<25")
    elif 25 <= age <= 34:
        age_group.append("25-34")
    elif 34 < age <= 44:
        age_group.append("35-44")
    elif 44 < age <= 54:
        age_group.append("45-54")
    elif 54 < age <= 65:
        age_group.append("55-64")
    else:
        age_group.append("65 and over")
new_income_df = income_df.copy()
new_income_df["age_group"] = age_group
del new_income_df["age"]

In [None]:
def convert(data, num_users, num_movies):
    ''' Making a User-Movie-Matrix'''
    
    new_data=[]
    
    for id_user in range(1, num_users+1):
        
        id_movie=data[:,1][data[:,0]==id_user]
        id_rating=data[:,2][data[:,0]==id_user]
        ratings=np.zeros(num_movies, dtype=np.uint32)
        ratings[id_movie-1]=id_rating
        if sum(ratings)==0:
            continue
        new_data.append(ratings)

        del id_movie
        del id_rating
        del ratings
        
    return new_data

In [None]:
def get_dataset():
    ''' For each train.dat and test.dat making a User-Movie-Matrix'''
    
    training_set=pd.read_csv("ml-100k/ua.base", sep='\t', header=None, engine='python', encoding='latin-1')
    training_set=np.array(training_set, dtype=np.uint32)
    
    test_set=pd.read_csv("ml-100k/ua.test", sep='\t', header=None, engine='python', encoding='latin-1')
    test_set=np.array(test_set, dtype=np.uint32)
    
      
    num_users=int(max(max(training_set[:,0]), max(test_set[:,0])))
    num_movies=int(max(max(training_set[:,1]), max(test_set[:,1])))

    training_set=convert(training_set,num_users, num_movies)
    test_set=convert(test_set,num_users, num_movies)
    
    return training_set, test_set

In [None]:
nani= get_dataset()
nani

In [None]:
training_set=pd.read_csv("ml-100k/ua.base", sep='\t', header=None, engine='python', encoding='latin-1')
training_set=np.array(training_set, dtype=np.uint32)
training_set

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]], dtype=uint32)

In [None]:
test_set=pd.read_csv("ml-100k/ua.test", sep='\t', header=None, engine='python', encoding='latin-1')
test_set=np.array(test_set, dtype=np.uint32)

In [None]:
test_set

array([[        1,        20,         4, 887431883],
       [        1,        33,         4, 878542699],
       [        1,        61,         4, 878542420],
       ...,
       [      943,       570,         1, 888640125],
       [      943,       808,         4, 888639868],
       [      943,      1067,         2, 875501756]], dtype=uint32)

In [None]:
num_users=int(max(max(training_set[:,0]), max(test_set[:,0])))
num_movies=int(max(max(training_set[:,1]), max(test_set[:,1])))

In [None]:
training_set=convert(training_set,num_users, num_movies)
test_set=convert(test_set,num_users, num_movies)

In [None]:
training_set

[array([5, 3, 4, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([4, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([3, 3, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([1, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([3, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([5, 0, 0, ..., 0, 0, 0], dtype=uint32),
 array([0, 2,