In [2]:
# type: ignore
import pandas as pd 
from spotlight.datasets.movielens import get_movielens_dataset

In [3]:
# Fetch the 100K variant of the MovieLens dataset through Spotlight's loader
dataset = get_movielens_dataset(variant='100K')

# Gather the interaction arrays into a single ratings table, one column per array
ratings = pd.DataFrame(
    dict(
        user_id=dataset.user_ids,
        movie_id=dataset.item_ids,
        rating=dataset.ratings,
        timestamp=dataset.timestamps,
    )
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [4]:
# Count the missing values in each column of the ratings table
# (the recorded output shows 0 for every column)
ratings.isna().sum()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [5]:
# To run the cells below, first download ml-100k.zip (the MovieLens 100K dataset) from
# https://grouplens.org/datasets/movielens/ and extract it so that the `ml-100k/` folder is in the working directory.

In [6]:
# Load the user information from the pipe-separated `ml-100k/u.user` file.
# The extracted `ml-100k/` folder must be in the working directory; otherwise
# pandas raises FileNotFoundError.
users_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,  # no header row is read, so the names come from `users_columns`
    names=users_columns,
    encoding='ISO-8859-1',
    # Zip codes are identifiers, not quantities: read them as text so type
    # inference can never turn them into integers and drop leading zeros.
    dtype={'zip_code': str},
)
users.head()

FileNotFoundError: [Errno 2] No such file or directory: 'ml-100k/u.user'

In [None]:
# Read the movie catalogue from `ml-100k/u.item`: five descriptive fields
# followed by 19 genre flag columns named genre_0 ... genre_18
column_names = [
    'movie_id',
    'movie_title',
    'release_date',
    'video_release_date',
    'IMDb_URL',
    *(f'genre_{i}' for i in range(19)),
]
movies_metadata = pd.read_csv(
    'ml-100k/u.item',
    names=column_names,
    header=None,
    sep='|',
    encoding='ISO-8859-1',
)
movies_metadata.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Read the genre lookup table: each row of `ml-100k/u.genre` pairs a genre name with its id
genres = pd.read_csv(
    'ml-100k/u.genre',
    names=['genre', 'genre_id'],
    header=None,
    sep='|',
)
genres.head()

Unnamed: 0,genre,genre_id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [None]:
# Build `full_data`: one row per rating, enriched with movie and user details.

# Collapse the 19 genre flag columns into a single list-valued `genres` column.
# NOTE: this adds the column to `movies_metadata` itself; it is kept that way
# in case other cells read `movies_metadata['genres']`.
genre_columns = [f'genre_{i}' for i in range(19)]
movies_metadata['genres'] = movies_metadata[genre_columns].values.tolist()

# Lookup table: genre id -> genre name
genre_dict = dict(zip(genres['genre_id'], genres['genre']))

full_data = (
    ratings
    # attach title, release date and genre flags to each rating
    # (default inner join: ratings without a matching movie are dropped)
    .merge(movies_metadata[['movie_id', 'movie_title', 'release_date', 'genres']], on='movie_id')
    # translate the flag list into genre names; the position in the list is
    # used as the genre id
    .assign(
        genres_name=lambda df: df['genres'].apply(
            lambda flags: [genre_dict[i] for i, flag in enumerate(flags) if flag == 1]
        )
    )
    # the raw flag list is no longer needed once the names exist
    .drop(columns=['genres'])
    # attach the user attributes; a left join keeps every rating row
    .merge(users, on='user_id', how='left')
    .assign(
        # rating timestamps are interpreted as seconds since the Unix epoch
        timestamp=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
        # release dates look like '01-Jan-1995'; an explicit format makes the
        # parsing independent of pandas' format inference, and values that do
        # not match (or are missing) become NaT
        release_date=lambda df: pd.to_datetime(
            df['release_date'], format='%d-%b-%Y', errors='coerce'
        ),
    )
)

In [120]:
# Preview the first five rows of the merged ratings / movies / users table
full_data.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,release_date,genres_name,age,gender,occupation,zip_code
0,196,242,3.0,1997-12-04 15:55:49,Kolya (1996),1997-01-24,[Comedy],49,M,writer,55105
1,186,302,3.0,1998-04-04 19:22:22,L.A. Confidential (1997),1997-01-01,"[Crime, Film-Noir, Mystery, Thriller]",39,F,executive,0
2,22,377,1.0,1997-11-07 07:18:36,Heavyweights (1994),1994-01-01,"[Children's, Comedy]",25,M,writer,40206
3,244,51,2.0,1997-11-27 05:02:03,Legends of the Fall (1994),1994-01-01,"[Drama, Romance, War, Western]",28,M,technician,80525
4,166,346,1.0,1998-02-02 05:33:16,Jackie Brown (1997),1997-01-01,"[Crime, Drama]",47,M,educator,55113
