# Notebook to load the MovieLens1M dataset

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse as sp
from sklearn.model_selection import KFold, train_test_split
from imblearn.over_sampling import RandomOverSampler

In [2]:
movielens_dir = r"datasets\ml-1m"

In [3]:
df_movies = pd.read_csv(os.path.join(movielens_dir, "movies.dat"), 
                        sep="::", engine="python", encoding='latin-1',
                        names=["MovieID", "Title", "Genres"])
df_movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_ratings = pd.read_csv(os.path.join(movielens_dir, "ratings.dat"), 
                        sep="::", engine="python", encoding='latin-1',
                        names=["UserID", "MovieID", "Rating", "Timestamp"])
df_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
df_ratings["Rating"].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: Rating, dtype: int64

In [6]:
df_users = pd.read_csv(os.path.join(movielens_dir, "users.dat"), 
                        sep="::", engine="python", encoding='latin-1',
                        names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
df_users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [7]:
n_users = len(df_users)
n_movies = len(df_movies)
n_ratings = len(df_ratings)
density = n_ratings / (n_movies * n_users)

# Show some statistics about the dataset
print("Number of users:", n_users)
print("Number of movies:", n_movies)
print("\nCounts of users per gender:")
print(df_users["Gender"].value_counts())

print("\nNumber of interactions:", n_ratings)
print(f"Density: {density:.4f}")

Number of users: 6040
Number of movies: 3883

Counts of users per gender:
M    4331
F    1709
Name: Gender, dtype: int64

Number of interactions: 1000209
Density: 0.0426


### Data preparation
For our use-case our end-result should be binary interaction matrix, where ```1``` denotes that a user
rated a movie, and ```0``` that she did not.

In [8]:
# some movie (and their indices) are missing, lets therefore adjust / re-enumerate the indices
movie_rename_dict = {row["MovieID"]: i for i, row in df_movies.iterrows()}
df_movies["MovieID"] = df_movies["MovieID"].replace(movie_rename_dict)
df_ratings["MovieID"] = df_ratings["MovieID"].replace(movie_rename_dict)

In [9]:
# get user and movie ids from ratings df, -1 as the first user originally received the id 1
user_ids = df_ratings["UserID"] - 1
movie_ids = df_ratings["MovieID"]
values = np.ones(len(user_ids))

interaction_matrix = sp.csr_matrix((values, (user_ids, movie_ids)), shape=(n_users, n_movies))
display(interaction_matrix.shape)

# store results
storage_dir = os.path.join(movielens_dir, "full")
os.makedirs(storage_dir, exist_ok=True)
sp.save_npz(os.path.join(storage_dir, "interactions.npz"), interaction_matrix)

# check whether all interactions were actually kept
print("Number of interactions (again):", interaction_matrix.toarray().sum())

(6040, 3883)

Number of interactions (again): 1000209.0


In [10]:
# create new user file for our usage
df_user_info = df_users[["UserID", "Gender"]].copy()
df_user_info["UserID"] -= 1 # move start index from 1 to 0
df_user_info["Gender"] = df_user_info["Gender"].apply(lambda item: item.lower())
df_user_info.rename({"UserID": "userID", "Gender": "gender"}, inplace=True, axis=1)

df_user_info.to_csv(os.path.join(storage_dir, "user_info.csv"), index=False)
df_user_info.head()

Unnamed: 0,userID,gender
0,0,f
1,1,m
2,2,m
3,3,m
4,4,m


In [11]:
# filter users & tracks with too less interaction
min_interactions_user = 5
min_interactions_movie = 5

# Remove until there are enough interactions from each side
while True:
    n_interactions_per_user = np.array(interaction_matrix.sum(axis=1)).flatten()
    n_interactions_per_movie = np.array(interaction_matrix.sum(axis=0)).flatten()
    
    # filter movies with too less interactions
    enough_interactions_movie = n_interactions_per_movie >= min_interactions_movie
    interaction_matrix = interaction_matrix[:, enough_interactions_movie]
    
    # only keep those users with enough interactions
    enough_interactions_user = n_interactions_per_user >= min_interactions_user
    df_user_info = df_user_info.loc[enough_interactions_user]
    df_user_info.reset_index(drop=True, inplace=True)

    interaction_matrix = interaction_matrix[enough_interactions_user]

    # reassign index
    df_user_info["userID"] = df_user_info.index
    
    if np.sum(enough_interactions_movie == False) == 0 \
         and np.sum(enough_interactions_user == False) == 0:
        break
        
print("Final shape of interactions matrix is", interaction_matrix.shape)
print("==> {} users and {} movies are remaining.".format(*interaction_matrix.shape))
        
df_user_info["userID"] = df_user_info.index
df_user_info

Final shape of interactions matrix is (6040, 3416)
==> 6040 users and 3416 movies are remaining.


Unnamed: 0,userID,gender
0,0,f
1,1,m
2,2,m
3,3,m
4,4,m
...,...,...
6035,6035,f
6036,6036,f
6037,6037,f
6038,6038,f


In [12]:
n_users = interaction_matrix.shape[0]
n_items = interaction_matrix.shape[1]
n_interactions = int(interaction_matrix.sum())
density = n_interactions / (n_items * n_users)

print(f"New number of interactions is {n_interactions},")
print(f"which leads to a density of {density:.4f}.")

New number of interactions is 999611,
which leads to a density of 0.0484.


In [13]:
# store filtered results
storage_dir = os.path.join(movielens_dir, f"user_gte_{min_interactions_user}_movie_gte_{min_interactions_movie}")
os.makedirs(storage_dir, exist_ok=True)

df_user_info.to_csv(os.path.join(storage_dir, "user_info.csv"), index=False)
sp.save_npz(os.path.join(storage_dir, "interactions.npz"), interaction_matrix)