In [152]:
!pip3 install numpy pandas

Defaulting to user installation because normal site-packages is not writeable


In [153]:
import pandas as pd
import numpy as np
from config import datasetPath
from os import path
import math
from pickle import dump
import random
np.random.seed(31415) # lucky pi seed
random.seed(27182) # lucky euler's number seed
# https://www.reddit.com/r/MachineLearning/comments/rkewa3/d_what_are_your_machine_learning_superstitions/

In [154]:
ratingsPath = path.join(datasetPath, "ratings.csv")
itemFeaturesPath = path.join(datasetPath, 'movies.csv')
processedRatingsPath = path.join(datasetPath, "ratings.csv")
reformatItemFeaturesPath = path.join(datasetPath, 'item_features.csv')
trainDataPath = path.join(datasetPath, "training.csv")
testDataPath = path.join(datasetPath, "testing.csv")
validationDataPath = path.join(datasetPath, "validation.csv")

In [155]:
userInteractionData = pd.read_csv(ratingsPath)
itemFeatureData = pd.read_csv(itemFeaturesPath)
# drop timestamp feature
userInteractionData.drop("timestamp", axis=1, inplace=True)


In [156]:
numUsers = len(set(userInteractionData["userId"]))
numMovies = len(set(userInteractionData["movieId"]))
print("Users:", numUsers)
print("Movies:", numMovies)


Users: 610
Movies: 9724


In [157]:
# # We only count the interaction in the interaction matrix if rating >= ratingCutoff
# ratingCutoff = 3
# # filter out sparse data
# numRows = len(userInteractionData)
# userInteractionData.drop(
#     userInteractionData.index[userInteractionData["rating"] < ratingCutoff],
#     axis=0,
#     inplace=True,
# )
# userInteractionData.drop("rating", axis=1, inplace=True)
# newNumRows = len(userInteractionData)
# print(
#     f"Filtered out {numRows - newNumRows}/{numRows} ({100 * (numRows - newNumRows) / numRows}%) through the rating cuttoff of {ratingCutoff}."
# )


In [158]:
freqCutoff = 5
# if a user has less than freqCutoff interactions, remove user.
numRows = len(userInteractionData)
userInteractionData["user_freq"] = userInteractionData.groupby("userId")[
    "userId"
].transform("count")
userInteractionData.drop(
    userInteractionData.index[userInteractionData["user_freq"] <= freqCutoff],
    inplace=True,
)
userInteractionData.drop("user_freq", axis=1, inplace=True)
# if a movie has less than freqCutoff interactions, remove movie.
userInteractionData["movie_freq"] = userInteractionData.groupby("movieId")[
    "movieId"
].transform("count")
userInteractionData.drop(
    userInteractionData.index[userInteractionData["movie_freq"] <= freqCutoff],
    inplace=True,
)
userInteractionData.drop("movie_freq", axis=1, inplace=True)
newNumRows = len(userInteractionData)
print(
    f"Filtered out {numRows - newNumRows}/{numRows} ({100 * (numRows - newNumRows) / numRows}%) through the frequency cuttoff of {freqCutoff}."
)


Filtered out 12472/100836 (12.368598516402871%) through the frequency cuttoff of 5.


In [159]:
numUsers = len(set(userInteractionData["userId"]))
numMovies = len(set(userInteractionData["movieId"]))
print("Users:", numUsers)
print("Movies:", numMovies)


Users: 610
Movies: 3268


In [160]:
userInteractionData

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100830,610,166528,4.0
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0


In [161]:
# shuffle dataset
len_df = len(userInteractionData)
train_ratio, test_ratio, validation_ratio = 0.7, 0.2, 0.1
train_n = math.floor(train_ratio * len_df)
test_n = math.floor(test_ratio * len_df)
validation_n = math.floor(validation_ratio * len_df)
row_indices = list(range(len_df))
np.random.shuffle(row_indices)
print("confirming shuffled.", row_indices[0:3])
train_split_indices = row_indices[:train_n]
test_split_indices = row_indices[train_n : train_n + test_n]
validation_split_indices = row_indices[train_n + test_n :]

confirming shuffled. [8619, 37623, 61390]


In [162]:
userInteractionData

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100830,610,166528,4.0
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0


In [163]:
# remap user + movie ids to be sequential after filtering some movies/users
userInteractionData.head()
# renumber user and movie ids
userMapping = {}
movieMapping = {}
user_count = 0
movie_count = 0
# may the data science lords forgive my inefficient pandas sins
for userInteractionIndex in range(len_df):
    userInteractionRow = userInteractionData.iloc[userInteractionIndex]
    userId = userInteractionRow['userId']
    movieId = userInteractionRow['movieId']
    rating = userInteractionRow['rating']
    if not userId in userMapping:
        userMapping[userId] = user_count
        user_count += 1
    if not movieId in movieMapping:
        movieMapping[movieId] = movie_count
        movie_count += 1
    newUserId = userMapping[userId]
    newMovieId = movieMapping[movieId]
    userInteractionData.iloc[userInteractionIndex] = [newUserId, newMovieId, rating]

In [164]:
userInteractionData

Unnamed: 0,userId,movieId,rating
0,0,0,4.0
1,0,1,4.0
2,0,2,4.0
3,0,3,5.0
4,0,4,5.0
...,...,...,...
100830,609,968,4.0
100831,609,2388,4.0
100832,609,1698,5.0
100833,609,2389,5.0


In [165]:
# split dataset
train_split = userInteractionData.iloc[train_split_indices]
test_split = userInteractionData.iloc[test_split_indices]
validation_split = userInteractionData.iloc[validation_split_indices]
#
train_split.reset_index(drop=True, inplace=True)
test_split.reset_index(drop=True, inplace=True)
validation_split.reset_index(drop=True, inplace=True)
print('train size:', train_split.size)
print('test size:', test_split.size)
print('validation size:', validation_split.size)

train size: 185562
test size: 53016
validation size: 26514


In [166]:
print('train split')
train_split

train split


Unnamed: 0,userId,movieId,rating
0,62,1655,5.0
1,281,1905,4.5
2,446,435,5.0
3,273,1556,4.0
4,453,1099,3.5
...,...,...,...
61849,168,1858,5.0
61850,572,17,4.0
61851,90,2052,2.0
61852,136,638,4.0


In [167]:

print('test split')
test_split

test split


Unnamed: 0,userId,movieId,rating
0,67,198,3.5
1,533,674,3.5
2,135,509,3.0
3,115,439,4.0
4,94,1758,5.0
...,...,...,...
17667,488,2443,4.0
17668,209,672,5.0
17669,413,2607,2.5
17670,408,150,2.0


In [168]:
print('validation split')
validation_split

validation split


Unnamed: 0,userId,movieId,rating
0,607,1909,4.0
1,134,44,5.0
2,140,958,5.0
3,127,47,4.0
4,605,1610,1.0
...,...,...,...
8833,25,19,3.0
8834,201,1749,3.0
8835,102,0,4.0
8836,45,0,5.0


In [169]:
# item features, in the form <movieId, genres, title>
reformatItemFeatures = pd.DataFrame(columns=['movieId', 'genres', 'title'])
for index, row in itemFeatureData.iterrows():
    movieId = itemFeatureData.at[index, 'movieId']
    if movieId in movieMapping:
        movieId = movieMapping[movieId]
    else:
        continue
    title = itemFeatureData.at[index, 'title']
    genres = itemFeatureData.at[index, 'genres']
    reformatItemFeatures = reformatItemFeatures.append({ 
        'movieId': movieId,
        'genres': genres,
        'title': title
    }, ignore_index=True)

In [170]:
reformatItemFeatures

Unnamed: 0,movieId,genres,title
0,0,Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995)
1,442,Adventure|Children|Fantasy,Jumanji (1995)
2,1,Comedy|Romance,Grumpier Old Men (1995)
3,443,Comedy|Drama|Romance,Waiting to Exhale (1995)
4,444,Comedy,Father of the Bride Part II (1995)
...,...,...,...
3263,2741,Adventure|Animation|Children,Coco (2017)
3264,2392,Action|Adventure|Children,Jumanji: Welcome to the Jungle (2017)
3265,2393,Action|Adventure|Fantasy|Sci-Fi,Star Wars: The Last Jedi (2017)
3266,3131,Adventure|Drama|Fantasy,The Shape of Water (2017)


In [171]:
reformatItemFeatures.to_csv(reformatItemFeaturesPath)

In [172]:
train_split.to_csv(trainDataPath)
test_split.to_csv(testDataPath)
validation_split.to_csv(validationDataPath)