In [101]:
import pandas as pd
import numpy as np
from config import datasetPath
from os import path
import math
from sklearn.model_selection import train_test_split


In [102]:
# Path to the ratings CSV, built by joining the dataset directory imported from
# `config` with the file name "ratings.csv".
ratingsPath = path.join(datasetPath, "ratings.csv")


In [103]:
# Read the ratings table and discard its "timestamp" column in a single chained
# expression, so the frame is never mutated in place.
userInteractionData = pd.read_csv(ratingsPath).drop(columns="timestamp")


In [104]:
# Number of distinct user ids and movie ids in the raw interaction table.
numUsers = userInteractionData["userId"].nunique()
numMovies = userInteractionData["movieId"].nunique()
print("Users:", numUsers)
print("Movies:", numMovies)


Users: 610
Movies: 9724


In [105]:
# An interaction only counts toward the interaction matrix when its rating is
# not below ratingCutoff.
ratingCutoff = 3
numRows = len(userInteractionData)
# Select the rows whose rating is not below the cutoff, then discard the
# "rating" column: from here on every remaining row is an implicit positive
# interaction. Index labels of the surviving rows are left unchanged.
userInteractionData = (
    userInteractionData
    .loc[~(userInteractionData["rating"] < ratingCutoff)]
    .drop(columns="rating")
)
newNumRows = len(userInteractionData)
print(
    f"Filtered out {numRows - newNumRows}/{numRows} ({100 * (numRows - newNumRows) / numRows}%) through the rating cuttoff of {ratingCutoff}."
)


Filtered out 19073/100836 (18.914871672815263%) through the rating cuttoff of 3.


In [106]:
freqCutoff = 5
numRows = len(userInteractionData)
# Remove users, then movies, that have freqCutoff or fewer interactions
# (i.e. only ids with strictly more than freqCutoff rows survive).
# The movie counts are computed after the user filter has been applied, and the
# filter is a single pass: it is not repeated until both conditions hold.
for idColumn in ("userId", "movieId"):
    interactionCounts = userInteractionData.groupby(idColumn)[idColumn].transform(
        "count"
    )
    userInteractionData = userInteractionData.loc[
        ~(interactionCounts <= freqCutoff)
    ]
# Detach the result from the frames it was sliced from so that later column
# assignments operate on an independent frame.
userInteractionData = userInteractionData.copy()
newNumRows = len(userInteractionData)
print(
    f"Filtered out {numRows - newNumRows}/{numRows} ({100 * (numRows - newNumRows) / numRows}%) through the frequency cuttoff of {freqCutoff}."
)


Filtered out 11146/81763 (13.632082971515233%) through the frequency cuttoff of 5.


In [107]:
# Recount distinct users and movies now that the rating and frequency filters
# have been applied; these sizes define the interaction matrix dimensions.
numUsers, numMovies = (
    userInteractionData[idColumnName].nunique()
    for idColumnName in ("userId", "movieId")
)
print("Users:", numUsers)
print("Movies:", numMovies)


Users: 608
Movies: 2655


In [108]:
# Shuffle the row positions and split them into train / test / validation.
# The shuffle uses a dedicated, seeded generator so that Restart & Run All
# reproduces exactly the same split.
SPLIT_SEED = 42
len_df = len(userInteractionData)
train_ratio, test_ratio, validation_ratio = 0.7, 0.2, 0.1
train_n = math.floor(train_ratio * len_df)
test_n = math.floor(test_ratio * len_df)
# Validation receives every row left over after train and test, so rows lost
# to flooring are not dropped; validation_n now reports that actual size
# (approximately validation_ratio of the data).
validation_n = len_df - train_n - test_n
row_indices = list(range(len_df))
np.random.RandomState(SPLIT_SEED).shuffle(row_indices)
print("confirming shuffled.", row_indices[0:3])
# These lists hold row *positions* (0..len_df-1), not index labels.
train_split_indices = row_indices[:train_n]
test_split_indices = row_indices[train_n : train_n + test_n]
validation_split_indices = row_indices[train_n + test_n :]
assert (
    len(train_split_indices) + len(test_split_indices) + len(validation_split_indices)
    == len_df
), "train/test/validation positions must partition the data set"
assert len(validation_split_indices) == validation_n
# Translate positions into the frame's index labels (the index is not
# contiguous after filtering, so labels and positions differ).
train_split = userInteractionData.index[train_split_indices]
test_split = userInteractionData.index[test_split_indices]
validation_split = userInteractionData.index[validation_split_indices]
print('train size:', train_split.size)
print('test size:', test_split.size)
print('validation size:', validation_split.size)

confirming shuffled. [46734, 6143, 53432]
train size: 49431
test size: 14123
validation size: 7063


In [109]:
# Display the filtered interaction table. The earlier filtering steps drop rows
# without resetting the index, so the index labels shown here are not contiguous.
userInteractionData

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50
...,...,...
100829,610,164179
100830,610,166528
100832,610,168248
100833,610,168250


In [110]:
# Renumber user and movie ids to contiguous 0-based indices so they can be used
# directly as row / column positions of the interaction matrices.
#
# pd.factorize assigns codes in order of first appearance, which is the same
# numbering a row-by-row scan would produce. Assigning whole columns replaces
# the previous per-row `df.iloc[i]['col'] = value` writes, which go through
# chained indexing and are not guaranteed to reach the DataFrame.
#
# NOTE: this cell overwrites the raw ids; re-running it on already renumbered
# data would rebuild the mappings from the renumbered ids instead of raw ones.
userCodes, userUniques = pd.factorize(userInteractionData["userId"])
movieCodes, movieUniques = pd.factorize(userInteractionData["movieId"])

# raw id -> new contiguous index
userMapping = {rawId: newId for newId, rawId in enumerate(userUniques.tolist())}
movieMapping = {rawId: newId for newId, rawId in enumerate(movieUniques.tolist())}
user_count = len(userMapping)
movie_count = len(movieMapping)

userInteractionData["userId"] = userCodes
userInteractionData["movieId"] = movieCodes

# Every id must now be a valid 0-based position.
assert userInteractionData["userId"].between(0, user_count - 1).all()
assert userInteractionData["movieId"].between(0, movie_count - 1).all()

In [113]:
def buildInteractionMatrix(interactions, shape):
    """Return a dense 0/1 matrix of the given shape.

    Parameters:
        interactions: DataFrame with integer "userId" and "movieId" columns
            holding 0-based row / column positions.
        shape: (number of users, number of movies) tuple.

    Returns:
        A float numpy array with 1 at every (userId, movieId) pair present in
        `interactions` and 0 everywhere else.
    """
    matrix = np.zeros(shape)
    matrix[
        interactions["userId"].to_numpy(), interactions["movieId"].to_numpy()
    ] = 1
    return matrix


# The ids are used directly as matrix positions, so they must already have been
# renumbered to 0..numUsers-1 / 0..numMovies-1. Fail with a clear message
# instead of an out-of-bounds error deep inside numpy.
assert (
    userInteractionData["userId"].between(0, numUsers - 1).all()
    and userInteractionData["movieId"].between(0, numMovies - 1).all()
), "userId / movieId must be renumbered to contiguous 0-based indices first"

# Index labels of the rows in each split (kept for reference).
trainRows = userInteractionData.index[train_split_indices]
testRows = userInteractionData.index[test_split_indices]
validationRows = userInteractionData.index[validation_split_indices]

# The split lists hold row positions, so select the actual rows with .iloc.
# Iterating over the label Index (as before) yields plain integers, which
# cannot be subscripted with 'userId' / 'movieId'.
matrixShape = (numUsers, numMovies)
train_matrix = buildInteractionMatrix(
    userInteractionData.iloc[train_split_indices], matrixShape
)
test_matrix = buildInteractionMatrix(
    userInteractionData.iloc[test_split_indices], matrixShape
)
validation_matrix = buildInteractionMatrix(
    userInteractionData.iloc[validation_split_indices], matrixShape
)

Int64Index([64933,  8584, 75086, 63725, 59194, 12619, 99163,   217, 27909,
            14915,
            ...
            44622, 68737, 95927, 22883, 78559,   890, 36535,  2149, 54167,
             6839],
           dtype='int64', length=49431)
