In [32]:
!pip3 install numpy pandas

In [None]:
import pandas as pd
import numpy as np
from config import datasetPath
from os import path
import math
from pickle import dump

In [None]:
def _in_dataset(filename):
    """Return the path of `filename` inside the configured dataset directory."""
    return path.join(datasetPath, filename)


# Raw CSV inputs.
ratingsPath = _in_dataset("ratings.csv")
itemFeaturesPath = _in_dataset('movies.csv')
# Files produced by this notebook.
reformatItemFeaturesPath = _in_dataset('item_features.csv')
trainDataPath = _in_dataset("training.pickle")
testDataPath = _in_dataset("testing.pickle")
validationDataPath = _in_dataset("validation.pickle")

In [None]:
# Read the user/movie interaction log and the per-movie metadata table.
userInteractionData = pd.read_csv(ratingsPath)
itemFeatureData = pd.read_csv(itemFeaturesPath)
# Discard the timestamp column of the interaction log.
userInteractionData = userInteractionData.drop(columns="timestamp")


In [None]:
# Number of distinct users and movies in the interaction table.
numUsers, numMovies = (
    len(set(userInteractionData[idColumn])) for idColumn in ("userId", "movieId")
)
for label, count in (("Users:", numUsers), ("Movies:", numMovies)):
    print(label, count)


Users: 610
Movies: 9724


In [None]:
# An interaction only counts when its rating is at least ratingCutoff.
ratingCutoff = 3
numRows = len(userInteractionData)
# Remove every row rated below the cutoff, and the rating column itself:
# from here on the table is implicit feedback (a row means "interacted").
belowCutoffLabels = userInteractionData.index[
    userInteractionData["rating"] < ratingCutoff
]
userInteractionData.drop(index=belowCutoffLabels, columns="rating", inplace=True)
newNumRows = len(userInteractionData)
numDropped = numRows - newNumRows
print(
    f"Filtered out {numDropped}/{numRows} ({100 * numDropped / numRows}%) through the rating cuttoff of {ratingCutoff}."
)


Filtered out 19073/100836 (18.914871672815263%) through the rating cuttoff of 3.


In [None]:
freqCutoff = 5
numRows = len(userInteractionData)
# Two sequential passes: first users, then movies. In each pass every id that
# occurs in freqCutoff or fewer rows is removed (the comparison is `<=`, so an
# id with exactly freqCutoff interactions is dropped as well).
# The movie pass runs on the table left by the user pass; users are not
# re-checked afterwards, so a user may end up at or below the cutoff again.
for idColumn in ("userId", "movieId"):
    interactionsPerId = userInteractionData.groupby(idColumn)[idColumn].transform(
        "count"
    )
    sparseLabels = userInteractionData.index[interactionsPerId <= freqCutoff]
    userInteractionData.drop(index=sparseLabels, inplace=True)
newNumRows = len(userInteractionData)
numDropped = numRows - newNumRows
print(
    f"Filtered out {numDropped}/{numRows} ({100 * numDropped / numRows}%) through the frequency cuttoff of {freqCutoff}."
)


Filtered out 11146/81763 (13.632082971515233%) through the frequency cuttoff of 5.


In [None]:
# Recount the distinct users and movies in the interaction table; these
# values size the interaction matrices built further down.
numUsers, numMovies = (
    len(set(userInteractionData[idColumn])) for idColumn in ("userId", "movieId")
)
for label, count in (("Users:", numUsers), ("Movies:", numMovies)):
    print(label, count)


Users: 608
Movies: 2655


In [None]:
# Shuffle the row positions and split them into train / test / validation.
len_df = len(userInteractionData)
train_ratio, test_ratio, validation_ratio = 0.7, 0.2, 0.1
train_n = math.floor(train_ratio * len_df)
test_n = math.floor(test_ratio * len_df)
# Only a lower bound: the validation split takes every row left after train
# and test, so rows lost to flooring are not silently discarded.
validation_n = math.floor(validation_ratio * len_df)
# Fixed seed so that Restart & Run All reproduces the same split (and hence
# the same pickled matrices). A local generator is used so the global NumPy
# random state is left untouched.
splitSeed = 0
row_indices = list(range(len_df))
np.random.default_rng(splitSeed).shuffle(row_indices)
print("confirming shuffled.", row_indices[0:3])
# *_split_indices are positions (for .iloc); *_split are the matching labels.
train_split_indices = row_indices[:train_n]
test_split_indices = row_indices[train_n : train_n + test_n]
validation_split_indices = row_indices[train_n + test_n :]
train_split = userInteractionData.index[train_split_indices]
test_split = userInteractionData.index[test_split_indices]
validation_split = userInteractionData.index[validation_split_indices]
# The three splits must partition the table exactly.
assert train_split.size + test_split.size + validation_split.size == len_df
print('train size:', train_split.size)
print('test size:', test_split.size)
print('validation size:', validation_split.size)

confirming shuffled. [50486, 61951, 8124]
train size: 49431
test size: 14123
validation size: 7063


In [None]:
# Display the interaction table as it stands after the filtering cells above.
userInteractionData

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50
...,...,...
100829,610,164179
100830,610,166528
100832,610,168248
100833,610,168250


In [None]:
# Renumber user and movie ids to dense, zero-based indices so they can be used
# directly as row / column positions of the interaction matrices.
#
# pd.factorize numbers values in order of first appearance, which is the same
# numbering a row-by-row scan of the table would produce. The codes are
# written back with a plain column assignment: assigning through
# `df.iloc[i].loc[col] = ...` targets a temporary Series and is not guaranteed
# to modify the DataFrame.
#
# NOTE: run this cell once per kernel. Running it again would rebuild the
# mappings from the already-renumbered ids and lose the original ids.
user_codes, user_uniques = pd.factorize(userInteractionData["userId"])
movie_codes, movie_uniques = pd.factorize(userInteractionData["movieId"])

# original id -> new dense index
userMapping = {original: new for new, original in enumerate(user_uniques.tolist())}
movieMapping = {original: new for new, original in enumerate(movie_uniques.tolist())}
user_count = len(userMapping)
movie_count = len(movieMapping)

userInteractionData["userId"] = user_codes
userInteractionData["movieId"] = movie_codes

In [None]:
def _interaction_matrix(rows, shape):
    """Return a dense matrix of `shape` with 1 at every (userId, movieId) pair in `rows`.

    `rows` is a DataFrame whose integer 'userId' and 'movieId' columns are used
    as row and column positions; all other entries stay 0.
    """
    matrix = np.zeros(shape)
    # One vectorised fancy-index assignment instead of a Python loop per row.
    matrix[rows['userId'].to_numpy(), rows['movieId'].to_numpy()] = 1
    return matrix


# Select the rows of each split by position, then build one matrix per split.
trainRows = userInteractionData.iloc[train_split_indices]
testRows = userInteractionData.iloc[test_split_indices]
validationRows = userInteractionData.iloc[validation_split_indices]
train_matrix = _interaction_matrix(trainRows, (numUsers, numMovies))
test_matrix = _interaction_matrix(testRows, (numUsers, numMovies))
validation_matrix = _interaction_matrix(validationRows, (numUsers, numMovies))

In [None]:
# item features, in the form <movieId, feature, value>
# Collect the rows in a plain list and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
itemFeatureRecords = []
for rawMovieId, title, genres in zip(
    itemFeatureData['movieId'], itemFeatureData['title'], itemFeatureData['genres']
):
    # Movies absent from movieMapping have no renumbered id, so skip them.
    if rawMovieId not in movieMapping:
        continue
    movieId = movieMapping[rawMovieId]
    itemFeatureRecords.append({
        'movieId': movieId,
        'feature': 'title',
        'value': title
    })
    itemFeatureRecords.append({
        'movieId': movieId,
        'feature': 'genres',
        'value': genres
    })
# Passing `columns` keeps the expected schema even when no movie matched.
reformatItemFeatures = pd.DataFrame(
    itemFeatureRecords, columns=['movieId', 'feature', 'value']
)

In [None]:
# Display the long-format item feature table (two rows per kept movie:
# one 'title' row and one 'genres' row).
reformatItemFeatures

Unnamed: 0,movieId,feature,value
0,0,title,Toy Story (1995)
1,0,genres,Adventure|Animation|Children|Comedy|Fantasy
2,375,title,Jumanji (1995)
3,375,genres,Adventure|Children|Fantasy
4,1,title,Grumpier Old Men (1995)
...,...,...,...
5305,1961,genres,Action|Adventure|Fantasy|Sci-Fi
5306,2582,title,The Shape of Water (2017)
5307,2582,genres,Adventure|Drama|Fantasy
5308,1415,title,Deadpool 2 (2018)


In [None]:
# Write the long-format item features to CSV. `index=False` is not passed, so
# the DataFrame index is written as the first (unnamed) column of the file.
reformatItemFeatures.to_csv(reformatItemFeaturesPath)

NameError: name 'reformatItemFeaturesPath' is not defined

In [None]:
# Pickle each interaction matrix to its own file.
for matrixPath, matrix in (
    (trainDataPath, train_matrix),
    (testDataPath, test_matrix),
    (validationDataPath, validation_matrix),
):
    with open(matrixPath, "wb") as matrixFile:
        dump(matrix, matrixFile)