This notebook splits the data into training and testing set for each user such that 80% ratings are in training and 20% are for testing.

In [31]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [32]:
movies = pd.read_csv('../5_data/input/movies.csv', converters={"genres": literal_eval})
ratings = pd.read_csv('../5_data/input/ratings.csv')

print('movies: ', movies.shape)
print('ratings: ', ratings.shape)

print(movies.dtypes)
print(ratings.dtypes)


movies:  (45463, 3)
ratings:  (99933, 4)
tmdbId             int64
original_title    object
genres            object
dtype: object
userId         int64
tmdbId         int64
rating       float64
timestamp      int64
dtype: object


In [33]:
movies.head(5)

Unnamed: 0,tmdbId,original_title,genres
0,862,Toy Story,"[Animation, Comedy, Family]"
1,8844,Jumanji,"[Adventure, Fantasy, Family]"
2,15602,Grumpier Old Men,"[Romance, Comedy]"
3,31357,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,11862,Father of the Bride Part II,[Comedy]


In [34]:
ratings.head(5)

Unnamed: 0,userId,tmdbId,rating,timestamp
0,1,9909,2.5,1260759144
1,7,9909,3.0,851868750
2,31,9909,4.0,1273541953
3,32,9909,4.0,834828440
4,36,9909,3.0,847057202


In [35]:
df = pd.merge(ratings, movies, on = 'tmdbId' , how = 'left')
df = df.drop('original_title', axis = 1)
df.head(5)

Unnamed: 0,userId,tmdbId,rating,timestamp,genres
0,1,9909,2.5,1260759144,"[Drama, Crime]"
1,7,9909,3.0,851868750,"[Drama, Crime]"
2,31,9909,4.0,1273541953,"[Drama, Crime]"
3,32,9909,4.0,834828440,"[Drama, Crime]"
4,36,9909,3.0,847057202,"[Drama, Crime]"


In [36]:
df.shape

(99973, 5)

In [9]:
# df['genres'] = df['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [37]:
df.head()

Unnamed: 0,userId,tmdbId,rating,timestamp,genres
0,1,9909,2.5,1260759144,"[Drama, Crime]"
1,7,9909,3.0,851868750,"[Drama, Crime]"
2,31,9909,4.0,1273541953,"[Drama, Crime]"
3,32,9909,4.0,834828440,"[Drama, Crime]"
4,36,9909,3.0,847057202,"[Drama, Crime]"


#### Split into train and test data

In [38]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df.userId)

In [39]:
train_data = train_data.sort_values(['userId', 'tmdbId'])
train_data.head()

Unnamed: 0,userId,tmdbId,rating,timestamp,genres
581,1,97,4.0,1260759139,"[Science Fiction, Action, Adventure]"
442,1,152,2.5,1260759135,"[Science Fiction, Adventure, Mystery]"
305,1,783,2.0,1260759148,"[Drama, History]"
664,1,847,2.0,1260759198,"[Adventure, Drama, Action]"
117,1,1103,2.0,1260759185,"[Science Fiction, Action]"


In [40]:
test_data = test_data.sort_values(['userId','tmdbId'])
test_data.head()

Unnamed: 0,userId,tmdbId,rating,timestamp,genres
259,1,665,2.0,1260759187,"[Action, Adventure, Drama, History]"
84,1,819,3.0,1260759182,"[Crime, Drama, Thriller]"
535,1,1051,4.0,1260759191,"[Action, Crime, Thriller]"
403,1,1598,2.0,1260759131,"[Crime, Thriller]"
1779,2,197,4.0,835355532,"[Action, Drama, History, War]"


#### Save the dataframes as csv files

In [41]:
train_data.to_csv('../5_data/processed/training_data.csv', index = False)
test_data.to_csv('../5_data/processed/testing_data.csv', index = False)