In [1]:
## 1.Import libraries

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import json

In [2]:
## 2.Import MovieLens dataset

path_movielens = "data/MovieLens 25m/"
links_df       = pd.read_csv(path_movielens+'links.csv')
movies_df      = pd.read_csv(path_movielens+'movies.csv')
ratings_df     = pd.read_csv(path_movielens+'ratings.csv')
print('MovieLens after importing')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

MovieLens after importing
number of movies: 62423
number of ratings: 25000095
---------------------------------------------------


In [3]:
## 3.Preprocess MovieLens dataset

# Delete timestamp column
ratings_df.drop(columns=['timestamp'],inplace=True)

# Make genre words equal, eg SciFi and Sci-Fi, become both Scifi
def process_genres(input_string):
    processed_string = input_string.lower()
    processed_string = input_string.replace('-','') # Remove '-'
    processed_string = processed_string.title()     # Capitalize first letter of every word
    return processed_string
movies_df['genres'] = movies_df['genres'].apply(lambda x:process_genres(x))

# Merge links and movies, delete movies with missing links
movies_df = pd.merge(links_df, movies_df, on='movieId', how='right') 
missing_links       = movies_df[movies_df['tmdbId'].isnull()==True]['movieId'].tolist()
movies_df           = movies_df[~movies_df['movieId'].isin(missing_links)]
movies_df['tmdbId'] = movies_df['tmdbId'].astype(int)
ratings_df          = ratings_df[~ratings_df['movieId'].isin(missing_links)]
print('MovieLens after removing movies with missing links')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

# Drop duplicates
movies_df.drop_duplicates(subset=['title'], keep=False, inplace=True)
movies_df.drop_duplicates(subset=['movieId'], keep=False, inplace=True)
movies_df.drop_duplicates(subset=['tmdbId'], keep=False, inplace=True)
ratings_df.drop_duplicates(inplace=True)
ids = movies_df['movieId'].tolist()
ratings_df = ratings_df[ratings_df['movieId'].isin(ids)]
print('MovieLens after removing duplicates')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

# Exclude movies without genre info
excluded_movies    = movies_df[movies_df['genres']=='(No Genres Listed)']
excluded_movie_ids = excluded_movies['movieId'].tolist()
movies_df          = movies_df[~movies_df['movieId'].isin(excluded_movie_ids)]
ratings_df         = ratings_df[~ratings_df['movieId'].isin(excluded_movie_ids)]
print('MovieLens after removing movies without genres')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

# Exclude movies without year
year_col = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df.insert(loc=3, column='year', value=year_col)
movies_df['title'] = movies_df['title'].str.replace(r'\((\d{4})\)','',regex=True)
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_without_year = movies_df[movies_df['year'].isnull()==True]
movies_without_year = movies_without_year['movieId'].tolist()
movies_df.dropna(inplace=True)
movies_df['year'] = movies_df['year'].astype(int)
ratings_df = ratings_df[~ratings_df['movieId'].isin(movies_without_year)]
print('MovieLens after removing movies without year')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')


MovieLens after removing movies with missing links
number of movies: 62316
number of ratings: 24997664
---------------------------------------------------
MovieLens after removing duplicates
number of movies: 62074
number of ratings: 24915121
---------------------------------------------------
MovieLens after removing movies without genres
number of movies: 57029
number of ratings: 24888615
---------------------------------------------------
MovieLens after removing movies without year
number of movies: 56885
number of ratings: 24879976
---------------------------------------------------


In [4]:
## 4.Preprocess TMDB5000 dataset

path_tmdb       = 'data/TMDB5000/'
tmdb_movies_df  = pd.read_csv(path_tmdb+'tmdb_5000_movies.csv')
tmdb_credits_df = pd.read_csv(path_tmdb+'tmdb_5000_credits.csv')

tmdb_df            = pd.DataFrame(columns=['tmdbId','budget','revenue','runtime'])
tmdb_df['tmdbId']  = tmdb_credits_df['movie_id']
tmdb_df['budget']  = tmdb_movies_df['budget']
tmdb_df['revenue'] = tmdb_movies_df['revenue']
tmdb_df['runtime'] = tmdb_movies_df['runtime']

# Fetch the first director
def fetch_directors(x):
    director_name = 0
    for member in json.loads(x):
        if member['job']=='Director':
            director_name = member['name']
            break
    return director_name

# Fetch the leading cast member
def fetch_cast(x):
    cast_name = 0
    for member in json.loads(x):
        cast_name = member['name']
        break
    return cast_name

tmdb_df['director']  = tmdb_credits_df['crew'].apply(fetch_directors)
tmdb_df['cast']      = tmdb_credits_df['cast'].apply(fetch_cast)

# Remove movies that have missing values from columns
tmdb_df.replace(0, np.nan, inplace=True)
tmdb_df.dropna(inplace=True)
tmdb_df['runtime'] = tmdb_df['runtime'].apply(lambda x: int(x))
tmdb_df['budget']  = tmdb_df['budget'].apply(lambda x: int(x))
tmdb_df['revenue'] = tmdb_movies_df['revenue'].apply(lambda x: int(x))
print('TMDB5000 after preprocessing')
print(f'number of movies: {tmdb_df.shape[0]}')
print('---------------------------------------------------')

TMDB5000 after preprocessing
number of movies: 3225
---------------------------------------------------


In [5]:
## 5.Merge MovieLens with TMDB5000 (keep movies that exist in both)

# Keep only ids that are common between the 2 datasets
common_elements = [element for element in tmdb_df['tmdbId'].tolist() if element in movies_df['tmdbId'].tolist()]

tmdb_df    = tmdb_df[tmdb_df['tmdbId'].isin(common_elements)]
movies_df  = movies_df[movies_df['tmdbId'].isin(common_elements)].reset_index(drop=True)
ratings_df = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])].reset_index(drop=True)

# Convert tmdb_ids to movie_ids in TMDB5000
tmdbId2movieId    = dict(zip(movies_df['tmdbId'], movies_df['movieId']))
tmdb_df['tmdbId'] = tmdb_df['tmdbId'].map(tmdbId2movieId)
tmdb_df.rename(columns={'tmdbId':'movieId'}, inplace=True)

# Sort movie_ids in TMDB5000 in the same order as in movies_df
tmdb_df = tmdb_df.sort_values(by='movieId').reset_index(drop=True)
movies_df.drop(columns=['imdbId','tmdbId'], inplace=True)

print('MovieLens after merging with TMDB5000')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

MovieLens after merging with TMDB5000
number of movies: 3184
number of ratings: 16078580
---------------------------------------------------


In [6]:
# 6.Process final dataset

# Εxclude movies before year N
year_threshold = 2003
movies_df      = movies_df[movies_df['year']>=year_threshold]
test_ids       = movies_df['movieId'].tolist()
ratings_df     = ratings_df[ratings_df['movieId'].isin(test_ids)]
print(f'MovieLens after removing movies before year {year_threshold}')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

# Filter out movies without ratings
movie_ratings_count   = ratings_df.groupby('movieId')['rating'].count()
min_ratings_threshold = 1
popular_movies        = movie_ratings_count[movie_ratings_count >= min_ratings_threshold]
popular_movie_ids     = popular_movies.index.tolist()
ratings_df            = ratings_df[ratings_df['movieId'].isin(popular_movie_ids)]
movies_df             = movies_df[movies_df['movieId'].isin(popular_movie_ids)]
print('MovieLens after filtering out movies with low amount of ratings')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

# # Filter out users without ratings
user_ratings_count      = ratings_df.groupby('userId')['rating'].count()
min_ratings_threshold   = 1
user_with_N_ratings     = user_ratings_count[user_ratings_count >= min_ratings_threshold]
user_with_N_ratings_ids = user_with_N_ratings.index.tolist()
ratings_df              = ratings_df[ratings_df['userId'].isin(user_with_N_ratings_ids)]
print('MovieLens after filtering out users with low amount of ratings')
print(f'number of movies: {movies_df.shape[0]}\nnumber of ratings: {ratings_df.shape[0]}')
print('---------------------------------------------------')

MovieLens after removing movies before year 2003
number of movies: 1888
number of ratings: 5171040
---------------------------------------------------
MovieLens after filtering out movies with low amount of ratings
number of movies: 1888
number of ratings: 5171040
---------------------------------------------------
MovieLens after filtering out users with low amount of ratings
number of movies: 1888
number of ratings: 5171040
---------------------------------------------------


In [7]:
## 7.Preprocess oscars dataset
path_oscars = 'data/OSCARS/'
oscars_df   = pd.read_csv(path_oscars+'the_oscar_award.csv')

def replace_NaN(input_col):
    if pd.isna(input_col):
        return 0
    return input_col

# Create dataframe for movies nominated/awarded with best picture Oscar
best_picture_ = oscars_df[oscars_df['category'].isin(['OUTSTANDING PICTURE', 'BEST PICTURE'])]
best_picture_['best_picture_nominee'] = (best_picture_['winner']==0).astype(int)
best_picture_['best_picture_winner']  = best_picture_['winner'].astype(int)
best_picture_ = best_picture_[['film', 'best_picture_nominee', 'best_picture_winner']]
best_picture_.rename(columns={'film':'title'},inplace=True)

# Create dataframe for total number of nominations/awards for movies
tmp1 = oscars_df['film'].dropna().value_counts().reset_index()
tmp2 = oscars_df[oscars_df['winner'] == True]['film'].dropna().value_counts().reset_index()
total_oscars_ = pd.merge(tmp1, tmp2, on='film', how='outer')
total_oscars_.rename(columns={'count_x': 'oscar_noms', 'count_y': 'oscar_wins'}, inplace=True)
total_oscars_['oscar_wins'] = total_oscars_['oscar_wins'].apply(replace_NaN).astype(int)
total_oscars_.rename(columns={'film':'title'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_picture_['best_picture_nominee'] = (best_picture_['winner']==0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_picture_['best_picture_winner']  = best_picture_['winner'].astype(int)


In [8]:
## 8.Create user vector

# Group ratings dataframe by 'userId' to get rating count, average rating and std per user
user_ratings_info = ratings_df.groupby('userId')['rating'].agg(['count', 'mean','std']).reset_index()
user_ratings_info.columns = ['userId', 'rating_count', 'average_rating','variability']

# Merge ratings with movies based on 'movieId' to include movie genres in the resulting dataframe
user_movie_info = pd.merge(ratings_df, movies_df, on='movieId', how='left')

# Use one-hot encoding to represent movie genres as separate columns
genres_one_hot         = user_movie_info['genres'].str.get_dummies('|')
original_one_hot_cols  = genres_one_hot.columns
genres_one_hot.columns = [f'{col}_avg_rating' for col in genres_one_hot.columns]
genre_columns          = genres_one_hot.columns

# Concatenate the one-hot encoded genres with user_movie_info dataframe
user_movie_info = pd.concat([user_movie_info, genres_one_hot], axis=1)
# Drop the 'genres' column since we have it represented in one-hot encoding
user_movie_info.drop(columns=['genres'], inplace=True)

# Calculate the weighted sum of the one-hot encoded genre columns by the ratings
user_movie_info[genre_columns] = user_movie_info[genre_columns].multiply(user_movie_info['rating'], axis=0)

# Group by 'userId' and calculate the sum of the weighted genre columns for each user
user_genre_sum_rating = user_movie_info.groupby('userId')[genre_columns].sum().reset_index()
# user_genre_sum_rating.drop(columns=['userId'],inplace=True)

# Calculate the count of movies of each genre rated by the user
user_genre_count = user_movie_info.groupby('userId')[genre_columns].apply(lambda x: (x > 0).sum()).reset_index()
# user_genre_count.drop(columns=['userId'],inplace=True)

# Calculate the average rating of every genre for the user
user_genre_avg_rating = user_genre_sum_rating.iloc[:, 1:] / user_genre_count.iloc[:, 1:]

# Calculate user ratings percentage per genre
user_pct_genre = user_genre_count.iloc[:, 1:].div(user_ratings_info['rating_count'], axis=0)
user_pct_genre.columns = [f'{col}_pct' for col in original_one_hot_cols]

# Create user vector
user_vector = pd.concat([user_genre_avg_rating, user_pct_genre], axis=1)
user_vector.insert(0,'userId',user_ratings_info['userId'])
user_vector.insert(1,'rating_count',user_ratings_info['rating_count'])
user_vector.insert(2,'average_rating',user_ratings_info['average_rating'])
user_vector.insert(3,'variability',user_ratings_info['variability'])
user_vector.fillna(0.0, inplace=True)
user_vector = user_vector.reset_index(drop=True)
user_vector

Unnamed: 0,userId,rating_count,average_rating,variability,Action_avg_rating,Adventure_avg_rating,Animation_avg_rating,Children_avg_rating,Comedy_avg_rating,Crime_avg_rating,...,Filmnoir_pct,Horror_pct,Imax_pct,Musical_pct,Mystery_pct,Romance_pct,Scifi_pct,Thriller_pct,War_pct,Western_pct
0,1,9,3.888889,0.927961,3.500000,3.833333,4.000000,4.000000,3.916667,3.500000,...,0.000000,0.000000,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.000000,0.000000
1,2,24,3.833333,1.578846,4.318182,4.500000,4.250000,4.250000,3.200000,3.166667,...,0.000000,0.000000,0.166667,0.041667,0.000000,0.166667,0.125000,0.208333,0.166667,0.000000
2,3,364,3.659341,0.538486,3.607656,3.664062,3.800000,3.742424,3.457143,3.727273,...,0.002747,0.065934,0.206044,0.010989,0.087912,0.090659,0.343407,0.370879,0.038462,0.013736
3,4,134,3.223881,1.189818,3.023529,2.828125,3.184211,3.027778,3.609756,3.860000,...,0.000000,0.044776,0.276119,0.014925,0.097015,0.037313,0.335821,0.231343,0.052239,0.014925
4,10,6,2.833333,1.125463,2.750000,0.000000,0.000000,0.000000,1.000000,2.500000,...,0.000000,0.500000,0.000000,0.000000,0.166667,0.333333,0.333333,0.833333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97189,162536,49,3.806122,0.727555,3.300000,3.272727,0.000000,0.000000,4.000000,3.933333,...,0.020408,0.040816,0.224490,0.000000,0.081633,0.102041,0.326531,0.265306,0.061224,0.020408
97190,162537,1,2.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
97191,162538,62,3.314516,1.379694,3.375000,3.133333,3.375000,3.300000,2.953125,4.166667,...,0.000000,0.016129,0.129032,0.000000,0.048387,0.338710,0.129032,0.161290,0.064516,0.016129
97192,162540,55,4.136364,0.910100,3.973684,4.166667,4.187500,4.230769,4.100000,4.214286,...,0.000000,0.109091,0.145455,0.036364,0.090909,0.272727,0.145455,0.236364,0.072727,0.000000


In [9]:
## 9. Create user_data, which is input to the Neural Network
user_data = user_vector.loc[user_vector.index.repeat(user_vector['rating_count'])]
user_data.drop(columns=['userId'],inplace=True)
user_data

Unnamed: 0,rating_count,average_rating,variability,Action_avg_rating,Adventure_avg_rating,Animation_avg_rating,Children_avg_rating,Comedy_avg_rating,Crime_avg_rating,Documentary_avg_rating,...,Filmnoir_pct,Horror_pct,Imax_pct,Musical_pct,Mystery_pct,Romance_pct,Scifi_pct,Thriller_pct,War_pct,Western_pct
0,9,3.888889,0.927961,3.500000,3.833333,4.0,4.0,3.916667,3.500000,0.0,...,0.0,0.0,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.0,0.0
0,9,3.888889,0.927961,3.500000,3.833333,4.0,4.0,3.916667,3.500000,0.0,...,0.0,0.0,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.0,0.0
0,9,3.888889,0.927961,3.500000,3.833333,4.0,4.0,3.916667,3.500000,0.0,...,0.0,0.0,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.0,0.0
0,9,3.888889,0.927961,3.500000,3.833333,4.0,4.0,3.916667,3.500000,0.0,...,0.0,0.0,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.0,0.0
0,9,3.888889,0.927961,3.500000,3.833333,4.0,4.0,3.916667,3.500000,0.0,...,0.0,0.0,0.000000,0.111111,0.111111,0.555556,0.222222,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97193,17,3.735294,1.105634,3.714286,3.916667,4.0,3.8,2.916667,3.666667,0.0,...,0.0,0.0,0.235294,0.000000,0.000000,0.235294,0.176471,0.176471,0.0,0.0
97193,17,3.735294,1.105634,3.714286,3.916667,4.0,3.8,2.916667,3.666667,0.0,...,0.0,0.0,0.235294,0.000000,0.000000,0.235294,0.176471,0.176471,0.0,0.0
97193,17,3.735294,1.105634,3.714286,3.916667,4.0,3.8,2.916667,3.666667,0.0,...,0.0,0.0,0.235294,0.000000,0.000000,0.235294,0.176471,0.176471,0.0,0.0
97193,17,3.735294,1.105634,3.714286,3.916667,4.0,3.8,2.916667,3.666667,0.0,...,0.0,0.0,0.235294,0.000000,0.000000,0.235294,0.176471,0.176471,0.0,0.0


In [10]:
## 10.Create item vector

# Calculate movies' average ratings
item_vector = ratings_df.groupby('movieId')['rating'].agg(rating_count='count', avg_rating='mean').reset_index(drop=False)

# Create movie genres one-hot encoding, count genres for each movie
tmp = pd.merge(movies_df, tmdb_df, on='movieId')
genres_one_hot             = movies_df['genres'].str.get_dummies('|').reset_index(drop=True)
item_vector['genre_count'] = genres_one_hot.sum(axis=1)
item_vector['avg_rating'] = item_vector['avg_rating'].astype('float32')

# Create budget, revenue, budget/revenue columns
item_vector['budget']         = tmp[['budget']].astype(int) #/ tmp[['budget']].max()
item_vector['revenue']        = tmp[['revenue']].astype(int) #/ tmp[['revenue']].max()
item_vector['budget/revenue'] = tmp['budget'].div(tmp['revenue']).astype('float32')

# Calculate movie popularity among users
item_vector['popularity'] = (item_vector['rating_count'] / ratings_df.shape[0] ).astype('float32')

# Include columns for oscar nominations(not wins) and wins for each movie
item_vector['title'] = movies_df['title'].reset_index(drop=True)
item_vector = pd.merge(item_vector, total_oscars_, on='title', how='left')
item_vector['oscar_noms'] = item_vector['oscar_noms'].apply(replace_NaN).astype(int)
item_vector['oscar_wins'] = item_vector['oscar_wins'].apply(replace_NaN).astype(int)

# Add dummy variables if movie has been nominated/won oscar for best picture
item_vector = pd.merge(item_vector, best_picture_, on='title', how='left')
item_vector['best_picture_nominee'] = item_vector['best_picture_nominee'].apply(replace_NaN).astype(int)
item_vector['best_picture_winner'] = item_vector['best_picture_winner'].apply(replace_NaN).astype(int)
item_vector.drop(columns=['title'], inplace=True)

# Include genres one-hot in the columns
item_vector = pd.concat([item_vector, genres_one_hot], axis=1)

# Include directors one-hot and cast-one-hot
def create_one_hot(col, threshold):
    one_hot = pd.get_dummies(col).astype(int)
    category_counts = col.value_counts()
    categories_to_group = category_counts[category_counts <= threshold].index        #Identify categories to group
    one_hot['other'] = one_hot[categories_to_group].sum(axis=1) #Create a new "Other" category
    one_hot = one_hot.drop(columns=categories_to_group) #Drop original columns that were grouped into "Other"
    one_hot.drop(columns=one_hot.columns[0], axis=1,  inplace=True) # drop first column to avoid dummy variable trap
    return one_hot
    
director_one_hot = create_one_hot(col=tmp['director'], threshold=6) #6
director_one_hot.columns = [f'director_{col}' for col in director_one_hot.columns]
item_vector = pd.concat([item_vector, director_one_hot], axis=1)

cast_one_hot = create_one_hot(col=tmp['cast'], threshold=8) #8
cast_one_hot.columns = [f'cast_{col}' for col in cast_one_hot.columns]
item_vector = pd.concat([item_vector, cast_one_hot], axis=1)
item_vector

Unnamed: 0,movieId,rating_count,avg_rating,genre_count,budget,revenue,budget/revenue,popularity,oscar_noms,oscar_wins,...,cast_Russell Crowe,cast_Sandra Bullock,cast_Seth Rogen,cast_Steve Carell,cast_Tom Cruise,cast_Tom Hanks,cast_Vin Diesel,cast_Will Ferrell,cast_Will Smith,cast_other
0,6006,1738,2.660529,2,18000000,101564935,0.177227,3.361026e-04,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6012,482,2.823652,2,20000000,17432163,1.147305,9.321143e-05,0,0,...,0,0,0,0,0,0,0,0,0,1
2,6058,2665,2.897749,2,26000000,90426405,0.287527,5.153702e-04,0,0,...,0,0,0,0,0,0,0,0,0,1
3,6155,5938,3.181290,2,50000000,177371441,0.281894,1.148318e-03,0,0,...,0,0,0,0,0,0,0,0,0,1
4,6156,3657,3.075062,3,50000000,88323487,0.566101,7.072078e-04,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1883,163056,153,3.372549,4,15000000,77000000,0.194805,2.958786e-05,0,0,...,0,0,0,0,0,0,0,0,0,1
1884,163599,1,3.500000,2,7500000,3347647,2.240380,1.933847e-07,0,0,...,0,0,0,0,0,0,0,0,0,1
1885,163601,1,3.500000,1,6500000,2025032,3.209826,1.933847e-07,0,0,...,0,0,0,0,0,0,0,0,0,1
1886,165035,2,2.500000,1,200000,2295733,0.087118,3.867694e-07,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
## 11.Create item_data, which is input to the Neural Network

item_data = pd.merge(ratings_df, item_vector, on='movieId',how='left')
item_data.drop(columns=['userId','movieId','rating'],inplace=True)
item_data

Unnamed: 0,rating_count,avg_rating,genre_count,budget,revenue,budget/revenue,popularity,oscar_noms,oscar_wins,best_picture_nominee,...,cast_Russell Crowe,cast_Sandra Bullock,cast_Seth Rogen,cast_Steve Carell,cast_Tom Cruise,cast_Tom Hanks,cast_Vin Diesel,cast_Will Ferrell,cast_Will Smith,cast_other
0,34712,3.833977,4,94000000,940335536,0.099964,0.006713,4,1,0,...,0,0,0,0,0,0,0,0,0,1
1,37227,3.789212,4,140000000,655011224,0.213737,0.007199,5,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20288,3.793942,3,4000000,119723856,0.033410,0.003923,4,1,1,...,0,0,0,0,0,0,0,0,0,1
3,1202,3.834027,5,8000000,3432342,2.330770,0.000232,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4330,3.144457,1,30000000,611899420,0.049028,0.000837,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171035,19157,3.811140,3,150000000,623722818,0.240491,0.003705,5,1,0,...,0,0,0,0,0,0,0,0,0,1
5171036,2100,2.776667,2,150000000,287594577,0.521568,0.000406,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5171037,1154,2.316725,2,60000000,361366633,0.166036,0.000223,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5171038,41519,4.166538,4,185000000,1004558444,0.184161,0.008029,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
## 12.Create true ratings numpy array

y_truth = ratings_df['rating'].to_numpy()
y_truth.shape

(5171040,)

In [13]:
## 13.Save preprocessing data

np.save('data/user_data.npy',user_data)
np.save('data/item_data.npy',item_data)
np.save('data/rating_data.npy',y_truth)

MODEL_PATH = 'api_calls/'
item_vector.to_csv(MODEL_PATH + 'item_vector.csv', index=False)
user_vector.to_csv(MODEL_PATH + 'user_vector.csv', index=False)
genres_one_hot.to_csv(MODEL_PATH + 'genres_one_hot.csv', index=False)

movies_df = pd.merge(movies_df, links_df, on='movieId', how='left')
movies_df['tmdbId'] = movies_df['tmdbId'].astype(int)
movies_df.to_csv(MODEL_PATH + 'filtered_movies.csv', index=False)