In [1]:
# Initialization
import os
import time

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# spark imports
from pyspark.sql import SparkSession

# data science imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import unicodedata

In [2]:
# Base HDFS directory: the input files are read from here and the output
# parquet datasets are written back under the same prefix.
data_path = 'hdfs:///user/andrew/'

In [8]:
# Read the raw inputs through Spark (the data is stored in Hadoop), format the columns,
# and convert each dataset to a pandas dataframe for easier and faster manipulation.
# NOTE(review): the two star imports are kept because other cells may rely on names they
# inject; be aware that pyspark.sql.functions can shadow builtins (sum, min, max, ...).
from pyspark.sql.types import *
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import *

# Use the SparkContext injected by the kernel when there is one; otherwise obtain it from
# the (already imported) SparkSession so the cell also runs on a fresh plain-Python kernel.
try:
    sc
except NameError:
    sc = SparkSession.builder.getOrCreate().sparkContext
sqlContext = SQLContext(sc)

Rating = Row("user_id", "item_id", "label")  # Ignore timestamp
User = Row("user_id", "gender", "age_group", "occupation")

# --- movies: one-hot encoded metadata, already stored as parquet ---
movies_metadata = sqlContext.read.parquet(data_path + 'movie_metadata_OHE')
movies_df = movies_metadata.toPandas()
# Prefix the ids so they are unambiguous strings when used as row/column labels.
movies_df['item_id'] = 'item_id_' + movies_df.item_id.astype(str)
# Index by item_id (keeping the column) so later matrix products align on the label
# rather than on row position.
movies_df = movies_df.set_index('item_id', drop=False)

# --- users: "::"-separated text; only the first four fields are used ---
users_rdd = sc.textFile(data_path + "users.dat")\
    .map(lambda line: line.split("::")[0:4])\
    .map(lambda f: User(int(f[0]), f[1], int(f[2]), int(f[3])))
users = sqlContext.createDataFrame(users_rdd)
users_df = users.toPandas()
users_df['user_id'] = 'user_id_' + users_df.user_id.astype(str)

# --- ratings: "::"-separated text; the first three fields are all integers ---
ratings_rdd = sc.textFile(data_path + "ratings.dat")\
    .map(lambda line: line.split("::")[0:3])\
    .map(lambda fields: Rating(*[int(value) for value in fields]))
ratings = sqlContext.createDataFrame(ratings_rdd)
ratings_df = ratings.toPandas()
ratings_df['user_id'] = 'user_id_' + ratings_df.user_id.astype(str)
ratings_df['item_id'] = 'item_id_' + ratings_df.item_id.astype(str)

In [9]:
# Preview the first five rows of the movie metadata.
movies_df.head()

Unnamed: 0_level_0,item_id,title,imdb_id,imdb_rating,imdb_votes,metascore,runtime,year,ml_genre_Action,ml_genre_Adventure,...,MPAA_rating_PG,MPAA_rating_PG13,MPAA_rating_R,MPAA_rating_TV14,MPAA_rating_TVG,MPAA_rating_TVMA,MPAA_rating_TVPG,MPAA_rating_Unrated,MPAA_rating_X,MPAA_rating_Other
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
item_id_3699,item_id_3699,Starman,tt0088172,7.0,38393,71,115,1984,0,1,...,1,0,0,0,0,0,0,0,0,0
item_id_3700,item_id_3700,"Brother from Another Planet, The",tt0087004,6.8,5050,0,108,1984,0,0,...,0,0,1,0,0,0,0,0,0,0
item_id_3701,item_id_3701,Alien Nation,tt0094631,6.3,12867,45,91,1988,0,0,...,0,0,1,0,0,0,0,0,0,0
item_id_3702,item_id_3702,Mad Max,tt0079501,7.0,173984,73,88,1979,1,0,...,0,0,1,0,0,0,0,0,0,0
item_id_3703,item_id_3703,Mad Max 2 (a.k.a. The Road Warrior),tt0082694,7.6,152866,77,94,1981,1,0,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Preview the first five rows of the user table.
users_df.head()

Unnamed: 0,user_id,gender,age_group,occupation
0,user_id_1,F,1,10
1,user_id_2,M,56,16
2,user_id_3,M,25,15
3,user_id_4,M,45,7
4,user_id_5,M,25,20


In [11]:
# Preview the first five rows of the ratings table.
ratings_df.head()

Unnamed: 0,user_id,item_id,label
0,user_id_1,item_id_1193,5
1,user_id_1,item_id_661,3
2,user_id_1,item_id_914,3
3,user_id_1,item_id_3408,4
4,user_id_1,item_id_2355,5


Find the unique set of movies that have ratings in ratings_df. Remove all movies without ratings from movies_df.
There are 3883 movies total, and only 3706 movies with ratings.

In [17]:
# Unique movies that received at least one rating (computed once and reused below).
distinct_rated_movies = ratings_df.item_id.unique()
print(len(distinct_rated_movies))
# 3706 < 3883, so some movies do not have any ratings; these movies can be removed
# for genre rating averaging.
# Keep only movies that have at least one user rating. `isin` does a hashed,
# vectorised membership test instead of scanning the id array once per movie.
movies_df_rated = movies_df.loc[movies_df.item_id.isin(distinct_rated_movies)]
print(movies_df_rated.shape)

3706
(3706, 10217)


Transform the ratings dataframe (user_id, item_id, label) into a user_id x item_id dataframe with the labels (ratings) as the values. Fill all missing values (items that a user has not rated) with 0s.

Then create the binarized version of this matrix, and again fill any missing values with 0s. This dataframe will have values of 1 where a rating exists, and 0 otherwise.

In [18]:
# Transform the ratings dataframe into a user_id x item_id dataframe with the label
# (rating) as the values; user/item pairs without a rating become 0.
ratings_spread = ratings_df.pivot(index='user_id', columns='item_id', values='label').fillna(0)
# Binary companion matrix: 1.0 where a rating exists (label > 0) and 0.0 otherwise.
# An explicit comparison states the rule directly instead of relying on 0/0 -> NaN -> 0.
ratings_spread_binary = (ratings_spread > 0).astype(float)

print('User Movie Ratings')
ratings_spread.head()

User Movie Ratings


item_id,item_id_1,item_id_10,item_id_100,item_id_1000,item_id_1002,item_id_1003,item_id_1004,item_id_1005,item_id_1006,item_id_1007,...,item_id_99,item_id_990,item_id_991,item_id_992,item_id_993,item_id_994,item_id_996,item_id_997,item_id_998,item_id_999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_id_1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_10,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_1000,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_1001,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Movie Genre Rating Summary

In [20]:
# Restrict the rated movies to their genre indicator columns
# (MovieLens "ml_genre..." and IMDb "imdb_genre...").
genre_cols = [col for col in movies_df_rated.columns
              if col.startswith(('ml_genre', 'imdb_genre'))]
movie_genres = movies_df_rated.loc[:, genre_cols]
# (users x items) ratings . (items x genres) indicators
#   -> for each user, the sum of the ratings given within each genre
user_genre_total = ratings_spread.dot(movie_genres)
# (users x items) binary ratings . (items x genres) indicators
#   -> for each user, the number of rated movies within each genre
user_genre_count = ratings_spread_binary.dot(movie_genres)

In [22]:
# Preview the first five rows of the binary "has rated" matrix.
ratings_spread_binary.head(5)

item_id,item_id_1,item_id_10,item_id_100,item_id_1000,item_id_1002,item_id_1003,item_id_1004,item_id_1005,item_id_1006,item_id_1007,...,item_id_99,item_id_990,item_id_991,item_id_992,item_id_993,item_id_994,item_id_996,item_id_997,item_id_998,item_id_999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_id_1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_1000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_id_1001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Average rating per user and genre: element-wise (sum of ratings) / (number of ratings).
# A genre in which a user rated nothing is 0 / 0 and therefore stays NaN.
user_genre_avg = (
    user_genre_total
    .div(user_genre_count)
    .add_suffix('_avg_rating')
    .reset_index()
)
# Start the user profiles by attaching the genre averages to the user attributes.
users_metadata = users_df.merge(user_genre_avg, on='user_id')

print('User Metadata with Genre Averages')
users_metadata.head()

In [None]:
# Drop the genre intermediates; they are not used again after this point.
del (movie_genres, user_genre_total,
     user_genre_count, user_genre_avg)

## Actor and Director Ratings Summary

In [None]:
# Restrict the rated movies to their actor and director indicator columns.
actor_cols = [col for col in movies_df_rated.columns if col.startswith('actor_')]
director_cols = [col for col in movies_df_rated.columns if col.startswith('director_')]
people_cols = actor_cols + director_cols
movie_people = movies_df_rated.loc[:, people_cols]
# (users x items) binary ratings . (items x people) indicators
#   -> for each user, the number of rated movies featuring each actor / director
user_movie_people_count = ratings_spread_binary.dot(movie_people)

In [None]:
# Sanity-check fixture: the ids of all rated movies directed by Alfred Hitchcock
# (the same check can be repeated with any other director column).
hitchcock_movies = list(
    movies_df_rated.loc[movies_df_rated.director_Alfred_Hitchcock == 1, 'item_id']
)
print(hitchcock_movies)

In [None]:
# Determine how many ratings were made for Hitchcock movies.
# `isin` replaces a Python-level scan of the movie list for every rating row.
ratings_df_temp = ratings_df[ratings_df.item_id.isin(hitchcock_movies)]
# print() with a single string prints the same text on Python 2 and Python 3.
print('number of ratings: ' + str(len(ratings_df_temp)))

In [None]:
# Make sure that the ratings were spread correctly and that the number of ratings
# remains constant: count the 1s in the Hitchcock columns of the binary matrix.
# print() with a single string prints the same text on Python 2 and Python 3.
print('number of ratings: ' + str(ratings_spread_binary[hitchcock_movies].values.sum()))

In [None]:
# Confirm that the dot product maintained a correct number of ratings: the per-user
# Hitchcock counts must add up to the same total as the checks above.
# print() with a single string prints the same text on Python 2 and Python 3.
print('number of ratings: ' + str(user_movie_people_count.director_Alfred_Hitchcock.sum()))

In [None]:
# Views per actor: number of (user, rated movie) pairs in which the actor appears,
# obtained by summing the per-user counts over all users.
actor_views = user_movie_people_count[actor_cols].sum(axis=0)
# This total will be much greater than the total number of ratings because an individual movie has multiple actors
# (print() with a single string prints the same text on Python 2 and Python 3).
print('total actor views:  ' + str(actor_views.sum()))
# Views per director, computed the same way.
director_views = user_movie_people_count[director_cols].sum(axis=0)
# This total may be greater than the total number of ratings if one movie had multiple directors
print('total director views:  ' + str(director_views.sum()))

In [None]:
# Drop the full people matrices and the Hitchcock check fixtures before the next stage.
del (movie_people, user_movie_people_count,
     hitchcock_movies, ratings_df_temp)

In [None]:
# Keep the 200 most-viewed actors (of ~7800), ranked by number of views.
top_200_actors = actor_views.sort_values(ascending=False).iloc[:200]
# The index holds the matching actor_* column names.
top_200_actors_names = top_200_actors.index
top_200_actors

In [None]:
# Keep the 50 most-viewed directors (of ~2200), ranked by number of views.
top_50_directors = director_views.sort_values(ascending=False).iloc[:50]
# The index holds the matching director_* column names.
top_50_directors_names = top_50_directors.index
top_50_directors

In [None]:
# Restrict the rated movies to the indicator columns of the top 200 actors and
# top 50 directors only; everyone else is dropped.
movie_people = movies_df_rated.loc[
    :, list(top_200_actors_names) + list(top_50_directors_names)
]
# (users x items) ratings . (items x people) indicators
#   -> for each user, the sum of ratings given to movies of each actor / director
user_movie_people_total = ratings_spread.dot(movie_people)
# (users x items) binary ratings . (items x people) indicators
#   -> for each user, the number of rated movies of each actor / director
user_movie_people_count = ratings_spread_binary.dot(movie_people)

In [None]:
# Average rating per user and actor / director: element-wise (sum of ratings) / (count).
# A person none of whose movies the user rated is 0 / 0 and therefore stays NaN.
user_movie_people_avg_ratings = (
    user_movie_people_total
    .div(user_movie_people_count)
    .add_suffix('_avg_rating')
    .reset_index()
)

In [None]:
# Extend the user profiles with the actor / director averages.
users_metadata = users_metadata.merge(user_movie_people_avg_ratings, on='user_id')
# One-hot encode the two categorical user attributes.
gender_OHE = pd.get_dummies(users_metadata['gender'], prefix='gender')
age_group_OHE = pd.get_dummies(users_metadata['age_group'], prefix='age_group')
# Append the indicator columns, then remove the raw attribute columns.
# NOTE(review): 'occupation' is removed without being encoded - confirm this is intended.
users_metadata = pd.concat(
    [users_metadata, gender_OHE, age_group_OHE], axis=1, sort=False
)
users_metadata = users_metadata.drop(['gender', 'age_group', 'occupation'], axis=1)
users_metadata.head()

In [None]:
# Drop every large pandas intermediate; only the user profiles, the selected
# column-name lists and the Spark objects are used from here on.
del (
    movie_people, user_movie_people_total, user_movie_people_count,
    user_movie_people_avg_ratings, movies_df_rated, ratings_spread, ratings_spread_binary,
    top_200_actors, top_50_directors, ratings_df, actor_views,
    director_views, movies_df, users_df,
)

## Convert to Spark Dataframe and Save

In [None]:
# Strip the 'user_id_' prefix so that user_id is an integer again.
users_metadata['user_id'] = users_metadata['user_id'].map(
    lambda uid: int(uid[len('user_id_'):])
)
# Order the columns alphabetically; this keeps the many one-hot / average columns
# in a predictable order for later use.
sorted_columns = sorted(users_metadata.columns)
users_metadata = users_metadata.loc[:, sorted_columns]
# Hand the profiles back to Spark and write them as parquet, replacing any previous output.
users_metadata_spark = sqlContext.createDataFrame(users_metadata)
(users_metadata_spark.write
    .format('parquet')
    .mode('overwrite')
    .save(data_path + 'users_metadata'))

In [None]:
# Movie profile columns to keep: the descriptive fields plus the genre indicators and
# the indicators of the top 50 directors / top 200 actors selected earlier.
movie_cols_to_keep = ['item_id', 'title', 'imdb_id', 'imdb_rating',
                      'imdb_votes', 'metascore', 'runtime', 'year'] +\
        genre_cols + list(top_50_directors_names) + list(top_200_actors_names)
# Select from the Spark dataframe loaded at the top of the notebook. The previous code
# referenced `movies`, a name that is never defined and raises NameError on a clean run.
movie_metadata_ohe_subset = movies_metadata.select(*movie_cols_to_keep)
# Write as parquet, replacing any previous output.
movie_metadata_ohe_subset.write.format('parquet').mode('overwrite').save(data_path + 'movie_metadata_OHE_subset')

In [None]:
# Keep only the movie id plus the genre and top director / actor indicator columns,
# and save them out for use later.
genre_and_people_cols = ['item_id'] + genre_cols + list(top_50_directors_names) + list(top_200_actors_names)
# Select from the Spark dataframe loaded at the top of the notebook. The previous code
# referenced `movies`, a name that is never defined and raises NameError on a clean run.
movie_genre_and_people_metadata_ohe_subset = movies_metadata.select(*genre_and_people_cols)
# Write as parquet, replacing any previous output.
movie_genre_and_people_metadata_ohe_subset.write.format('parquet').mode('overwrite'). \
        save(data_path + 'movie_genre_and_people_metadata_ohe_subset')