In [21]:
import pandas as pd
import numpy as np

MIN_N_RATINGS = 5000
DATA_COLLECTION_N = 30
RECOMMENDATION_N = 30
#  training users (75%) are ids 1 to 103869 and testing users (25%) are ids 103870 or greater
FIRST_TESTING_USERID = 103870



In [22]:
# load data with 27k movies and 20M ratings
Movies = pd.read_csv("C:/Users/easht/Documents/CS 673/Movie Database/movie.csv")
Ratings = pd.read_csv("C:/Users/easht/Documents/CS 673/Movie Database/rating.csv")

In [23]:
# merge data to one row per movie/rating
MovieRatings = Movies.merge(Ratings)


In [24]:
# roll-up ratings by movie to determine each movies, average rating and numebr of ratings
MovieRatingsSummary = MovieRatings.groupby(['movieId','title']).agg({'rating': ['count','mean']})
MovieRatingsSummary = MovieRatingsSummary.reset_index()
MovieRatingsSummary.columns = MovieRatingsSummary.columns.map(''.join)
MovieRatingsSummary = MovieRatingsSummary.rename(columns={'ratingcount':'ratingCount', 'ratingmean':'ratingMean' })


In [25]:
# limit possible recommendations to movies to about 1000 movies with >= 5000 ratings each
MoviesGtMinRatings = MovieRatingsSummary.loc[MovieRatingsSummary['ratingCount'] >= MIN_N_RATINGS]
MoviesGtMinRatings = MoviesGtMinRatings.sort_values(by = ['ratingCount'], ascending= False)


In [26]:
# Take out the 30 most-rated movies that will be used for data collection and therefore are 
# not avaialbe to recommend
DataCollectionMovies = MoviesGtMinRatings[:DATA_COLLECTION_N]
DataCollectionMovies

Unnamed: 0,movieId,title,ratingCount,ratingMean
293,296,Pulp Fiction (1994),67310,4.174231
352,356,Forrest Gump (1994),66172,4.029
315,318,"Shawshank Redemption, The (1994)",63366,4.44699
587,593,"Silence of the Lambs, The (1991)",63299,4.177057
476,480,Jurassic Park (1993),59715,3.664741
257,260,Star Wars: Episode IV - A New Hope (1977),54502,4.190672
108,110,Braveheart (1995),53769,4.042534
583,589,Terminator 2: Judgment Day (1991),52244,3.931954
2486,2571,"Matrix, The (1999)",51334,4.187186
523,527,Schindler's List (1993),50054,4.310175


In [27]:
#Remove the data collection moves from the movies with sufficent ratings to get the possible rec list
PossibleRecMovies = MoviesGtMinRatings[DATA_COLLECTION_N + 1:]

#Pick the 30 highest-rating movies to become the control model's recommended movies for everyone
PossibleRecMovies = PossibleRecMovies.sort_values(by= 'ratingMean', ascending = False)
ControlRecMovies = PossibleRecMovies[:RECOMMENDATION_N]
controlRecMovieIds = sorted(ControlRecMovies['movieId'].tolist())
ControlRecMovies

Unnamed: 0,movieId,title,ratingCount,ratingMean
1195,1221,"Godfather: Part II, The (1974)",27398,4.275641
1935,2019,Seven Samurai (Shichinin no samurai) (1954),11611,4.27418
887,904,Rear Window (1954),17449,4.271334
895,912,Casablanca (1942),24349,4.258327
905,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),6525,4.256935
1169,1193,One Flew Over the Cuckoo's Nest (1975),29932,4.248079
737,750,Dr. Strangelove or: How I Learned to Stop Worr...,23220,4.247287
1186,1212,"Third Man, The (1949)",6565,4.246002
5917,6016,City of God (Cidade de Deus) (2002),12937,4.23541
10901,44555,"Lives of Others, The (Das leben der Anderen) (...",5720,4.23479


In [28]:
#limit the users to those in the testing pool only
RatingsTestingUsers = Ratings.loc[Ratings['userId'] >= FIRST_TESTING_USERID]
#isolte the ratings for movies by the testing pool that the control model recommended
RatingsRecMovies = RatingsTestingUsers.loc[RatingsTestingUsers['movieId'].isin(controlRecMovieIds)]
RatingsRecMovies.describe()

Unnamed: 0,userId,movieId,rating
count,125333.0,125333.0,125333.0
mean,121177.307493,4585.81818,4.209243
std,9973.381378,12033.56415,0.826843
min,103870.0,745.0,0.5
25%,112601.0,1136.0,4.0
50%,121140.0,1204.0,4.5
75%,129866.0,2019.0,5.0
max,138493.0,58559.0,5.0


In [29]:
#group the ratings by user and re-flatten the column names
UserPerformacneSummary = RatingsRecMovies.groupby(['userId']).agg({'rating': ['count','mean']})
UserPerformacneSummary = UserPerformacneSummary.reset_index()
UserPerformacneSummary.columns = UserPerformacneSummary.columns.map(''.join)
UserPerformacneSummary.describe()

Unnamed: 0,userId,ratingcount,ratingmean
count,22584.0,22584.0,22584.0
mean,121179.456474,5.549637,4.198132
std,9959.398717,4.990113,0.655306
min,103870.0,1.0,0.5
25%,112626.75,2.0,4.0
50%,121125.5,4.0,4.278889
75%,129803.25,8.0,4.625
max,138493.0,30.0,5.0


In [30]:
#summarize results
total_users = Ratings['userId'].max()
total_testing_users = total_users - FIRST_TESTING_USERID + 1
users_with_performance_scores = UserPerformacneSummary['userId'].count()
possible_rec_movie_n = PossibleRecMovies['title'].count()
n_recs_measurable = RatingsRecMovies['rating'].count()
avg_rec_rating = RatingsRecMovies['rating'].mean()
q1_rec_rating = RatingsRecMovies['rating'].quantile(0.25)
q3_rec_rating = RatingsRecMovies['rating'].quantile(0.75)
avg_all_rating = Ratings['rating'].mean()
q1_all_rating = Ratings['rating'].quantile(0.25)
q3_all_rating = Ratings['rating'].quantile(0.75)
users_with_measured_recs = UserPerformacneSummary['userId'].count()
mean_recs_per_user = UserPerformacneSummary['ratingcount'].mean()
q1_recs_per_user = UserPerformacneSummary['ratingcount'].quantile(0.25)
q3_recs_per_user = UserPerformacneSummary['ratingcount'].quantile(0.75)
print('Out of ' + str(total_users) + ' total users ' + str(total_testing_users) + ' were selected to test the model.')
print('Each user was recommended ' + str(RECOMMENDATION_N) + ' movies. Only movies with at least ' + str(MIN_N_RATINGS) +
    ' ratings were eligible to be recommended.')
print('The ' + str(DATA_COLLECTION_N) + ' movies that were rated most often are also not \
eligible since it is assumed we will use those for data collection.')
print('Out of the remaining ' + str(possible_rec_movie_n) + ' movies, the control model recommends the \
30 movies with the higest average rating to each user.')
print('Out of the ' + str(total_users * RECOMMENDATION_N) + ' recs made, ' + str(n_recs_measurable) + ' have ratings we can use\
 to test the control model\'s performance.')
print('The control model average rec rating is {:.2f} out of 5.00 with most falling between {:.2f} and {:.2f}'.format(avg_rec_rating, q1_rec_rating, q3_rec_rating ))
print('This is higher than the original data set where the average rating is {:.2f} with most falling between {:.2f} and {:.2f}'.format(avg_all_rating, q1_all_rating, q3_all_rating))


Out of 138493 total users 34624 were selected to test the model.
Each user was recommended 30 movies. Only movies with at least 5000 ratings were eligible to be recommended.
The 30 movies that were rated most often are also not eligible since it is assumed we will use those for data collection.
Out of the remaining 974 movies, the control model recommends the 30 movies with the higest average rating to each user.
Out of the 4154790 recs made, 125333 have ratings we can use to test the control model's performance.
The control model average rec rating is 4.21 out of 5.00 with most falling between 4.00 and 5.00
This is higher than the original data set where the average rating is 3.53 with most falling between 3.00 and 4.00
