In [0]:
display(dbutils.fs.ls("dbfs:/FileStore/tables"))

path,name,size
dbfs:/FileStore/tables/genome_scores.csv,genome_scores.csv,435164157
dbfs:/FileStore/tables/genome_tags.csv,genome_tags.csv,18103
dbfs:/FileStore/tables/kjvdat.txt,kjvdat.txt,4521345
dbfs:/FileStore/tables/links.csv,links.csv,1368578
dbfs:/FileStore/tables/movies.csv,movies.csv,3038099
dbfs:/FileStore/tables/ratings.csv,ratings.csv,678260987
dbfs:/FileStore/tables/tags.csv,tags.csv,38810332


Recommendations:
  1. Recently Watched
  2. Highest genre rated per user
  3. Latest Trending

In [0]:
import datetime
import pyspark.sql.functions as f
import pyspark.sql.types
import pandas as pd

from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql import Window
from pyspark.sql.functions import rank, min

In [0]:
file_location = "dbfs:/FileStore/tables/movies.csv"

file_type = "csv"

infer_schema = True
first_row_is_header = True
delimiter = ","

df_movies = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df_movies)

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [0]:
links="/FileStore/tables/links.csv"
df_links = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .load(links)
    
display(df_links)

movieId,imdbId,tmdbId
1,114709,862.0
2,113497,8844.0
3,113228,15602.0
4,114885,31357.0
5,113041,11862.0
6,113277,949.0
7,114319,11860.0
8,112302,45325.0
9,114576,9091.0
10,113189,710.0


In [0]:
#Read data "tags"
tags="/FileStore/tables/tags.csv"
df_tags = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .load(tags)
    
display(df_tags)

userId,movieId,tag,timestamp
3,260,classic,1439472355
3,260,sci-fi,1439472256
4,1732,dark comedy,1573943598
4,1732,great dialogue,1573943604
4,7569,so bad it's good,1573943455
4,44665,unreliable narrators,1573943619
4,115569,tense,1573943077
4,115713,artificial intelligence,1573942979
4,115713,philosophical,1573943033
4,115713,tense,1573943042


In [0]:
#Read data "ratings"
ratings="/FileStore/tables/ratings.csv"
df_ratings = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .load(ratings)
    
display(df_ratings)

userId,movieId,rating,timestamp
1,296,5.0,1147880044
1,306,3.5,1147868817
1,307,5.0,1147868828
1,665,5.0,1147878820
1,899,3.5,1147868510
1,1088,4.0,1147868495
1,1175,3.5,1147868826
1,1217,3.5,1147878326
1,1237,5.0,1147868839
1,1250,4.0,1147868414


In [0]:
df_movies.count()

In [0]:
# Joining to get corresponding ratings for the movies, doing left join so that we won't lose any movies that don't have rating.
df_movies_with_ratings = df_movies.join(df_ratings, "movieId", "left")
display(df_movies_with_ratings)

movieId,title,genres,userId,rating,timestamp
31,Dangerous Minds (1995),Drama,12,2.0,940768545
31,Dangerous Minds (1995),Drama,24,4.0,1366520015
31,Dangerous Minds (1995),Drama,50,4.5,1402504725
31,Dangerous Minds (1995),Drama,61,4.0,986604671
31,Dangerous Minds (1995),Drama,80,3.0,993327037
31,Dangerous Minds (1995),Drama,106,4.0,828854636
31,Dangerous Minds (1995),Drama,165,5.0,846330065
31,Dangerous Minds (1995),Drama,166,4.0,951603835
31,Dangerous Minds (1995),Drama,199,4.0,832963045
31,Dangerous Minds (1995),Drama,217,2.5,1110244695


In [0]:
#check if there are multiple ratings for a movie
df_movies_chk_dup = df_movies_with_ratings.groupby("movieId").count()
display(df_movies_chk_dup)

movieId,count
31,9106
53,137
65,4663
78,1129
85,2441
108,115
133,20
137,206
148,335
155,1168


In [0]:
# Join tags df with previous created df
df_movies_with_ratings = df_movies_with_ratings.join(df_tags, "movieID", "inner")
display(df_movies_with_ratings)

movieId,title,genres,userId,rating,timestamp,userId.1,tag,timestamp.1
31,Dangerous Minds (1995),Drama,12,2.0,940768545,2403,inspirational,1368562859
31,Dangerous Minds (1995),Drama,12,2.0,940768545,5441,inspirational,1368436529
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6118,inspirational,1368933234
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,amusement park,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,high school,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,karate,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,naval officer,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,rap music,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,6550,teacher,1528560000
31,Dangerous Minds (1995),Drama,12,2.0,940768545,15200,inspirational,1449068115


In [0]:
# convert int to string
df_ratings = df_ratings.withColumn("tsDate", f.from_unixtime("timestamp"))

In [0]:
display(df_ratings)

userId,movieId,rating,timestamp,tsDate
1,296,5.0,1147880044,2006-05-17 15:34:04
1,306,3.5,1147868817,2006-05-17 12:26:57
1,307,5.0,1147868828,2006-05-17 12:27:08
1,665,5.0,1147878820,2006-05-17 15:13:40
1,899,3.5,1147868510,2006-05-17 12:21:50
1,1088,4.0,1147868495,2006-05-17 12:21:35
1,1175,3.5,1147868826,2006-05-17 12:27:06
1,1217,3.5,1147878326,2006-05-17 15:05:26
1,1237,5.0,1147868839,2006-05-17 12:27:19
1,1250,4.0,1147868414,2006-05-17 12:20:14


In [0]:
#string to date conversion
df_ratings1 = df_ratings.select('userId', 'movieId', 'rating', f.to_date(unix_timestamp('tsDate').cast("timestamp")).alias('rating_date'))
display(df_ratings1)

userId,movieId,rating,rating_date
1,296,5.0,2006-05-17
1,306,3.5,2006-05-17
1,307,5.0,2006-05-17
1,665,5.0,2006-05-17
1,899,3.5,2006-05-17
1,1088,4.0,2006-05-17
1,1175,3.5,2006-05-17
1,1217,3.5,2006-05-17
1,1237,5.0,2006-05-17
1,1250,4.0,2006-05-17


In [0]:
#  check for particular date how many ratings is given
df_ratings_year = df_ratings1.groupby('rating_date').count()
display(df_ratings_year)

rating_date,count
2006-05-17,2946
2015-05-19,4747
2002-03-02,1861
2013-01-22,1819
2013-09-09,2147
2018-08-10,3588
2014-11-12,1964
2017-09-11,3521
2017-08-11,3355
2014-09-26,929


In [0]:
# calculate avg rating for a particular movie
df_avg_ratings = df_ratings1.groupby("movieId").mean("rating")
display(df_avg_ratings)

movieId,avg(rating)
1088,3.25002094679514
1580,3.5817083457378187
3175,3.607783614161949
44022,3.2593627146699773
175197,2.754918032786885
1645,3.547347362181387
471,3.6579813752234034
3794,3.247051114023591
8638,3.9717508278145695
33722,3.5552486187845305


In [0]:
from pyspark.sql.functions import mean

df_avg_ratings = df_ratings1.groupby("movieId").agg(mean("rating").alias("Avg Rating"))
display(df_avg_ratings)

movieId,Avg Rating
1088,3.25002094679514
1580,3.5817083457378187
3175,3.607783614161949
44022,3.2593627146699773
175197,2.754918032786885
1645,3.547347362181387
471,3.6579813752234034
3794,3.247051114023591
8638,3.9717508278145695
33722,3.5552486187845305


In [0]:
# Join avg ratings df with movie df as we want to know movie title as well
df = df_avg_ratings.join(df_movies, "movieId", "inner")
display(df)

movieId,Avg Rating,title,genres
1088,3.25002094679514,Dirty Dancing (1987),Drama|Musical|Romance
1580,3.5817083457378187,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
3175,3.607783614161949,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
44022,3.2593627146699773,Ice Age 2: The Meltdown (2006),Adventure|Animation|Children|Comedy
175197,2.754918032786885,The Dark Tower (2017),Fantasy|Horror|Sci-Fi|Western
1645,3.547347362181387,The Devil's Advocate (1997),Drama|Mystery|Thriller
471,3.6579813752234034,"Hudsucker Proxy, The (1994)",Comedy
3794,3.247051114023591,Chuck & Buck (2000),Comedy|Drama
8638,3.9717508278145695,Before Sunset (2004),Drama|Romance
33722,3.5552486187845305,Ladies in Lavender (2004),Comedy|Drama|Romance


In [0]:
# we are checking how many times a particular movie has been rated
df_total_rating = df_ratings1.groupby("movieId").count()
display(df_total_rating)

movieId,count
1088,11935
1580,40308
3175,14659
44022,4833
175197,610
1645,13496
471,10631
3794,763
8638,4832
33722,181


In [0]:
# filter out movie which has no of ratings less than 5
df_total_rating = df_total_rating.filter("count > 5 ")
df_ratings_filtered = df_total_rating.join(df_ratings1, "movieId", "inner")

In [0]:
# if a particular user has rated same movie more than once, then take the maximum of those ratings
# Join with movies df so that we associate movie id with movie title
df_rating_per_user = df_ratings_filtered.select('userId', 'movieId', 'rating').groupby("userId", "movieId").max("rating")
df_rating_per_user_movie = df_rating_per_user.join(df_movies, "movieId", "inner")

In [0]:
df_rating_per_user_movie = df_rating_per_user_movie.withColumnRenamed("max(rating)", "max_rating")
display(df_rating_per_user_movie)

movieId,userId,max_rating,title,genres
108932,20957,3.0,The Lego Movie (2014),Action|Adventure|Animation|Children|Comedy|Fantasy
3377,20958,3.5,Hangmen Also Die! (1943),Drama|War
6934,20958,3.0,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
879,20959,4.0,"Relic, The (1997)",Horror|Thriller
2655,20959,0.5,Howling II: Your Sister Is a Werewolf (1985),Horror
5433,20959,2.0,Silver Bullet (Stephen King's Silver Bullet) (1985),Adventure|Drama|Horror|Mystery|Thriller
5697,20959,3.0,Terror Train (1980),Horror
6500,20959,3.0,"Satanic Rites of Dracula, The (1974)",Horror
8501,20959,3.0,"Hitcher, The (1986)",Action|Thriller
8665,20959,4.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller


In [0]:
df_rating = df_rating_per_user_movie.filter("max_rating >= 4")
display(df_rating)

movieId,userId,max_rating,title,genres
879,20959,4.0,"Relic, The (1997)",Horror|Thriller
8665,20959,4.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller
8665,20966,4.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller
6934,20969,5.0,"Matrix Revolutions, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX
879,20987,4.0,"Relic, The (1997)",Horror|Thriller
8665,20989,4.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller
104241,20992,4.5,Kick-Ass 2 (2013),Action|Comedy|Crime
108932,20992,4.5,The Lego Movie (2014),Action|Adventure|Animation|Children|Comedy|Fantasy
7324,20994,4.0,Hidalgo (2004),Adventure|Drama
8665,21002,4.0,"Bourne Supremacy, The (2004)",Action|Crime|Thriller


In [0]:
#Identify best movies per genre
df_movie_per_genre = df_rating.groupby("genres", "title").count()
display(df_movie_per_genre)

genres,title,count
Comedy,Ladybugs (1992),11
Drama,Margot at the Wedding (2007),78
Drama|Romance,"Reader, The (2008)",1437
Comedy|Drama,American Splendor (2003),2299
Drama|Romance,Two Days (2011),6
Horror,Terror Train (1980),14
Action|Horror,Alone in the Dark II (2008),3
Comedy,It's Love I'm After (1937),4
Drama,"Garden of Allah, The (1936)",1
Adventure|Animation|Children,Phantom Boy (2015),7


In [0]:
#Identify genre of user
# We check which genre of a particular user has greater count
df_rating_genre = df_rating.select("userId", "title", "genres").groupby("userId", "genres").count()
display(df_rating_genre)

In [0]:
# check recent movie rated by a user
df_recent_movie = df_ratings1.groupby("userId", "movieId").max("rating_date")
display(df_recent_movie)
# Here for a particular user we will check which movieID has latest max(rating_date)

In [0]:
#trending movie overall
df_ratings_per_genre = df.groupby("genre").avg("avg_rating")
display(df_ratings_per_genre)