In [1]:
import sys
sys.path.append("../src")

from Utils import *

## Initializing Spark Session

In [2]:
sc = SparkContext.getOrCreate()
ss = SparkSession(sc)

## Reading data

In [3]:
data_zipped = zipfile.ZipFile("../data/ml-20m.zip")

In [4]:
DATA_ROOT_PATH = "data"

if not os.path.isdir("data"):
    os.makedirs("data")

In [5]:
for f in data_zipped.filelist:
    filename = f.filename
    print("Reading {0}".format(filename))
    if os.path.exists(os.path.join(DATA_ROOT_PATH, filename)): continue
    
    data_zipped.extract(filename, DATA_ROOT_PATH)

Reading genome-scores.csv
Reading genome-tags.csv
Reading links.csv
Reading movies.csv
Reading ratings.csv
Reading README.txt
Reading tags.csv


In [6]:
readcsv = lambda filename: ss.read\
                            .format("csv")\
                            .option('header', 'true') \
                            .load(os.path.join("data", filename), inferSchema='true')

In [7]:
genome_scores = readcsv("genome-scores.csv")
print("Movies: {0:,}".format(get_count(genome_scores,"movieId")))
genome_scores.cache()
genome_scores.show(5)

Movies: 10,381
+-------+-----+--------------------+
|movieId|tagId|           relevance|
+-------+-----+--------------------+
|      1|    1|0.025000000000000022|
|      1|    2|0.025000000000000022|
|      1|    3|0.057750000000000024|
|      1|    4|             0.09675|
|      1|    5|             0.14675|
+-------+-----+--------------------+
only showing top 5 rows



In [8]:
genome_tags = readcsv("genome-tags.csv")
genome_tags.show(5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



In [9]:
links = readcsv("links.csv")
links.show(5)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



In [10]:
movies = readcsv("movies.csv")
print("Movies: {0:,}".format(movies.count()))
movies.cache()
movies.show(5)

Movies: 27,278
+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [11]:
ratings = readcsv("ratings.csv").withColumn("ratingId", monotonically_increasing_id())
ratings.show(5)

+------+-------+------+----------+--------+
|userId|movieId|rating| timestamp|ratingId|
+------+-------+------+----------+--------+
|     1|      2|   3.5|1112486027|       0|
|     1|     29|   3.5|1112484676|       1|
|     1|     32|   3.5|1112484819|       2|
|     1|     47|   3.5|1112484727|       3|
|     1|     50|   3.5|1112484580|       4|
+------+-------+------+----------+--------+
only showing top 5 rows



In [12]:
print("Users: {0:,}".format(get_count(ratings,"userId")))
print("Movies: {0:,}".format(get_count(ratings,"movieId")))
print("Ratings: {0:,}".format(ratings.count()))

Users: 138,493
Movies: 26,744
Ratings: 20,000,263


In [13]:
tags = readcsv("tags.csv")
tags.show(5)

+------+-------+-------------+----------+
|userId|movieId|          tag| timestamp|
+------+-------+-------------+----------+
|    18|   4141|  Mark Waters|1240597180|
|    65|    208|    dark hero|1368150078|
|    65|    353|    dark hero|1368150079|
|    65|    521|noir thriller|1368149983|
|    65|    592|    dark hero|1368150078|
+------+-------+-------------+----------+
only showing top 5 rows



## 1) Users' tag assignment reliability

<img src="../misc/critic.jpeg" >

### 1.1) Which "User" gave which "Tag" to which "Movie"?

In [14]:
userId_tagId_movieId = tags.join(genome_tags, on="tag").select(["userId","tagId","movieId"])
userId_tagId_movieId.cache()
userId_tagId_movieId.show(10)

+------+-----+-------+
|userId|tagId|movieId|
+------+-----+-------+
|    65|  288|    208|
|    65|  288|    353|
|    65|  712|    521|
|    65|  288|    592|
|    65|  149|    668|
|    65|  894|    898|
|    65|  712|   1248|
|    65|  630|   1391|
|    65|  700|   1617|
|    65|  579|   1694|
+------+-----+-------+
only showing top 10 rows



### 1.2) How relevant each tag assignment was

In [15]:
userId_tagId_movieId_relevance = userId_tagId_movieId.join(genome_scores, on=["tagId","movieId"])
userId_tagId_movieId_relevance.show(10)

+-----+-------+------+------------------+
|tagId|movieId|userId|         relevance|
+-----+-------+------+------------------+
|   29|      1|135595|0.8919999999999999|
|   29|      1|131900|0.8919999999999999|
|   29|      1|111202|0.8919999999999999|
|   29|      1|107711|0.8919999999999999|
|   29|      1|103582|0.8919999999999999|
|   29|      1|103125|0.8919999999999999|
|   29|      1| 76878|0.8919999999999999|
|   29|      1| 72257|0.8919999999999999|
|   29|      1| 72073|0.8919999999999999|
|   29|      1| 10616|0.8919999999999999|
+-----+-------+------+------------------+
only showing top 10 rows



### 1.3) Calculating user reliability (ranges btw 0 --> 1)
#### Irrelevant tag assignments are punished
First multiply relevance with 10 and then substract $8$ so relevance score range will be between $-6$ and $6$ and then apply sigmoid function.
Relevance scores less than or equal to $0.2$ will go closer to $0$.
- $ R = \sigma(r * 15 - 9) $
- $R$ Reliability score
- $r$ Relevance score
<img src="../misc/sigmoid.png">

In [16]:
sigmoid = udf(lambda x: 1/(1+exp(-x)))

In [17]:
users_reliability = userId_tagId_movieId_relevance.select(["userId","relevance"])\
                                            .withColumn("reliability", sigmoid(userId_tagId_movieId_relevance.relevance*15-9).cast(DoubleType()))\
                                            .groupBy("userId").avg("reliability")\
                                            .select(["userId", col("avg(reliability)").alias("reliability")])\
                                            .sort(desc("reliability"))

users_reliability.show(10)

+------+------------------+
|userId|       reliability|
+------+------------------+
| 81949|0.9975273768433653|
| 50441|0.9975273768433653|
|105357|0.9975181101552658|
|113642|0.9975181101552658|
|119384|0.9975181101552658|
| 23923|0.9975181101552658|
| 68839|0.9975181101552658|
|136694|0.9975181101552658|
|136455|0.9975181101552658|
| 20109|0.9975180928341759|
+------+------------------+
only showing top 10 rows



## 2) Most reliably rated movies
Ratings are multiplied by users' reliability scores

In [19]:
userId_movieId_title_ratings  = ratings.join(users_reliability, on="userId")\
                                 .withColumn("reliable_rating", col("rating")*col("reliability"))\
                                 .join(movies, on="movieId")\
                                 .select(["ratingId", "userId", "movieId",
                                          "title", "genres", "rating", "reliable_rating"])
userId_movieId_title_ratings.cache()
userId_movieId_title_ratings.show(10)

+--------+------+-------+--------------------+--------------------+------+------------------+
|ratingId|userId|movieId|               title|              genres|rating|   reliable_rating|
+--------+------+-------+--------------------+--------------------+------+------------------+
|  738649|  4935|     50|Usual Suspects, T...|Crime|Mystery|Thr...|   3.0|1.6281716302354838|
|  738650|  4935|    288|Natural Born Kill...|Action|Crime|Thri...|   3.5|1.8995335686080645|
|  738651|  4935|    413|     Airheads (1994)|              Comedy|   4.0|2.1708955069806453|
|  738652|  4935|    527|Schindler's List ...|           Drama|War|   2.5|1.3568096918629033|
|  738653|  4935|    546|Super Mario Bros....|Action|Adventure|...|   1.0|0.5427238767451613|
|  738654|  4935|    562|Welcome to the Do...|        Comedy|Drama|   3.5|1.8995335686080645|
|  738655|  4935|    745|Wallace & Gromit:...|Animation|Childre...|   3.0|1.6281716302354838|
|  738656|  4935|   1175| Delicatessen (1991)|Comedy|Drama|R

In [20]:
userId_movieId_title_ratings.write.mode('overwrite').csv("../data/userId_movieId_title_ratings.csv", header=True)

In [20]:
movieId_title_rating = userId_movieId_title_ratings.select(["movieId", "title", "reliable_rating"])\
                                       .groupBy(["movieId", "title"]).avg("reliable_rating")\
                                       .select(["movieId", "title", col("avg(reliable_rating)").alias("rating")])\
                                       .sort(desc("rating"))

movieId_title_rating.show(10)

+-------+--------------------+------------------+
|movieId|               title|            rating|
+-------+--------------------+------------------+
| 117314|Neurons to Nirvan...| 4.987590550776329|
|  27829|      Slasher (2004)| 4.985802022050766|
| 113244|  When I Walk (2013)| 4.985802022050766|
|  99450|   Sun Kissed (2012)| 4.985802022050766|
|  96935|My Left Eye Sees ...| 4.985802022050766|
|  33380|     25 Watts (2001)| 4.984699608420732|
|  51402|Forest for the Tr...|4.9844604768211145|
|  94394|Scarlet Letter, T...| 4.983997618057762|
|  95600|Candles on Bay St...| 4.983997618057762|
|  95606|Thomas Jefferson ...| 4.983997618057762|
+-------+--------------------+------------------+
only showing top 10 rows



### Most rated movie

<img src="../misc/best_movie.jpg">

## 3) Most reliably rated tags

In [21]:
tagId_tag_rating = userId_movieId_title_ratings.join(userId_tagId_movieId, on=["movieId", "userId"])\
                                   .select(["tagId", "reliable_rating"])\
                                   .groupBy("tagId").avg("reliable_rating")\
                                   .join(genome_tags, on="tagId")\
                                   .select(["tagId", "tag", col("avg(reliable_rating)").alias("rating")])\
                                   .sort(desc("rating"))

tagId_tag_rating.show(10)

+-----+--------------------+------------------+
|tagId|                 tag|            rating|
+-----+--------------------+------------------+
|  654|           minnesota| 4.823053035379038|
|  590|         king arthur| 4.797952835820484|
|  976|       studio ghibli| 4.554853506188381|
|  760|oscar (best writi...| 4.468975853673555|
|  290|           dc comics| 4.461021798385487|
|  591|             kubrick| 4.419001271985087|
|  677|              mozart|4.3786996205063415|
|  207|        chris tucker| 4.343594240545502|
|  858|    robert downey jr| 4.269332120489323|
|  879| saturday night live| 4.240571940391272|
+-----+--------------------+------------------+
only showing top 10 rows



## 3) Most reliably rated genres

In [22]:
genre_rating = userId_movieId_title_ratings.join(movies, on="movieId")
columns = genre_rating.columns
genre_rating_rdd = genre_rating.rdd.map(lambda x: (x["ratingId"], x))
del genre_rating
genre_rating_rdd.take(1)

[(738649,
  Row(movieId=50, ratingId=738649, userId=4935, title='Usual Suspects, The (1995)', rating=3.0, reliable_rating=1.6281716302354838, title='Usual Suspects, The (1995)', genres='Crime|Mystery|Thriller'))]

In [23]:
dict(zip(columns, genre_rating_rdd.take(1)[0][1]))

{'movieId': 50,
 'ratingId': 738649,
 'userId': 4935,
 'title': 'Usual Suspects, The (1995)',
 'rating': 3.0,
 'reliable_rating': 1.6281716302354838,
 'genres': 'Crime|Mystery|Thriller'}

In [24]:
ratingId_genre = genre_rating_rdd.flatMapValues(lambda row: row[-1].split('|'))\
                                 .map(lambda row: Row(ratingId=row[0], genre=row[1]))\
                                 .toDF()
ratingId_genre.show(10)

+--------+--------+
|   genre|ratingId|
+--------+--------+
|   Crime|  738649|
| Mystery|  738649|
|Thriller|  738649|
|  Action|  738650|
|   Crime|  738650|
|Thriller|  738650|
|  Comedy|  738651|
|   Drama|  738652|
|     War|  738652|
|  Action|  738653|
+--------+--------+
only showing top 10 rows



In [25]:
genre_rating = genre_rating_rdd.map(lambda row: Row(**dict(zip(columns, row[1]))))\
                               .toDF()\
                               .join(ratingId_genre, on="ratingId")\
                               .select(["genre", "reliable_rating"])\
                               .groupBy("genre").avg("reliable_rating")\
                               .select(["genre", dfround(col("avg(reliable_rating)"), 3).alias("rating")])\
                               .sort(desc("rating"))\
                               .withColumn("Id", monotonically_increasing_id())\
                               .select(["Id", "genre", "rating"])

genre_rating.cache()
genre_rating.show(20)

+------------+------------------+------+
|          Id|             genre|rating|
+------------+------------------+------+
|           0|         Film-Noir| 3.018|
|  8589934592|               War| 2.914|
| 17179869184|             Crime| 2.856|
| 25769803776|           Mystery| 2.851|
| 34359738368|             Drama| 2.839|
| 42949672960|       Documentary| 2.833|
| 51539607552|         Animation| 2.822|
| 60129542144|              IMAX| 2.808|
| 68719476736|           Western| 2.801|
| 77309411328|          Thriller| 2.737|
| 85899345920|         Adventure| 2.724|
| 85899345921|           Musical| 2.724|
| 94489280512|           Fantasy| 2.718|
|103079215104|            Sci-Fi| 2.717|
|111669149696|           Romance| 2.706|
|120259084288|            Action| 2.699|
|128849018880|            Comedy| 2.631|
|137438953472|          Children| 2.623|
|146028888064|(no genres listed)| 2.609|
|154618822656|            Horror| 2.557|
+------------+------------------+------+



<img src="../misc/best_genre.jpg">