In [1]:
import os
import zipfile
from math import exp

from pyspark import Row, SparkContext, rdd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, explode, round, split, udf
from pyspark.sql.types import DoubleType

## Initializing Spark Session

In [2]:
# Obtain the session through the documented builder entry point rather than
# calling the SparkSession constructor directly; re-running this cell then
# returns the already-active session instead of constructing another one.
ss = SparkSession.builder.getOrCreate()
# Context backing the session above.
sc = ss.sparkContext

## Reading data

In [3]:
# Open the dataset archive read-only; its members are extracted in a later cell.
data_zipped = zipfile.ZipFile("data/ml-20m.zip", mode="r")

In [4]:
# Directory that receives the extracted CSV files.
DATA_ROOT_PATH = "data"

# exist_ok=True makes the call idempotent and removes the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(DATA_ROOT_PATH, exist_ok=True)

In [5]:
# Walk the archive members, logging each one and extracting only those that
# are not already present under DATA_ROOT_PATH.
for member in data_zipped.namelist():
    print("Reading {0}".format(member))
    already_extracted = os.path.exists(os.path.join(DATA_ROOT_PATH, member))
    if not already_extracted:
        data_zipped.extract(member, DATA_ROOT_PATH)

Reading genome-scores.csv
Reading genome-tags.csv
Reading links.csv
Reading movies.csv
Reading ratings.csv
Reading README.txt
Reading tags.csv


In [6]:
def readcsv(filename):
    """Load one CSV file from the data directory into a Spark DataFrame.

    The first line of the file is used as the header, and column types are
    inferred from the data.

    Parameters
    ----------
    filename : str
        File name relative to ``DATA_ROOT_PATH`` (e.g. ``"movies.csv"``).

    Returns
    -------
    pyspark.sql.DataFrame
    """
    csv_path = os.path.join(DATA_ROOT_PATH, filename)
    return ss.read.csv(csv_path, header=True, inferSchema=True)

In [7]:
# movieId / tagId / relevance rows; cached because the relevance join below reads it.
genome_scores = readcsv("genome-scores.csv").cache()
genome_scores.show(n=5)

+-------+-----+--------------------+
|movieId|tagId|           relevance|
+-------+-----+--------------------+
|      1|    1|0.025000000000000022|
|      1|    2|0.025000000000000022|
|      1|    3|0.057750000000000024|
|      1|    4|             0.09675|
|      1|    5|             0.14675|
+-------+-----+--------------------+
only showing top 5 rows



In [8]:
# Lookup table from tagId to the tag text.
genome_tags = readcsv("genome-tags.csv")
genome_tags.show(n=5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



In [9]:
# movieId with its imdbId / tmdbId identifiers.
links = readcsv("links.csv")
links.show(n=5)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



In [10]:
# movieId, title and pipe-separated genres; cached because several later joins use it.
movies = readcsv("movies.csv").cache()
movies.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# One row per (userId, movieId) rating, with its timestamp.
ratings = readcsv("ratings.csv")
ratings.show(n=5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
+------+-------+------+----------+
only showing top 5 rows



In [12]:
# Free-text tags that users attached to movies, with timestamps.
tags = readcsv("tags.csv")
tags.show(n=5)

+------+-------+-------------+----------+
|userId|movieId|          tag| timestamp|
+------+-------+-------------+----------+
|    18|   4141|  Mark Waters|1240597180|
|    65|    208|    dark hero|1368150078|
|    65|    353|    dark hero|1368150079|
|    65|    521|noir thriller|1368149983|
|    65|    592|    dark hero|1368150078|
+------+-------+-------------+----------+
only showing top 5 rows



## 1) Users' tag assignment reliability
<img src="misc/critic.jpeg" >

### 1.1) Which "User" gave which "Tag" to which "Movie"?

In [13]:
# Resolve each user-supplied tag text to its genome tagId (exact text match),
# keeping only the identifier columns. Cached: two later joins read it.
userId_tagId_movieId = (
    tags
    .join(genome_tags, on="tag")
    .select("userId", "tagId", "movieId")
    .cache()
)
userId_tagId_movieId.show(n=10)

+------+-----+-------+
|userId|tagId|movieId|
+------+-----+-------+
|    65|  288|    208|
|    65|  288|    353|
|    65|  712|    521|
|    65|  288|    592|
|    65|  149|    668|
|    65|  894|    898|
|    65|  712|   1248|
|    65|  630|   1391|
|    65|  700|   1617|
|    65|  579|   1694|
+------+-----+-------+
only showing top 10 rows



### 1.2) How relevant each tag assignment was

In [14]:
# Attach the genome relevance of the (tag, movie) pair to every tag assignment.
userId_tagId_movieId_relevance = userId_tagId_movieId.join(
    genome_scores,
    on=["tagId", "movieId"],
)
userId_tagId_movieId_relevance.show(n=10)

+-----+-------+------+------------------+
|tagId|movieId|userId|         relevance|
+-----+-------+------+------------------+
|   29|      1|135595|0.8919999999999999|
|   29|      1|131900|0.8919999999999999|
|   29|      1|111202|0.8919999999999999|
|   29|      1|107711|0.8919999999999999|
|   29|      1|103582|0.8919999999999999|
|   29|      1|103125|0.8919999999999999|
|   29|      1| 76878|0.8919999999999999|
|   29|      1| 72257|0.8919999999999999|
|   29|      1| 72073|0.8919999999999999|
|   29|      1| 10616|0.8919999999999999|
+-----+-------+------+------------------+
only showing top 10 rows



### 1.3) Calculating user reliability (ranges between 0 and 1)
#### Irrelevant tag assignments are punished
First multiply the relevance by $15$ and then subtract $9$, so the scaled score ranges from $-9$ to $6$, and then apply the sigmoid function.
Relevance scores less than or equal to $0.2$ yield a reliability close to $0$.
- $ R = \sigma(r * 15 - 9) $
- $R$ Reliability score
- $r$ Relevance score
<img src="misc/sigmoid.png">

In [15]:
def _sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)); a null input yields a null output."""
    if x is None:
        return None
    return 1 / (1 + exp(-x))


# Declare the return type explicitly: without it udf() defaults to StringType
# and the numeric result comes back as text.
sigmoid = udf(_sigmoid, DoubleType())

In [16]:
# Score every tag assignment with sigmoid(15 * relevance - 9), average the
# scores per user, and list the users from most to least reliable.
users_reliability = (
    userId_tagId_movieId_relevance
    .select("userId", "relevance")
    .withColumn("reliability", sigmoid(col("relevance") * 15 - 9).cast(DoubleType()))
    .groupBy("userId")
    .avg("reliability")
    .withColumnRenamed("avg(reliability)", "reliability")
    .orderBy(col("reliability").desc())
)

users_reliability.show(n=10)

+------+------------------+
|userId|       reliability|
+------+------------------+
| 81949|0.9975273768433653|
| 50441|0.9975273768433653|
|136694|0.9975181101552658|
|119384|0.9975181101552658|
|136455|0.9975181101552658|
| 68839|0.9975181101552658|
| 23923|0.9975181101552658|
|113642|0.9975181101552658|
|105357|0.9975181101552658|
| 20109|0.9975180928341759|
+------+------------------+
only showing top 10 rows



## 2) Most reliably rated movies
Ratings are multiplied by users' reliability scores

In [17]:
# Weight every rating by its author's reliability and keep one decimal place.
# `round` here is the pyspark.sql.functions version imported at the top, not
# the Python builtin. Cached: the movie, tag and genre sections all read it.
userId_movieId_ratings = (
    ratings
    .join(users_reliability, on="userId")
    .withColumn("rating", round(col("rating") * col("reliability"), 1))
    .select("userId", "movieId", "rating")
    .cache()
)
userId_movieId_ratings.show(n=10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|  4935|     50|   1.6|
|  4935|    288|   1.9|
|  4935|    413|   2.2|
|  4935|    527|   1.4|
|  4935|    546|   0.5|
|  4935|    562|   1.9|
|  4935|    745|   1.6|
|  4935|   1175|   1.4|
|  4935|   1220|   1.4|
|  4935|   1235|   1.4|
+------+-------+------+
only showing top 10 rows



In [18]:
# Mean weighted rating per movie, labelled with its title, best first.
movieId_title_rating = (
    userId_movieId_ratings
    .select("movieId", "rating")
    .groupBy("movieId")
    .avg("rating")
    .join(movies, on="movieId")
    .select("movieId", "title", col("avg(rating)").alias("rating"))
    .orderBy(col("rating").desc())
)

movieId_title_rating.show(n=10)

+-------+--------------------+------+
|movieId|               title|rating|
+-------+--------------------+------+
|  81736|Hamlet (Gamlet) (...|   5.0|
|  94153| Cold Storage (2000)|   5.0|
|  27829|      Slasher (2004)|   5.0|
|  94027|Uwasa No Onna (Th...|   5.0|
|  86055|Foster Brothers, ...|   5.0|
|  87332|City Below, The (...|   5.0|
|  93967|Keeping the Promi...|   5.0|
|  89083|Great White Silen...|   5.0|
|  51402|Forest for the Tr...|   5.0|
|  93707|Prom Queen: The M...|   5.0|
+-------+--------------------+------+
only showing top 10 rows



### Highest-rated movie

<img src="misc/best_movie.jpg" width=400>

## 3) Most reliably rated tags

In [19]:
# Pair each weighted rating with the tags the same user gave the same movie,
# then average per tag and label the result with the tag text, best first.
tagId_tag_rating = (
    userId_movieId_ratings
    .join(userId_tagId_movieId, on=["movieId", "userId"])
    .select("tagId", "rating")
    .groupBy("tagId")
    .avg("rating")
    .join(genome_tags, on="tagId")
    .select("tagId", "tag", col("avg(rating)").alias("rating"))
    .orderBy(col("rating").desc())
)

tagId_tag_rating.show(n=10)

+-----+--------------------+------+
|tagId|                 tag|rating|
+-----+--------------------+------+
|  654|           minnesota|   4.8|
|  590|         king arthur|   4.8|
|  976|       studio ghibli|   4.6|
|  290|           dc comics|   4.5|
|  760|oscar (best writi...|   4.5|
|  591|             kubrick|4.4375|
|  677|              mozart|   4.4|
|  858|    robert downey jr|   4.3|
|  207|        chris tucker|   4.3|
|  879| saturday night live|  4.25|
+-----+--------------------+------+
only showing top 10 rows



## 4) Most reliably rated genres

In [20]:
# Attach title and genres to every weighted rating, remembering the column
# order so the plain tuples built below can be mapped back to field names.
genre_rating = userId_movieId_ratings.join(movies, on="movieId")
columns = genre_rating.columns

# Key every record by a generated unique id: (id, tuple_of_row_values).
genre_rating_rdd = (
    genre_rating.rdd
    .zipWithUniqueId()
    .map(lambda pair: (pair[1], tuple(pair[0])))
)
del genre_rating  # the name is rebound to the per-genre averages further down
genre_rating_rdd.take(1)

[(0, (50, 4935, 1.6, 'Usual Suspects, The (1995)', 'Crime|Mystery|Thriller'))]

In [None]:
# Label the values of the first record with their column names for inspection.
{name: value for name, value in zip(columns, genre_rating_rdd.take(1)[0][1])}

{'movieId': 50,
 'userId': 4935,
 'rating': 1.6,
 'title': 'Usual Suspects, The (1995)',
 'genres': 'Crime|Mystery|Thriller'}

In [None]:
# Emit one (Id, genre) row per genre: the genres string is the last field of
# each record and uses '|' as its separator.
ratingId_genre = (
    genre_rating_rdd
    .flatMapValues(lambda values: values[-1].split('|'))
    .map(lambda pair: Row(Id=pair[0], genre=pair[1]))
    .toDF()
)
ratingId_genre.show(10)

+---+--------+
| Id|   genre|
+---+--------+
|  0|   Crime|
|  0| Mystery|
|  0|Thriller|
|200|  Action|
|200|   Crime|
|200|Thriller|
|400|  Comedy|
|600|   Drama|
|600|     War|
|800|  Action|
+---+--------+
only showing top 10 rows



In [None]:
# Mean weighted rating per genre, best first.
#
# The genres string is split and exploded with native Spark SQL functions, so
# each rating contributes one row per genre without shipping every record
# through a Python RDD and joining it back on a synthetic id.
# split() takes a regular expression, hence the escaped "|" separator.
genre_rating = (
    userId_movieId_ratings
    .join(movies, on="movieId")
    .select(explode(split(col("genres"), r"\|")).alias("genre"), "rating")
    .groupBy("genre")
    .avg("rating")
    .select("genre", col("avg(rating)").alias("rating"))
    .sort(desc("rating"))
)

genre_rating.cache()
genre_rating.show(20)

<img src="misc/best_genre.jpg">