In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

### Task 2: clean `movies.csv` and write it to the Cassandra `movies` table

In [2]:
# Location of the movies CSV that is read into the `movies` DataFrame.
movie_path = "/data/movielens/movies.csv"

In [3]:
# Explicit schema applied when reading the movies CSV: an integer id plus
# two string columns. All three fields are nullable (the default).
movie_schema = (
    StructType()
    .add("movieid", IntegerType())
    .add("title", StringType())
    .add("genres", StringType())
)

In [4]:
# Read the movies CSV with the explicit schema; the first line of the file
# is treated as a header row rather than data.
movies = (
    spark.read
    .schema(movie_schema)
    .option("header", True)
    .csv(movie_path)
)

In [5]:
# Clean the raw movie rows before they are written to Cassandra.
#
# Output columns:
#   movieid - unchanged integer id from the CSV (no trim(): trimming an
#             integer column only has the effect of casting it to string)
#   title   - the raw title with its trailing "(YYYY)" suffix removed and
#             surrounding whitespace trimmed
#   year    - the four digits inside that trailing "(YYYY)", cast to int
#   genres  - the pipe-separated genre string split into an array
#
# Rows whose genres are '(no genres listed)' are filtered out, and any row
# with a null column is dropped. A title without a trailing "(YYYY)" makes
# regexp_extract return '', which casts to a null year (with ANSI mode off),
# so such rows are removed by na.drop() as well.
year_suffix_re = r'\s*\((\d{4})\)\s*$'

movies_upd = (
    movies
    .filter(col('genres') != '(no genres listed)')
    .select(
        col('movieid'),
        trim(regexp_replace(col('title'), year_suffix_re, '')).alias('title'),
        regexp_extract(col('title'), year_suffix_re, 1).cast('int').alias('year'),
        # Raw string: split() takes a regex, and '\|' in a normal string
        # literal is an invalid Python escape sequence.
        split(col('genres'), r'\|').alias('genres'),
    )
    .na.drop()
)

In [6]:
# Append the cleaned movies to table `movies` in keyspace `mf_goryacheva`
# through the Spark Cassandra connector data source.
(
    movies_upd.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("table", "movies")
    .option("keyspace", "mf_goryacheva")
    .save()
)

### Task 3: one row per (genre, movie), written to the Cassandra `movies_by_genre` table

In [7]:
# Flatten the genres array: every movie yields one row per genre it lists.
# The exploded value keeps the column name `genres`; rows containing any
# null are dropped afterwards.
movies_by_genre = (
    movies_upd
    .select(
        explode(col("genres")).alias("genres"),
        "year",
        "movieid",
        "title",
    )
    .dropna()
)

In [8]:
# Append the per-genre rows to table `movies_by_genre` in keyspace
# `mf_goryacheva` through the Spark Cassandra connector data source.
(
    movies_by_genre.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("table", "movies_by_genre")
    .option("keyspace", "mf_goryacheva")
    .save()
)

### Task 5: attach each movie's average rating and write to the Cassandra `movies_by_genre_rating` table

In [9]:
# Location of the ratings CSV that is read into the `ratings` DataFrame.
ratings_path = "/data/movielens/ratings.csv"

In [10]:
# Explicit schema applied when reading the ratings CSV: integer user and
# movie ids, a float rating and an integer timestamp. All fields are
# nullable (the default).
ratings_schema = (
    StructType()
    .add("userid", IntegerType())
    .add("movieid", IntegerType())
    .add("rating", FloatType())
    .add("timestamp", IntegerType())
)

In [11]:
# Read the ratings CSV with the explicit schema; the first line of the file
# is treated as a header row rather than data.
ratings = (
    spark.read
    .schema(ratings_schema)
    .option("header", True)
    .csv(ratings_path)
)

In [12]:
# Preview the first five parsed rating rows.
ratings.show(n=5)

+------+-------+------+---------+
|userid|movieid|rating|timestamp|
+------+-------+------+---------+
|     1|    122|   2.0|945544824|
|     1|    172|   1.0|945544871|
|     1|   1221|   5.0|945544788|
|     1|   1441|   4.0|945544871|
|     1|   1609|   3.0|945544824|
+------+-------+------+---------+
only showing top 5 rows



In [13]:
# Mean rating per movie. The key is renamed to `movieid_rnk` so it does not
# clash with `movieid` when this frame is joined to another one, and the
# mean is exposed as `avg_rnk`.
avg_ratings = (
    ratings
    .groupBy("movieid")
    .agg(avg("rating").alias("avg_rnk"))
    .withColumnRenamed("movieid", "movieid_rnk")
)

In [14]:
# Attach the average rating to every (genre, movie) row. The inner join
# keeps only movies that have at least one rating; the duplicate key column
# from the ratings side is discarded afterwards.
join_condition = col("movieid") == col("movieid_rnk")
movies_by_genre_rating = (
    movies_by_genre
    .join(avg_ratings, on=join_condition, how="inner")
    .drop("movieid_rnk")
)

In [15]:
# Preview the first five joined rows.
movies_by_genre_rating.show(n=5)

+------+----+-------+--------------------+------------------+
|genres|year|movieid|               title|           avg_rnk|
+------+----+-------+--------------------+------------------+
|Sci-Fi|1997|   1580|Men in Black (a.k...| 3.567517702204049|
|Comedy|1997|   1580|Men in Black (a.k...| 3.567517702204049|
|Action|1997|   1580|Men in Black (a.k...| 3.567517702204049|
|Sci-Fi|1999|   3175|        Galaxy Quest|3.5880611270296083|
|Comedy|1999|   3175|        Galaxy Quest|3.5880611270296083|
+------+----+-------+--------------------+------------------+
only showing top 5 rows



In [17]:
# Append the rated per-genre rows to table `movies_by_genre_rating` in
# keyspace `mf_goryacheva` through the Spark Cassandra connector data source.
(
    movies_by_genre_rating.write
    .format("org.apache.spark.sql.cassandra")
    .mode("append")
    .option("table", "movies_by_genre_rating")
    .option("keyspace", "mf_goryacheva")
    .save()
)