In [32]:
# import necesary packages:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import array_contains
from pyspark.sql.types import DoubleType



In [2]:
# We create a spark Session
spark = SparkSession.builder.appName("Jupyter Notebook").getOrCreate()

spark

In [4]:
# We load the data:
data = spark.read.json("movie.json")

In [6]:
data.show(1)

+---------------+------+---------+-----------+---------+-----+--------+-------+---------+---------+------+-------+-------+-----+------------+-----+----+
|_corrupt_record|actors|countries|description|directors|genre|imdb_url|img_url|languages|metascore|rating|runtime|tagline|title|users_rating|votes|year|
+---------------+------+---------+-----------+---------+-----+--------+-------+---------+---------+------+-------+-------+-----+------------+-----+----+
|              [|  null|     null|       null|     null| null|    null|   null|     null|     null|  null|   null|   null| null|        null| null|null|
+---------------+------+---------+-----------+---------+-----+--------+-------+---------+---------+------+-------+-------+-----+------------+-----+----+
only showing top 1 row



In [22]:
# First i will keep just the rows with that the title starts with 'V' :
V_data = data.filter(col('title').startswith('V'))

# Now i will sort the data frame in descending order based on the users_rating column:
sorted_V_data = V_data.sort(col('users_rating').desc())

# Finally i keep the columns that i need: title, year and the users rating and i print the first row:
sorted_V_data.select('title', 'year','users_rating').head()

Row(title='Violet', year='2020', users_rating='8.8')

In [40]:
# First i will keep just the rows with that the title starts with 'L' :
l_data = data.filter(col('title').startswith('L'))

# i have to change the column data type of the users rating to numeric: 
l_n_data = l_data.withColumn("users_rating", data["users_rating"].cast(DoubleType()))

# i calculate the mean user rating for each title
l_n_avg_data = l_n_data.groupBy('title').mean('users_rating')

l_n_avg_data.show()

+--------------------+-----------------+
|               title|avg(users_rating)|
+--------------------+-----------------+
|Lost in the White...|              5.5|
|Life of an Actres...|              5.2|
|            Lord Jim|              6.4|
|            Love Toy|              5.7|
|              Luster|              5.2|
|     Limehouse Blues|              6.2|
|         Lucky Stiff|4.949999999999999|
|   Lady of Burlesque|              6.4|
|     Lost Boundaries|              7.0|
|Legend of the Whi...|              5.3|
|       Livin' Large!|              5.0|
| L.A. Streetfighters|              4.6|
|    Lost & Turnt Out|              4.1|
|    Listen Up Philip|              6.3|
|Life Begins at Ei...|              6.1|
|         Lie with Me|              5.7|
|      Lenexa, 1 Mile|              5.8|
|   Lone Wolf McQuade|              6.4|
|     Love and Debate|              5.6|
|     Lost in a Harem|              6.6|
+--------------------+-----------------+
only showing top

In [45]:
# First i will keep just the rows with that the title starts with 'A' :
a_data = data.filter(col('title').startswith('A'))

# i have to change the column data type of the votes to numeric: 
a_n_data = a_data.withColumn("votes", data["votes"].cast(DoubleType()))

# Now i will sort the data frame in descending order based on the votes column:
sorted_A_data = a_n_data.sort(col('votes').desc())

sorted_A_data.select('title','year','votes').head()

Row(title='A.P.E.X.', year='1994', votes=996.0)