In [1]:
import wget
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()


wget.download("https://datasets.imdbws.com/name.basics.tsv.gz")


## title.principals

#### title.principals.tsv.gz – Contains the principal cast/crew for titles
- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

In [None]:
data_file = "./title.principals.tsv.gz"
raw_data = sc.textFile(data_file) 
header = raw_data.first() #remove header
raw_data = raw_data.filter(lambda x: header != x)

In [3]:
# dataframe
df_title_principal = spark.read.option("header","true").option("sep","\t").option("inferSchema","true").csv("./title.principals.tsv.gz")

In [4]:
df_names = spark.read.option("header","true").option("sep","\t").option("inferSchema","true").csv("./name.basics.tsv.gz")

In [5]:
df_title = spark.read.option("header","true").option("sep","\t").option("inferSchema","true").csv("./title.basics.tsv.gz")

In [31]:
df_ratings = spark.read.option("header","true").option("sep","\t").option("inferSchema","true").csv("./title.ratings.tsv.gz")
df_ratings.createOrReplaceTempView("rating")

In [6]:
df_title.createOrReplaceTempView("title")

In [10]:
df_names.createOrReplaceTempView("name")

In [11]:
df_title_principal.createOrReplaceTempView("title_principal")

In [13]:
df_title.count()

6782090

In [14]:
df_names.count()

10078193

In [15]:
df.count()

38928801

In [8]:
def execute_query(query, rows=20):
    sqlDF = spark.sql(query)
    sqlDF.show(rows)

In [12]:
execute_query("show tables")

+--------+----------------+-----------+
|database|       tableName|isTemporary|
+--------+----------------+-----------+
| default|covid19_bucketed|      false|
|        |            name|       true|
|        |           title|       true|
|        | title_principal|       true|
+--------+----------------+-----------+



In [13]:
execute_query("describe table title")

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|        tconst|   string|   null|
|     titleType|   string|   null|
|  primaryTitle|   string|   null|
| originalTitle|   string|   null|
|       isAdult|      int|   null|
|     startYear|   string|   null|
|       endYear|   string|   null|
|runtimeMinutes|   string|   null|
|        genres|   string|   null|
+--------------+---------+-------+



In [26]:
execute_query("describe table name")

+-----------------+---------+-------+
|         col_name|data_type|comment|
+-----------------+---------+-------+
|           nconst|   string|   null|
|      primaryName|   string|   null|
|        birthYear|   string|   null|
|        deathYear|   string|   null|
|primaryProfession|   string|   null|
|   knownForTitles|   string|   null|
+-----------------+---------+-------+



In [32]:
execute_query("describe table rating")

+-------------+---------+-------+
|     col_name|data_type|comment|
+-------------+---------+-------+
|       tconst|   string|   null|
|averageRating|   double|   null|
|     numVotes|      int|   null|
+-------------+---------+-------+



In [33]:
execute_query("select * from rating limit 30")

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.6|    1608|
|tt0000002|          6.0|     197|
|tt0000003|          6.5|    1286|
|tt0000004|          6.1|     121|
|tt0000005|          6.1|    2051|
|tt0000006|          5.1|     111|
|tt0000007|          5.4|     639|
|tt0000008|          5.4|    1761|
|tt0000009|          5.9|     145|
|tt0000010|          6.9|    5786|
|tt0000011|          5.2|     253|
|tt0000012|          7.4|    9920|
|tt0000013|          5.7|    1520|
|tt0000014|          7.1|    4330|
|tt0000015|          6.1|     784|
|tt0000016|          5.9|    1150|
|tt0000017|          4.6|     232|
|tt0000018|          5.3|     469|
|tt0000019|          5.5|      17|
|tt0000020|          5.0|     260|
+---------+-------------+--------+
only showing top 20 rows



In [18]:
execute_query("""
select tconst, 
    count(1) as nrorep 
from title a
group by tconst
having count(1)>1 
limit 10
""")

+------+------+
|tconst|nrorep|
+------+------+
+------+------+



In [19]:
execute_query("""
select nconst, 
    count(1) as nrorep 
from name a
group by nconst
having count(1)>1 
limit 10
""")

+------+------+
|nconst|nrorep|
+------+------+
+------+------+



In [34]:
execute_query("""
select tconst, 
    count(1) as nrorep 
from rating a
group by tconst
having count(1)>1 
limit 10
""")

+------+------+
|tconst|nrorep|
+------+------+
+------+------+



In [24]:
execute_query("""
select * from title_principal 
limit 10
""", 20)

+---------+--------+---------+---------------+--------------------+----------+
|   tconst|ordering|   nconst|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                  \N|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|        \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|        \N|
|tt0000002|       1|nm0721526|       director|                  \N|        \N|
|tt0000002|       2|nm1335271|       composer|                  \N|        \N|
|tt0000003|       1|nm0721526|       director|                  \N|        \N|
|tt0000003|       2|nm5442194|       producer|            producer|        \N|
|tt0000003|       3|nm1335271|       composer|                  \N|        \N|
|tt0000003|       4|nm5442200|         editor|                  \N|        \N|
|tt0000004|       1|nm0721526|       director|      

In [36]:
execute_query("""
select 
    b.originalTitle, 
    avg(d.averageRating*d.numVotes) rating
from title_principal a
inner join title b on a.tconst = b.tconst 
inner join name c on a.nconst = c.nconst
inner join rating d on a.tconst = d.tconst
where a.category = 'actor'
and b.genres like '%Comedy%'
group by b.originalTitle
order by 2 desc
limit 20
""", 20)

+--------------------+------------------+
|       originalTitle|            rating|
+--------------------+------------------+
|     The Truman Show| 7185890.699999999|
|      Monsters, Inc.|         6249232.0|
|        Intouchables|         6130565.5|
|    The Big Lebowski|         5646834.0|
|Le fabuleux desti...| 5642057.800000001|
|        Finding Nemo|       5542830.675|
|The Grand Budapes...| 5440316.399999999|
|        The Hangover|         5358052.7|
|Silver Linings Pl...| 4944462.600000001|
|         Ratatouille|         4902784.0|
|Guardians of the ...|        4835993.62|
|               Shrek|         4606399.2|
|Birdman or (The U...|         4263867.3|
|Lock, Stock and T...|         4224681.0|
|         Toy Story 2|         3991980.6|
|Monty Python and ...|3962403.9999999995|
|   Shaun of the Dead|         3903263.6|
|            Kick-Ass|3896276.7999999993|
|            Superbad|3827747.5999999996|
|       Despicable Me|         3677412.0|
+--------------------+------------