In [1]:
import wget
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import Row

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)


wget.download("https://datasets.imdbws.com/name.basics.tsv.gz")


## title.principals

#### title.principals.tsv.gz – Contains the principal cast/crew for titles
- tconst (string) - alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- nconst (string) - alphanumeric unique identifier of the name/person
- category (string) - the category of job that person was in
- job (string) - the specific job title if applicable, else '\N'
- characters (string) - the name of the character played if applicable, else '\N'

In [12]:
data_file = "./title.principals.tsv.gz"
raw_data = sc.textFile(data_file) 
header = raw_data.first() #remove header
raw_data = raw_data.filter(lambda x: header != x)

In [7]:
# dataframe
df = spark.read.option("header","true").option("inferSchema","true").csv("./title.principals.tsv.gz")
df.show()

+----------------------------------------------+
|tconst	ordering	nconst	category	job	characters|
+----------------------------------------------+
|                          tt0000001	1	nm158...|
|                          tt0000001	2	nm000...|
|                          tt0000001	3	nm037...|
|                          tt0000002	1	nm072...|
|                          tt0000002	2	nm133...|
|                          tt0000003	1	nm072...|
|                          tt0000003	2	nm544...|
|                          tt0000003	3	nm133...|
|                          tt0000003	4	nm544...|
|                          tt0000004	1	nm072...|
|                          tt0000004	2	nm133...|
|                          tt0000005	1	nm044...|
|                          tt0000005	2	nm065...|
|                          tt0000005	3	nm000...|
|                          tt0000005	4	nm024...|
|                          tt0000006	1	nm000...|
|                          tt0000007	1	nm017...|
|                   

In [16]:
tsv_data = raw_data.map(lambda l: l.split("\t"))
row_data = tsv_data.map(lambda p: Row(
    tconst=p[0],
    ordering=int(p[1]),
    nconst=p[2],
    category=p[3],
    job=p[4],
    characters=p[5]
    )
)

In [17]:
principals_df = sqlContext.createDataFrame(row_data)
principals_df.registerTempTable("principals")

In [20]:
# Select tcp network interactions with more than 1 second duration and no transfer from destination
tcp_principals = sqlContext.sql("""
    SELECT * FROM principals limit 10
""")
tcp_principals.show()

+-------------------+-------------+
|           category|sum(ordering)|
+-------------------+-------------+
|            actress|     23656276|
|           producer|     15159835|
|             writer|     34055856|
|           composer|      9380561|
|           director|     19989895|
|               self|     22967709|
|              actor|     32012753|
|             editor|      9122407|
|    cinematographer|      9634074|
|      archive_sound|        10128|
|production_designer|      2378722|
|    archive_footage|      1130628|
+-------------------+-------------+



## title.akas

#### title.akas.tsv.gz - Contains the following information for titles:

- titleId (string) - a tconst, an alphanumeric unique identifier of the title
- ordering (integer) – a number to uniquely identify rows for a given titleId
- title (string) – the localized title
- region (string) - the region for this version of the title
- language (string) - the language of the title
- types (array) - Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning
- attributes (array) - Additional terms to describe this alternative title, not enumerated
- isOriginalTitle (boolean) – 0: not original title; 1: original title

In [21]:
data_file = "./title.akas.tsv.gz"
raw_data = sc.textFile(data_file)
header = raw_data.first() #remove header
raw_data = raw_data.filter(lambda x: header != x)

In [22]:
tsv_data = raw_data.map(lambda l: l.split("\t"))

row_data = tsv_data.map(lambda p: Row(
    titleId=p[0],
    ordering=int(p[1]),
    title=p[2],
    region=p[3],
    language=p[4],
    types=list(p[5]),
    attributes=list(p[6]),
    isOriginalTitle=bool(p[7])
    )
)

In [23]:
akas_df = sqlContext.createDataFrame(row_data)
akas_df.registerTempTable("akas")

# Select tcp network interactions with more than 1 second duration and no transfer from destination
tcp_akas = sqlContext.sql("""
    SELECT * FROM akas limit 10
""")
tcp_akas.show()

+--------------------+---------------+--------+--------+------+--------------------+---------+--------------------+
|          attributes|isOriginalTitle|language|ordering|region|               title|  titleId|               types|
+--------------------+---------------+--------+--------+------+--------------------+---------+--------------------+
|              [\, N]|           true|      \N|       1|    UA|          Карменсіта|tt0000001|[i, m, d, b, D, i...|
|[l, i, t, e, r, a...|           true|      \N|       2|    DE|          Carmencita|tt0000001|              [\, N]|
|              [\, N]|           true|      \N|       3|    HU|Carmencita - span...|tt0000001|[i, m, d, b, D, i...|
|              [\, N]|           true|      \N|       4|    GR|          Καρμενσίτα|tt0000001|[i, m, d, b, D, i...|
|              [\, N]|           true|      \N|       5|    RU|          Карменсита|tt0000001|[i, m, d, b, D, i...|
|              [\, N]|           true|      \N|       6|    US|         

## title.basics

### title.basics.tsv.gz - Contains the following information for titles:
- tconst (string) - alphanumeric unique identifier of the title
- titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
- originalTitle (string) - original title, in the original language
- isAdult (boolean) - 0: non-adult title; 1: adult title
- startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
- endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
- runtimeMinutes – primary runtime of the title, in minutes
- genres (string array) – includes up to three genres associated with the title

In [52]:
data_file = "./title.basics.tsv.gz"
raw_data = sc.textFile(data_file) 
header = raw_data.first() #remove header
raw_data = raw_data.filter(lambda x: header != x)

In [60]:
tsv_data = raw_data.map(lambda l: l.split("\t"))

row_data = tsv_data.map(lambda p: Row(
    tconst=p[0],
    titleType=p[1],
    primaryTitle=p[2],
    originalTitle=p[3],
    isAdult=bool(p[4]),
    startYear=(p[5]),
    endYear=(p[6]),
    runtimeMinutes=(p[7]),
    genres=(p[8])
    )
)

In [61]:
basics_df = sqlContext.createDataFrame(row_data)
basics_df.registerTempTable("basics")

# Select tcp network interactions with more than 1 second duration and no transfer from destination
tcp_basics = sqlContext.sql("""
    SELECT * FROM basics limit 10
""")
tcp_basics.show()

+-------+--------------------+-------+--------------------+--------------------+--------------+---------+---------+---------+
|endYear|              genres|isAdult|       originalTitle|        primaryTitle|runtimeMinutes|startYear|   tconst|titleType|
+-------+--------------------+-------+--------------------+--------------------+--------------+---------+---------+---------+
|     \N|   Documentary,Short|   true|          Carmencita|          Carmencita|             1|     1894|tt0000001|    short|
|     \N|     Animation,Short|   true|Le clown et ses c...|Le clown et ses c...|             5|     1892|tt0000002|    short|
|     \N|Animation,Comedy,...|   true|      Pauvre Pierrot|      Pauvre Pierrot|             4|     1892|tt0000003|    short|
|     \N|     Animation,Short|   true|         Un bon bock|         Un bon bock|            12|     1892|tt0000004|    short|
|     \N|        Comedy,Short|   true|    Blacksmith Scene|    Blacksmith Scene|             1|     1893|tt0000005|   

In [62]:
# Select tcp network interactions with more than 1 second duration and no transfer from destination
tcp_basics = sqlContext.sql("""
    SELECT runtimeMinutes, count(*) FROM basics group by runtimeMinutes 
""")
tcp_basics.show()

+--------------+--------+
|runtimeMinutes|count(1)|
+--------------+--------+
|           467|       3|
|           296|      18|
|           691|       2|
|           675|       1|
|          2088|       1|
|           125|    2335|
|           451|       4|
|           800|       3|
|           853|       1|
|          1669|       1|
|           870|       1|
|             7|   36942|
|            51|    6724|
|           124|    1467|
|           447|       2|
|           475|       5|
|           307|      11|
|          1500|       4|
|          1773|       1|
|           613|       1|
+--------------+--------+
only showing top 20 rows



In [47]:
raw_data.count()

6782090