In [3]:
import os
import findspark

# Export the JDK and Spark installation directories before anything from
# pyspark is imported.
for env_var, location in (
    ("JAVA_HOME", "/usr/lib/jvm/java-8-openjdk-amd64"),
    ("SPARK_HOME", "/spark/spark-3.1.1-bin-hadoop2.7"),
):
    os.environ[env_var] = location

# Initialise findspark with the same Spark directory so that the pyspark
# imports in the following cells resolve.
findspark.init("/spark/spark-3.1.1-bin-hadoop2.7")

In [4]:
from pyspark import SparkContext
from pyspark.conf import SparkConf

In [6]:
import psutil

In [12]:
# Memory (GiB) deliberately left to the OS and other processes instead of
# being handed to Spark.
RESERVED_MEM_GB = 10

total_mem_gb = psutil.virtual_memory().total / 1024 ** 3
# Clamp to 1 GiB so a host with <= RESERVED_MEM_GB of RAM does not end up with
# a zero or negative memory budget.
available_mem_gb = max(total_mem_gb - RESERVED_MEM_GB, 1)
# psutil.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single core in that case.
available_cpu_count = psutil.cpu_count() or 1

print(available_mem_gb)
print(available_cpu_count)

241.71878814697266
32


In [13]:
# Whole gigabytes of memory per CPU core. The value is later formatted as
# "<n>g", so it is truncated to an integer and kept at 1 or above: on a
# machine with less memory than cores plain truncation would give "0g".
mem_per_core = max(1, int(available_mem_gb / available_cpu_count))
print(mem_per_core)

7


In [14]:
# The context below is created with a "local[N]" master. In local mode there
# are no separate executor processes: every task runs inside the driver JVM,
# so the driver heap is the one that has to be sized for the workload.
# Leaving it at 4g would make all cores share a 4g heap regardless of
# "spark.executor.memory".
conf = (
    SparkConf()
    .set("spark.ui.port", "5050")  # The web UI is served on port 5050
    .set("spark.driver.memory", "{}g".format(int(available_mem_gb)))
    # Has no effect with a local master; kept so the per-core sizing still
    # applies if the master is switched to a real cluster.
    .set("spark.executor.memory", "{}g".format(mem_per_core))
)

In [16]:
# One worker per CPU core: the master string is "local[<cpu count>]".
master_url = "local[{}]".format(available_cpu_count)
sc = SparkContext(master=master_url, appName="single-machine-spark", conf=conf)

In [17]:
from pyspark.sql import SparkSession

# Wrap the already running SparkContext in a SparkSession for the DataFrame / SQL API.
ss = SparkSession(sparkContext=sc)

In [18]:
! wget -O- https://datasets.imdbws.com/title.basics.tsv.gz | gunzip > title.basics.tsv

--2021-03-03 16:37:28--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 13.226.220.105, 13.226.220.70, 13.226.220.67, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|13.226.220.105|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 136483291 (130M) [binary/octet-stream]
Saving to: ‘STDOUT’


2021-03-03 16:37:32 (32.3 MB/s) - written to stdout [136483291/136483291]



In [19]:
! head -n 3 title.basics.tsv

tconst	titleType	primaryTitle	originalTitle	isAdult	startYear	endYear	runtimeMinutes	genres
tt0000001	short	Carmencita	Carmencita	0	1894	\N	1	Documentary,Short
tt0000002	short	Le clown et ses chiens	Le clown et ses chiens	0	1892	\N	5	Animation,Short


In [24]:
# Open the TSV through the low-level RDD API as plain text.
rdd = sc.textFile(name='title.basics.tsv')

In [25]:
# Number of elements in the text RDD; the file's header line is part of the
# RDD, so it is included in this figure.
rdd.count()

7668282

In [21]:
# Parse the TSV into a DataFrame, taking column names from the header row.
#   * nullValue: the file marks missing fields with the literal \N (see the
#     endYear column in the sample rows), so map it to a real NULL instead of
#     keeping it as a string value, which would also keep schema inference
#     from recognising otherwise numeric columns.
#   * quote="": NOTE(review) - assumes the dump is raw tab-separated text with
#     no quoting convention, so a double quote inside a title is treated as an
#     ordinary character rather than the start of a quoted field.
title_df = ss.read.csv(
    'title.basics.tsv',
    sep="\t",
    header=True,
    inferSchema=True,
    nullValue="\\N",
    quote="",
)

In [22]:
# Row count of the parsed DataFrame (the header row is consumed as column
# names and therefore not counted).
title_df.count()

7668281

In [26]:
# Expose the DataFrame to Spark SQL under the name "title".
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and is safe to re-run in the same session.
title_df.createOrReplaceTempView('title')

In [29]:
# The ten start years with the most titles of type 'short', largest count first.
top_short_years_query = """
    SELECT startYear, count(startYear) as amount
    FROM title
    WHERE titleType = 'short'
    GROUP BY startYear
    ORDER BY amount DESC
    LIMIT 10
"""
ss.sql(top_short_years_query).show()

+---------+------+
|startYear|amount|
+---------+------+
|     2017| 49868|
|     2016| 49003|
|     2018| 47899|
|     2014| 46567|
|     2015| 45545|
|     2013| 42178|
|     2019| 42123|
|     2012| 37674|
|     2020| 34674|
|     2011| 32563|
+---------+------+



In [30]:
# Shut down the SparkContext created earlier in the notebook now that the
# analysis is finished.
sc.stop()