In [None]:
pip install pyspark pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 63 kB/s 
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 78.4 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=4229fdad93d92c58ded0474e75c587b0481950af11f3cf31b5a21a6922627685
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
from pyspark.sql import SparkSession


# Create (or reuse) the Spark session for this lesson, using the 'local[4]'
# master. Executor memory is 3000 MB divided by 1.1, truncated to an integer.
# NOTE(review): with a local master the spark.executor.* options presumably
# have no effect — confirm before relying on them.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('Lesson_2')
    .config('spark.ui.port', '4050')
    .config('spark.executor.instances', 4)
    .config('spark.executor.memory', f'{int(3000/1.1)}mb')
    .config('spark.executor.cores', 1)
    .getOrCreate()
)

# Shortcut to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Sample records, one tuple per row: (Product, Amount, Country).
columns = ["Product", "Amount", "Country"]
data = [
    ("Banana", 1000, "USA"),
    ("Carrots", 1500, "USA"),
    ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"),
    ("Orange", 2000, "USA"),
    ("Banana", 400, "China"),
    ("Carrots", 1200, "China"),
    ("Beans", 1500, "China"),
    ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"),
    ("Carrots", 2000, "Canada"),
    ("Beans", 2000, "Mexico"),
]

# Only column names are supplied; the column types are left for Spark to infer.
df = spark.createDataFrame(data=data, schema=columns)

# Show the resulting schema, then every row without truncating cell values.
df.printSchema()
df.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [None]:
# Expose the DataFrame to Spark SQL under the name 'df'.
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable and is safe to re-run (it replaces an existing view).
df.createOrReplaceTempView('df')



In [None]:
# Read every row back through Spark SQL from the 'df' temp table/view.
(
    spark
    .sql('select * from df')
    .show()
)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
| Banana|  1000|    USA|
|Carrots|  1500|    USA|
|  Beans|  1600|    USA|
| Orange|  2000|    USA|
| Orange|  2000|    USA|
| Banana|   400|  China|
|Carrots|  1200|  China|
|  Beans|  1500|  China|
| Orange|  4000|  China|
| Banana|  2000| Canada|
|Carrots|  2000| Canada|
|  Beans|  2000| Mexico|
+-------+------+-------+



In [None]:
# Spark can speak SQL.
# Register the DataFrame as a temp view (createOrReplaceTempView replaces the
# deprecated registerTempTable), then number the rows inside each Country
# partition in ascending Amount order.
df.createOrReplaceTempView('df')

spark.sql('''
select *, 
row_number() over( partition by Country order by Amount ) as rn from df
''').show()



+-------+------+-------+---+
|Product|Amount|Country| rn|
+-------+------+-------+---+
| Banana|  2000| Canada|  1|
|Carrots|  2000| Canada|  2|
| Banana|   400|  China|  1|
|Carrots|  1200|  China|  2|
|  Beans|  1500|  China|  3|
| Orange|  4000|  China|  4|
|  Beans|  2000| Mexico|  1|
| Banana|  1000|    USA|  1|
|Carrots|  1500|    USA|  2|
|  Beans|  1600|    USA|  3|
| Orange|  2000|    USA|  4|
| Orange|  2000|    USA|  5|
+-------+------+-------+---+



# Оконные функции

In [None]:
from pyspark.sql import Window


# Window specification: one partition per Country, rows ordered by Amount.
windSpec = Window.partitionBy('Country').orderBy('Amount')

In [None]:
# Display the repr of the window specification object.
windSpec

<pyspark.sql.window.WindowSpec at 0x7f3ab67360d0>

## Ранжирующие функции

In [None]:
# Add 'rn': the 1-based position of each row within its window partition.
(
    df
    .withColumn('rn', F.row_number().over(windSpec))
    .show()
)

+-------+------+-------+---+
|Product|Amount|Country| rn|
+-------+------+-------+---+
| Banana|  2000| Canada|  1|
|Carrots|  2000| Canada|  2|
| Banana|   400|  China|  1|
|Carrots|  1200|  China|  2|
|  Beans|  1500|  China|  3|
| Orange|  4000|  China|  4|
|  Beans|  2000| Mexico|  1|
| Banana|  1000|    USA|  1|
|Carrots|  1500|    USA|  2|
|  Beans|  1600|    USA|  3|
| Orange|  2000|    USA|  4|
| Orange|  2000|    USA|  5|
+-------+------+-------+---+



## Аналитические функции

In [None]:
# Re-point the window at Product partitions, ordered by Amount.
windSpec = Window.partitionBy('Product').orderBy('Amount')

# Add 'ntile': ntile(4) evaluated over each Product partition.
(
    df
    .withColumn('ntile', F.ntile(4).over(windSpec))
    .show()
)

+-------+------+-------+-----+
|Product|Amount|Country|ntile|
+-------+------+-------+-----+
| Banana|   400|  China|    1|
| Banana|  1000|    USA|    2|
| Banana|  2000| Canada|    3|
|  Beans|  1500|  China|    1|
|  Beans|  1600|    USA|    2|
|  Beans|  2000| Mexico|    3|
|Carrots|  1200|  China|    1|
|Carrots|  1500|    USA|    2|
|Carrots|  2000| Canada|    3|
| Orange|  2000|    USA|    1|
| Orange|  2000|    USA|    2|
| Orange|  4000|  China|    3|
+-------+------+-------+-----+



## Функции сдвига

In [None]:
# Frame: from the start of the Country partition up to the row just before the
# current one (in Amount order). last() over that frame therefore returns the
# previous row's Product, and null for the first row of each partition.
windSpec = (
    Window
    .partitionBy('Country')
    .orderBy('Amount')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow - 1)
)

(
    df
    .withColumn('lag', F.last('Product').over(windSpec))
    .show()
)

+-------+------+-------+-------+
|Product|Amount|Country|    lag|
+-------+------+-------+-------+
| Banana|  2000| Canada|   null|
|Carrots|  2000| Canada| Banana|
| Banana|   400|  China|   null|
|Carrots|  1200|  China| Banana|
|  Beans|  1500|  China|Carrots|
| Orange|  4000|  China|  Beans|
|  Beans|  2000| Mexico|   null|
| Banana|  1000|    USA|   null|
|Carrots|  1500|    USA| Banana|
|  Beans|  1600|    USA|Carrots|
| Orange|  2000|    USA|  Beans|
| Orange|  2000|    USA| Orange|
+-------+------+-------+-------+



## Агрегатные функции

In [None]:
# Frame: the two rows immediately before the current one (current row
# excluded), within each Country partition ordered by Product.
windSpec = (
    Window
    .partitionBy('Country')
    .orderBy('Product')
    .rowsBetween(Window.currentRow - 2, Window.currentRow - 1)
)

# Add 'avg': mean Amount over that frame; null when the frame is empty.
(
    df
    .withColumn('avg', F.avg('Amount').over(windSpec))
    .show()
)

+-------+------+-------+------+
|Product|Amount|Country|   avg|
+-------+------+-------+------+
| Banana|  2000| Canada|  null|
|Carrots|  2000| Canada|2000.0|
| Banana|   400|  China|  null|
|  Beans|  1500|  China| 400.0|
|Carrots|  1200|  China| 950.0|
| Orange|  4000|  China|1350.0|
|  Beans|  2000| Mexico|  null|
| Banana|  1000|    USA|  null|
|  Beans|  1600|    USA|1000.0|
|Carrots|  1500|    USA|1300.0|
| Orange|  2000|    USA|1550.0|
| Orange|  2000|    USA|1750.0|
+-------+------+-------+------+



# UDF

In [None]:
def f(value):
    """Return `value` converted to upper case.

    Defined in this cell so the call below works on a fresh kernel; previously
    the cell relied on `f` left over from out-of-order execution.
    """
    return value.upper()


# Try the plain-Python function on a single string.
f('a')

'A'

In [None]:
@F.udf(returnType=StringType())
def f(value):
    """Spark UDF (decorator form): upper-case a string value.

    Args:
        value: the cell value, or None for a null cell.

    Returns:
        The upper-cased string, or None when the input is None.
    """
    # Null cells arrive as None; calling .upper() on None would raise and fail
    # the job, so pass the null through instead.
    if value is None:
        return None
    return value.upper()

In [None]:
def f(value):
    """Upper-case a string value; None (a null cell) is returned unchanged.

    Args:
        value: the cell value, or None for a null cell.

    Returns:
        The upper-cased string, or None when the input is None.
    """
    # Calling .upper() on None would raise inside the UDF and fail the job.
    if value is None:
        return None
    return value.upper()


# Function-call form of UDF registration: wrap the plain function and rebind
# the name `f` to the resulting Spark UDF.
f = F.udf(f, returnType=StringType())

In [None]:
# Add 'value': the UDF `f` applied to the Country column.
(
    df
    .withColumn('value', f(df.Country))
    .show()
)

+-------+------+-------+------+
|Product|Amount|Country| value|
+-------+------+-------+------+
| Banana|  1000|    USA|   USA|
|Carrots|  1500|    USA|   USA|
|  Beans|  1600|    USA|   USA|
| Orange|  2000|    USA|   USA|
| Orange|  2000|    USA|   USA|
| Banana|   400|  China| CHINA|
|Carrots|  1200|  China| CHINA|
|  Beans|  1500|  China| CHINA|
| Orange|  4000|  China| CHINA|
| Banana|  2000| Canada|CANADA|
|Carrots|  2000| Canada|CANADA|
|  Beans|  2000| Mexico|MEXICO|
+-------+------+-------+------+



In [None]:
# Add 'value': a two-element array column holding Amount twice.
(
    df
    .withColumn('value', F.array(df.Amount, df.Amount))
    .show()
)

+-------+------+-------+------------+
|Product|Amount|Country|       value|
+-------+------+-------+------------+
| Banana|  1000|    USA|[1000, 1000]|
|Carrots|  1500|    USA|[1500, 1500]|
|  Beans|  1600|    USA|[1600, 1600]|
| Orange|  2000|    USA|[2000, 2000]|
| Orange|  2000|    USA|[2000, 2000]|
| Banana|   400|  China|  [400, 400]|
|Carrots|  1200|  China|[1200, 1200]|
|  Beans|  1500|  China|[1500, 1500]|
| Orange|  4000|  China|[4000, 4000]|
| Banana|  2000| Canada|[2000, 2000]|
|Carrots|  2000| Canada|[2000, 2000]|
|  Beans|  2000| Mexico|[2000, 2000]|
+-------+------+-------+------------+



In [None]:
def f(value):
    """Append 1.0 to a list of numbers, returning every element as a float.

    The UDF below declares ArrayType(FloatType()). Python ints in the returned
    list do not match that type and come back as null, so each element is
    converted to float explicitly.

    Args:
        value: list of numbers (elements may be None), or None for a null cell.

    Returns:
        A new list of floats with 1.0 appended, or None when `value` is None.
    """
    if value is None:
        return None
    # Keep null elements as null; convert everything else to float.
    as_floats = [None if item is None else float(item) for item in value]
    return as_floats + [1.]


udf_f = F.udf(f, returnType=ArrayType(FloatType()))

df.withColumn('value', udf_f(F.array(df.Amount, df.Amount))).show()

+-------+------+-------+-----------------+
|Product|Amount|Country|            value|
+-------+------+-------+-----------------+
| Banana|  1000|    USA|[null, null, 1.0]|
|Carrots|  1500|    USA|[null, null, 1.0]|
|  Beans|  1600|    USA|[null, null, 1.0]|
| Orange|  2000|    USA|[null, null, 1.0]|
| Orange|  2000|    USA|[null, null, 1.0]|
| Banana|   400|  China|[null, null, 1.0]|
|Carrots|  1200|  China|[null, null, 1.0]|
|  Beans|  1500|  China|[null, null, 1.0]|
| Orange|  4000|  China|[null, null, 1.0]|
| Banana|  2000| Canada|[null, null, 1.0]|
|Carrots|  2000| Canada|[null, null, 1.0]|
|  Beans|  2000| Mexico|[null, null, 1.0]|
+-------+------+-------+-----------------+



In [None]:
import pandas as pd

In [None]:
# PyArrow: turn on Arrow-based conversion between Spark and pandas.
# The option name must be the fully qualified
# 'spark.sql.execution.arrow.pyspark.enabled'; the previous key
# ('sql.execution.arrow.pyspark.enable') is not an option Spark reads, so the
# setting had no effect.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

In [None]:
# Time a pandas -> Spark -> pandas round trip of one million integers.
# The pandas DataFrame construction is part of the timed expression.
%timeit spark.createDataFrame(pd.DataFrame({'a': range(1000000)})).toPandas()

1 loop, best of 5: 26.2 s per loop


In [None]:
# PyArrow: turn off Arrow-based conversion between Spark and pandas.
# The option name must be the fully qualified
# 'spark.sql.execution.arrow.pyspark.enabled'; the previous key
# ('sql.execution.arrow.pyspark.enable') is not an option Spark reads, so the
# setting had no effect.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')

In [None]:
# Repeat the same one-million-row pandas -> Spark -> pandas timing so it can be
# compared with the earlier measurement.
%timeit spark.createDataFrame(pd.DataFrame({'a': range(1000000)})).toPandas()

1 loop, best of 5: 26 s per loop


In [None]:
# Reference notes, kept as comments: as bare expressions these names are
# undefined in this notebook and the cell raised NameError when executed.
#   - sql.functions
#   - pandas_udf
#   - rdd.mapInPartitions
#     NOTE(review): presumably RDD.mapPartitions or DataFrame.mapInPandas — confirm
#   - udf